In [15]:
# All imports here
import pandas as pd
import numpy as np
from datetime import time, datetime
from tqdm import tqdm
import os

In [16]:
# Locations of the raw data set files and of the CSV files derived from them.
# Everything lives in one directory, so the prefix is spelled out only once.
dataDir = "/content/drive/MyDrive/Netflix Movie recommendation/data"
dataSetPaths = [f"{dataDir}/combined_data_{fileNo}.txt" for fileNo in (1, 2, 3, 4)]
csvPath = f"{dataDir}/ratings.csv"
trainDataPath = f"{dataDir}/train.csv"
testDataPath = f"{dataDir}/test.csv"

In [17]:
# Helper method to convert Input Data to CSV format ([movie, user, rating, date])
def convertInputToCSV(filePaths, csvPath):
  """Flatten the raw rating files into a single headerless CSV file.

  Every input file is read as a sequence of sections: a header line that
  ends with ":" (the movie id followed by a colon) and, after it, one
  "user,rating,date" line per rating. Each rating line is written to the
  output as "movie,user,rating,date". Blank lines are skipped.

  The rows are first written to "<csvPath>.tmp" and that file is renamed to
  csvPath only after every input has been processed, so a failed or
  interrupted run never leaves a truncated file at csvPath.

  Args:
    filePaths: iterable of input file paths, processed in the given order.
    csvPath: path of the CSV file to create (replaced if it already exists).

  Raises:
    ValueError: if a rating line does not have exactly three
      comma-separated fields.
    TypeError: if a rating line appears before any movie header line.
  """

  start = datetime.now()
  curMovie = None

  # Code that skips the conversion when csvPath exists must never find a
  # half-written file there, hence the temporary sibling file.
  tmpPath = f"{csvPath}.tmp"

  try:
    # "with" guarantees the output handle is closed even when parsing fails
    with open(tmpPath, "w") as data:
      # Read input files one by one, and write to CSV in required format
      for filePath in filePaths:
        print("Processing File:", filePath)
        with open(filePath, "r") as fp:
          # Stream the file line by line instead of loading it all in memory
          for line in fp:
            line = line.strip()
            if not line:
              continue
            if line.endswith(":"):
              # Header line: remember the movie id for the rows that follow
              curMovie = line[:-1]
            else:
              userId, rating, date = line.split(",")
              data.write(",".join([curMovie, userId, rating, date]))
              data.write("\n")
    # Publish the finished file under its final name in a single step
    os.replace(tmpPath, csvPath)
  finally:
    # After a successful rename the temporary file is gone; it only still
    # exists here when something above failed.
    if os.path.exists(tmpPath):
      os.remove(tmpPath)

  print("Total Time taken:", datetime.now()-start)

In [18]:
# Build the ratings CSV only once; later runs reuse the file already on disk
csvAlreadyBuilt = os.path.exists(csvPath)
if not csvAlreadyBuilt:
  convertInputToCSV(filePaths=dataSetPaths, csvPath=csvPath)

In [19]:
# Load the ratings CSV into a DataFrame. Column names are supplied here and
# the three id/score columns are parsed as integers.
ratingColumns = ["movie", "user", "rating", "date"]
integerColumns = {columnName: int for columnName in ratingColumns[:3]}
ratings = pd.read_csv(csvPath, sep=",", names=ratingColumns, dtype=integerColumns)

# Turn the "date" strings (year-month-day) into datetime values
ratings["date"] = pd.to_datetime(ratings["date"], format="%Y-%m-%d")

KeyboardInterrupt: ignored

In [8]:
# Order the ratings by date, oldest first, to facilitate a time based split
ratings = ratings.sort_values(by="date", ascending=True)

In [9]:
# Experimental feature: the name of the weekday on which each rating was
# given, derived from the "date" column and stored as a new column.
ratings["day_of_week"] = ratings["date"].dt.day_name()

In [10]:
# Report the size of the data set: rating rows, distinct users, distinct movies
ratingCount = len(ratings)
userCount = ratings["user"].nunique()
movieCount = ratings["movie"].nunique()

print("Number of ratings :", ratingCount)
print("Number of Users   :", userCount)
print("Number of movies  :", movieCount)

Number of ratings : 100480507
Number of Users   : 480189
Number of movies  : 17770


In [12]:
# Split the rows by position: the first 80% become the train set and the
# remaining 20% the test set.
trainFraction = 0.8
splitIndex = int(trainFraction * len(ratings))

# Each output CSV is written only when no file exists at its path yet
splitTargets = (
  (trainDataPath, slice(None, splitIndex)),
  (testDataPath, slice(splitIndex, None)),
)
for outPath, rowRange in splitTargets:
  if not os.path.exists(outPath):
    ratings.iloc[rowRange, :].to_csv(outPath, index=False)