In [None]:
import pandas as pd
import numpy as np
from datetime import time, datetime
from tqdm import tqdm
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy import sparse
from scipy.sparse import csr_matrix
import random
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# File locations used by this notebook. Every path is hard-coded under the same
# "/content/drive/MyDrive/..." folder, so Google Drive presumably has to be
# mounted before any of them can be read or written — confirm for your runtime.
#   train/test sparse paths         : full rating matrices, loaded with sparse.load_npz
#   sampled train/test sparse paths : cache files written/read by getSampleData
#   trainDataFramePath              : CSV path; not used in the cells visible here
trainSparseMatrixPath = "/content/drive/MyDrive/Netflix Movie recommendation/data/train_sparse.npz"
testSparseMatrixPath = "/content/drive/MyDrive/Netflix Movie recommendation/data/test_sparse.npz"
sampledTrainSparseMatrixPath =  "/content/drive/MyDrive/Netflix Movie recommendation/data/sample_train_sparse.npz"
sampledTestSparseMatrixPath =  "/content/drive/MyDrive/Netflix Movie recommendation/data/sample_test_sparse.npz"
trainDataFramePath = "/content/drive/MyDrive/Netflix Movie recommendation/data/train_data_frame.csv"

In [None]:
# Load the full train and test sparse rating matrices from their .npz files.
# getSampleData (defined below) reads the rows of these matrices as users and
# the columns as movies.
trainSparse = sparse.load_npz(trainSparseMatrixPath)
testSparse = sparse.load_npz(testSparseMatrixPath)

In [None]:
def getSampleData(sparseMatrix, numUsers, numMovies, filePath):
  """Return a random user x movie sub-sample of a sparse rating matrix, cached on disk.

  If ``filePath`` already exists, the matrix stored there is loaded and
  returned as-is. The cache is keyed on the path only, so it is NOT checked
  against ``sparseMatrix``, ``numUsers`` or ``numMovies``; delete the file to
  force a re-sample with different arguments.

  Otherwise ``numUsers`` distinct row ids and ``numMovies`` distinct column ids
  are drawn (without replacement, fixed seed 45) from the rows/columns that
  hold at least one non-zero rating, and only the ratings whose row AND column
  were both drawn are kept. The result is written to ``filePath`` and returned.

  Parameters
  ----------
  sparseMatrix : scipy sparse matrix
      Ratings with users on the rows and movies on the columns.
  numUsers, numMovies : int
      How many distinct users / movies to sample.
  filePath : str
      Cache location. ``sparse.save_npz`` appends ".npz" when the name lacks
      it, so pass a path that already ends in ".npz" or the existence check
      will never see the saved file.

  Returns
  -------
  scipy.sparse.csr_matrix
      Sampled ratings. Original user/movie ids are kept as row/column indices;
      the shape is (largest sampled user id + 1, largest sampled movie id + 1).

  Raises
  ------
  ValueError
      Propagated from NumPy when more users/movies are requested than are
      available, or when a sample is empty (no maximum id to size the matrix).
  """
  # Cache hit: reuse the stored sample without touching the input matrix.
  if os.path.exists(filePath):
    return sparse.load_npz(filePath)

  # Coordinates and values of every non-zero rating.
  users, movies, ratings = sparse.find(sparseMatrix)

  # Only ids that actually carry a rating are candidates for sampling.
  uniqueUsers = np.unique(users)
  uniqueMovies = np.unique(movies)

  # A private RandomState seeded with 45 produces the same draws as
  # np.random.seed(45) followed by np.random.choice, but leaves the global
  # NumPy RNG untouched. Seeding the global RNG here would make downstream
  # randomness depend on whether the cache file happened to exist.
  rng = np.random.RandomState(45)
  sampledUsers = rng.choice(uniqueUsers, numUsers, replace=False)
  sampledMovies = rng.choice(uniqueMovies, numMovies, replace=False)

  # Keep a rating only when both its user and its movie were sampled.
  mask = np.isin(users, sampledUsers) & np.isin(movies, sampledMovies)

  # ndarray.max() instead of builtin max(): same value, no element-wise Python loop.
  shape = (int(sampledUsers.max()) + 1, int(sampledMovies.max()) + 1)
  sampledSparseMatrix = sparse.csr_matrix(
      (ratings[mask], (users[mask], movies[mask])), shape=shape)

  # Persist so later runs take the cache-hit branch above.
  sparse.save_npz(filePath, sampledSparseMatrix)
  return sampledSparseMatrix

In [None]:
# Sample 20000 users x 5000 movies from the train matrix (getSampleData reuses
# the cached file when it already exists) and report the elapsed wall-clock time.
start = datetime.now()
sampledTrainSparseMatrix = getSampleData(
    sparseMatrix=trainSparse,
    numUsers=20000,
    numMovies=5000,
    filePath=sampledTrainSparseMatrixPath,
)
print("Time Taken: ", datetime.now()-start)

Time Taken:  0:00:33.351740


In [None]:
# Sample 10000 users x 2000 movies from the test matrix (getSampleData reuses
# the cached file when it already exists) and report the elapsed wall-clock time.
start = datetime.now()
sampledTestSparseMatrix = getSampleData(
    sparseMatrix=testSparse,
    numUsers=10000,
    numMovies=2000,
    filePath=sampledTestSparseMatrixPath,
)
print("Time Taken: ", datetime.now()-start)

Time Taken:  0:00:10.267055


Feature Engineering

In [None]:
# Helper method
def getAverageRatings(sparseMatrix, isUser):
    """Return the mean non-zero rating for every user or every movie.

    Parameters
    ----------
    sparseMatrix : scipy sparse matrix or sparse array
        Ratings with users on the rows and movies on the columns; a zero
        entry means "not rated".
    isUser : bool
        True  -> average along each row (per user).
        False -> average along each column (per movie).

    Returns
    -------
    dict
        Maps row/column index (int) -> average of its non-zero ratings.
        Indices with no ratings at all are omitted, so the division below can
        never hit a zero count.
    """
    # Summing over axis 1 collapses the columns (one value per user);
    # summing over axis 0 collapses the rows (one value per movie).
    ax = 1 if isUser else 0

    # np.asarray(...).ravel() flattens the np.matrix returned by sparse
    # matrices and also accepts the plain ndarray returned by sparse arrays,
    # which have no ".A1" attribute.
    sumOfRatings = np.asarray(sparseMatrix.sum(axis=ax)).ravel()

    # "!= 0" gives a boolean sparse matrix marking rated cells; summing it
    # counts the ratings along the chosen axis.
    numRatings = np.asarray((sparseMatrix != 0).sum(axis=ax)).ravel()

    # Restrict to indices that have at least one rating, then divide once as a
    # vectorised operation instead of once per index in a Python loop.
    ratedIds = np.flatnonzero(numRatings)
    averages = sumOfRatings[ratedIds] / numRatings[ratedIds]

    # Keys are plain Python ints in ascending order, as range() produced before.
    return dict(zip(ratedIds.tolist(), averages))

In [None]:
# Unpack the sampled train matrix into three parallel arrays describing its
# non-zero entries: row indices (users), column indices (movies) and ratings.
trainUsers, trainMovies, trainRatings = sparse.find(sampledTrainSparseMatrix)

In [None]:
# Dictionary that collects the average-rating features of the sampled train
# set; the following cells add the keys 'global_rating', 'user_rating' and
# 'movie_rating'.
sampleTrainAverages = {}

In [None]:
# Feature 1 - Global average rating: sum of all non-zero ratings in the sampled
# train matrix divided by how many there are.
sampleTrainAverages['global_rating'] = trainRatings.sum()/len(trainRatings)

In [None]:
# Feature 2 - Average rating per user: dict mapping user (row) index to the
# mean of that user's ratings; users without any rating are left out.
sampleTrainAverages['user_rating'] = getAverageRatings(sampledTrainSparseMatrix, isUser=True)

In [None]:
# Feature 3 - Average rating per movie: dict mapping movie (column) index to
# the mean of that movie's ratings; movies without any rating are left out.
sampleTrainAverages['movie_rating'] = getAverageRatings(sampledTrainSparseMatrix, isUser=False)

In [None]:
# Display which feature groups have been stored in the dictionary so far.
sampleTrainAverages.keys()

dict_keys(['global_rating', 'user_rating', 'movie_rating'])