In [1]:
import pandas as pd
import numpy as np
from datetime import time, datetime
from tqdm import tqdm
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scipy import sparse
from scipy.sparse import csr_matrix
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Path 
trainSparseMatrixPath = "/content/drive/MyDrive/Netflix Movie recommendation/data/train_sparse.npz"
testSparseMatrixPath = "/content/drive/MyDrive/Netflix Movie recommendation/data/test_sparse.npz"
sampledTrainSparseMatrixPath =  "/content/drive/MyDrive/Netflix Movie recommendation/data/sample_train_sparse.npz"
sampledTestSparseMatrixPath =  "/content/drive/MyDrive/Netflix Movie recommendation/data/sample_test_sparse.npz"
trainDataFramePath = "/content/drive/MyDrive/Netflix Movie recommendation/data/updated_train_data_frame.csv"
testDataFramePath = "/content/drive/MyDrive/Netflix Movie recommendation/data/test_data_frame.csv"

In [5]:
#Load the sampled train sparse matrix
sampledTrainSparseMatrix = sparse.load_npz(sampledTrainSparseMatrixPath)

In [6]:
# Get Users movies and ratings from the Train Sparse
trainUsers, trainMovies, trainRatings = sparse.find(sampledTrainSparseMatrix)

In [7]:
# Helper method
def getAverageRatings(sparseMatrix, isUser):
    
    # Average ratings of user/axes
    ax = 1 if isUser else 0 # 1 - User axes,0 - Movie axes

    # ".A1" is for converting Column_Matrix to 1-D numpy array 
    sumOfRatings = sparseMatrix.sum(axis=ax).A1
    # Boolean matrix of ratings ( whether a user rated that movie or not)
    isRated = sparseMatrix!=0
    # no of ratings that each user OR movie..
    numRatings = isRated.sum(axis=ax).A1
    
    # max_user and max_movie ids in sparse matrix 
    maxUser, maxMovie = sparseMatrix.shape
    # creae a dictonary of users and their average ratigns..
    averageRatings = { i : sumOfRatings[i]/numRatings[i]
                                 for i in range(maxUser if isUser else maxMovie) 
                                    if numRatings[i] !=0}

    # return that dictionary of average ratings
    return averageRatings

In [8]:
# Initialize an empty dictionary
sampleTrainAverages = {}

In [9]:
# Feature 1 - Global average movie rating
sampleTrainAverages['global_rating'] = trainRatings.sum()/len(trainRatings)

In [10]:
# Feature 2 - Average ratings per user
sampleTrainAverages['user_rating'] = getAverageRatings(sampledTrainSparseMatrix, isUser=True)

In [11]:
# Feature 3 - Average ratings per movie
sampleTrainAverages['movie_rating'] = getAverageRatings(sampledTrainSparseMatrix, isUser=False)

In [None]:
def computeTrainDataFrame(trainUsers, trainMovies, trainRatings, trainDataFramePath, sampledTrainSparseMatrix):
  start = datetime.now()
  if not os.path.isfile(trainDataFramePath):
    print('preparing {} tuples for the dataset..\n'.format(len(trainRatings)))
    with open(trainDataFramePath, mode='w') as reg_data_file:
      count = 0
      for (user, movie, rating)  in zip(trainUsers, trainMovies, trainRatings):
        st = datetime.now()  

        #--------------------- Ratings of "movie" by similar users of "user" ---------------------
        # compute the similar Users of the "user"        
        user_sim = cosine_similarity(sampledTrainSparseMatrix[user], sampledTrainSparseMatrix).ravel()
        top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
        # get the ratings of most similar users for this movie
        top_ratings = sampledTrainSparseMatrix[top_sim_users, movie].toarray().ravel()
        # we will make it's length "5" by adding movie averages to .
        top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
        top_sim_users_ratings.extend([sampleTrainAverages['movie_rating'][movie]]*(5 - len(top_sim_users_ratings)))


        #--------------------- Ratings by "user"  to similar movies of "movie" ---------------------
        # compute the similar movies of the "movie"        
        movie_sim = cosine_similarity(sampledTrainSparseMatrix[:,movie].T, sampledTrainSparseMatrix.T).ravel()
        top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
        # get the ratings of most similar movie rated by this user..
        top_ratings = sampledTrainSparseMatrix[user, top_sim_movies].toarray().ravel()
        # we will make it's length "5" by adding user averages to.
        top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
        top_sim_movies_ratings.extend([sampleTrainAverages['user_rating'][user]]*(5-len(top_sim_movies_ratings))) 
        #     print(top_sim_movies_ratings, end=" : -- ")

        
        #-----------------prepare the row to be stores in a file-----------------#
        row = list()
        row.append(user)
        row.append(movie)
        # Now add the other features to this data...
        row.append(sampleTrainAverages['global_rating']) # first feature
        # next 5 features are similar_users "movie" ratings
        row.extend(top_sim_users_ratings)
        # next 5 features are "user" ratings for similar_movies
        row.extend(top_sim_movies_ratings)
        # Avg_user rating
        row.append(sampleTrainAverages['user_rating'][user])
        # Avg_movie rating
        row.append(sampleTrainAverages['movie_rating'][movie])
        # finally, The actual Rating of this user-movie pair...
        row.append(rating)
        count = count + 1
        # add rows to the file opened..
        reg_data_file.write(','.join(map(str, row)))
        reg_data_file.write('\n')        
        if (count)%10000 == 0:
          # print(','.join(map(str, row)))
          print("Done for {} rows----- {}".format(count, datetime.now() - start))
  print(datetime.now() - start)

In [None]:
print('\n No of ratings in Our Sampled train matrix is : {}\n'.format(sampledTrainSparseMatrix.count_nonzero()))


 No of ratings in Our Sampled train matrix is : 1133751



In [None]:
computeTrainDataFrame(trainUsers, trainMovies, trainRatings, trainDataFramePath, sampledTrainSparseMatrix)

preparing 1133751 tuples for the dataset..

Done for 10000 rows----- 1:11:48.346610
Done for 20000 rows----- 2:24:05.319792
Done for 30000 rows----- 3:37:27.358510
Done for 40000 rows----- 4:48:26.700101
Done for 50000 rows----- 5:48:37.987074
Done for 60000 rows----- 6:47:43.417676
Done for 70000 rows----- 7:47:25.396886
Done for 80000 rows----- 8:47:44.330563
Done for 90000 rows----- 9:48:02.609811
Done for 100000 rows----- 10:48:20.896428
Done for 110000 rows----- 11:52:00.038452


In [None]:
df = pd.read_csv(trainDataFramePath)
df.shape

(110972, 16)

## Prepare the Test Data

In [20]:
def computeTestDataFrame(testUsers, testMovies, testRatings, testDataFramePath, sampledTrainSparseMatrix):
  start = datetime.now()

  if os.path.isfile(testDataFramePath):
      print("It is already created...")
  else:

      print('preparing {} tuples for the dataset..\n'.format(len(testRatings)))
      with open(testDataFramePath, mode='w') as reg_data_file:
          count = 0 
          for (user, movie, rating)  in zip(testUsers, testMovies, testRatings):
              st = datetime.now()

          #--------------------- Ratings of "movie" by similar users of "user" ---------------------
              #print(user, movie)
              try:
                  # compute the similar Users of the "user"        
                  user_sim = cosine_similarity(sampledTrainSparseMatrix[user], sampledTrainSparseMatrix).ravel()
                  top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
                  # get the ratings of most similar users for this movie
                  top_ratings = sampledTrainSparseMatrix[top_sim_users, movie].toarray().ravel()
                  # we will make it's length "5" by adding movie averages to .
                  top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
                  top_sim_users_ratings.extend([sampleTrainAverages['movie_rating'][movie]]*(5 - len(top_sim_users_ratings)))
                  # print(top_sim_users_ratings, end="--")

              except (IndexError, KeyError):
                  # It is a new User or new Movie or there are no ratings for given user for top similar movies...
                  ########## Cold STart Problem ##########
                  top_sim_users_ratings.extend([sampleTrainAverages['global_rating']]*(5 - len(top_sim_users_ratings)))
                  #print(top_sim_users_ratings)
              except:
                  print(user, movie)
                  # we just want KeyErrors to be resolved. Not every Exception...
                  raise



              #--------------------- Ratings by "user"  to similar movies of "movie" ---------------------
              try:
                  # compute the similar movies of the "movie"        
                  movie_sim = cosine_similarity(sampledTrainSparseMatrix[:,movie].T, sampledTrainSparseMatrix.T).ravel()
                  top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
                  # get the ratings of most similar movie rated by this user..
                  top_ratings = sampledTrainSparseMatrix[user, top_sim_movies].toarray().ravel()
                  # we will make it's length "5" by adding user averages to.
                  top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
                  top_sim_movies_ratings.extend([sampleTrainAverages['user_rating'][user]]*(5-len(top_sim_movies_ratings))) 
                  #print(top_sim_movies_ratings)
              except (IndexError, KeyError):
                  #print(top_sim_movies_ratings, end=" : -- ")
                  top_sim_movies_ratings.extend([sampleTrainAverages['global_rating']]*(5-len(top_sim_movies_ratings)))
                  #print(top_sim_movies_ratings)
              except :
                  raise

              #-----------------prepare the row to be stores in a file-----------------#
              row = list()
              # add usser and movie name first
              row.append(user)
              row.append(movie)
              row.append(sampleTrainAverages['global_rating']) # first feature
              #print(row)
              # next 5 features are similar_users "movie" ratings
              row.extend(top_sim_users_ratings)
              #print(row)
              # next 5 features are "user" ratings for similar_movies
              row.extend(top_sim_movies_ratings)
              #print(row)
              # Avg_user rating
              try:
                  row.append(sampleTrainAverages['user_rating'][user])
              except KeyError:
                  row.append(sampleTrainAverages['global_rating'])
              except:
                  raise
              #print(row)
              # Avg_movie rating
              try:
                  row.append(sampleTrainAverages['movie_rating'][movie])
              except KeyError:
                  row.append(sampleTrainAverages['global_rating'])
              except:
                  raise
              #print(row)
              # finalley, The actual Rating of this user-movie pair...
              row.append(rating)
              #print(row)
              count = count + 1

              # add rows to the file opened..
              reg_data_file.write(','.join(map(str, row)))
              #print(','.join(map(str, row)))
              reg_data_file.write('\n')        
              if (count)%1000 == 0:
                  #print(','.join(map(str, row)))
                  print("Done for {} rows----- {}".format(count, datetime.now() - start))
      print("",datetime.now() - start)  

In [15]:
#Load the sampled test sparse matrix
sampledTestSparseMatrix = sparse.load_npz(sampledTestSparseMatrixPath)

In [16]:
# Get Users movies and ratings from the Train Sparse
testUsers, testMovies, testRatings = sparse.find(sampledTestSparseMatrix)

In [21]:
# Compute the Test Data Frame
computeTestDataFrame(testUsers, testMovies, testRatings, testDataFramePath, sampledTrainSparseMatrix)

preparing 66502 tuples for the dataset..

Done for 1000 rows----- 0:05:56.701928
Done for 2000 rows----- 0:11:56.594824
Done for 3000 rows----- 0:18:00.936589
Done for 4000 rows----- 0:24:00.528093
Done for 5000 rows----- 0:29:55.018181
Done for 6000 rows----- 0:35:50.227778
Done for 7000 rows----- 0:41:58.714149
Done for 8000 rows----- 0:48:06.345141
Done for 9000 rows----- 0:54:08.078330
Done for 10000 rows----- 1:00:12.648032
Done for 11000 rows----- 1:06:15.160540
Done for 12000 rows----- 1:12:18.066039
Done for 13000 rows----- 1:18:17.572583
Done for 14000 rows----- 1:24:11.160135
Done for 15000 rows----- 1:29:42.439150
Done for 16000 rows----- 1:35:15.193505
Done for 17000 rows----- 1:40:56.240879
Done for 18000 rows----- 1:46:26.791845
Done for 19000 rows----- 1:51:53.154474
Done for 20000 rows----- 1:57:05.630409
Done for 21000 rows----- 2:02:20.635658
Done for 22000 rows----- 2:07:55.272525
Done for 23000 rows----- 2:13:37.719128
Done for 24000 rows----- 2:19:02.238099
Done fo