In [58]:
import os
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [59]:
column_names1 = ['user id','movie id','rating','timestamp']
dtype = {'userId':int, 'movieId': int, 'rating': float, 'timestamp': str}
dataset = pd.read_csv('movie_lens_dataset/ratings_small.csv', dtype=dtype)
dataset.columns = column_names1
dataset.head()
# len(dataset["movie id"].unique())


Unnamed: 0,user id,movie id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [60]:
column_names2 = ["movie id", "movie title"]
movie_dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')
movie_dataset = movie_dataset[["id", "title"]].rename(columns={"id": "movie id", "title": "movie title"})
movie_dataset.head()

  movie_dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')


Unnamed: 0,movie id,movie title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [61]:
len(movie_dataset.groupby(by=column_names2[1:])),len(movie_dataset)

(41367, 41368)

## Merging required datasets

In [62]:
merged_dataset = pd.merge(dataset, movie_dataset, how='inner', on='movie id')
merged_dataset.head()
# merged_dataset.info()


Unnamed: 0,user id,movie id,rating,timestamp,movie title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III


In [63]:
# Example of a multiple rating scenario by an user to a specific movie:
merged_dataset[(merged_dataset['movie title'] == 'Chasing Amy') & (merged_dataset['user id'] == 306)]

Unnamed: 0,user id,movie id,rating,timestamp,movie title
35519,306,2255,2.0,940347467,Chasing Amy


In [64]:
# Merge rows with the same movie title and user id by aggregating them to their mean
merged_dataset["rating"] = pd.to_numeric(merged_dataset["rating"], downcast="float")
refined_dataset = merged_dataset.groupby(by=['user id','movie title'], as_index=False).agg({"rating":"mean"})
refined_dataset.head()
# refined_dataset[refined_dataset["user id"] == 1]


Unnamed: 0,user id,movie title,rating
0,1,American Pie,4.0
1,1,Confidentially Yours,2.5
2,1,Greed,1.0
3,1,Jay and Silent Bob Strike Back,2.0
4,1,My Tutor,2.0


In [65]:
with open("./pickle/refined_dataset.pickle", "wb") as handle:
    pickle.dump(refined_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("refined_dataset saved!")

refined_dataset saved!


## Training KNN model to build collaborative recommender

In [66]:
# Reshaping model in such a way that each user has n-dimensional rating space where n is total number of movies
user_to_movie_df = refined_dataset.pivot(
    index='user id',
     columns='movie title',
      values='rating').fillna(0)

user_to_movie_df.head()

movie title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,Â¡Three Amigos!,Ã nos amours,Ãdipussi,Åaban OÄlu Åaban
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
with open("./pickle/user_to_movie.pickle", "wb") as handle:
    pickle.dump(user_to_movie_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("User_to_movie_df saved!")

User_to_movie_df saved!


In [68]:
# transform matrix to scipy sparse matrix
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
user_to_movie_sparse_df

<671x2658 sparse matrix of type '<class 'numpy.float32'>'
	with 42720 stored elements in Compressed Sparse Row format>

In [69]:
# Fitting KNN model to scipy sparse matrix

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

In [70]:
# Giving Input as User id, Number of similar Users to be considered, Number of top movie we want to recommend

def new_recommender_system(user_df, n_similar_users, n_movies): #, user_to_movie_df, knn_model):
  
  print("Movie seen by the User:")
  print(list(user_df["movie title"]))
  print("")
  user_id = -1

  # def get_similar_users(user, user_to_movie_df, knn_model, n = 5):
  def get_similar_users(n = 5):

    knn_input_array = np.array([4.5 if col in movies else 0 for col in user_to_movie_df.columns])
    
    knn_input = np.asarray([knn_input_array])
    
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    
    print("Top",n,"users who are very much similar to the User-",user_id, "are: ")
    print(" ")

    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    print("")
    return indices.flatten()[1:] + 1, distances.flatten()[1:]


  def filtered_movie_recommendations(n = 10):
  
    first_zero_index = np.where(mean_rating_list == 0)[0][-1]
    sortd_index = np.argsort(mean_rating_list)[::-1]
    sortd_index = sortd_index[:list(sortd_index).index(first_zero_index)]
    n = min(len(sortd_index),n)
    # movies_watched = list(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])
    movies_watched = list(user_df["movie title"])
    filtered_movie_list = list(movies_list[sortd_index])
    count = 0
    final_movie_list = []
    for i in filtered_movie_list:
      if i not in movies_watched:
        count+=1
        final_movie_list.append(i)
      if count == n:
        break
    if count == 0:
      print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
    else:
      print(final_movie_list)

  similar_user_list, distance_list = get_similar_users(n_similar_users)
  weightage_list = distance_list/np.sum(distance_list)
  mov_rtngs_sim_users = user_to_movie_df.values[similar_user_list]
  movies_list = user_to_movie_df.columns
  weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
  new_rating_matrix = weightage_list*mov_rtngs_sim_users
  mean_rating_list = new_rating_matrix.sum(axis =0)
  print("")
  print("Movies recommended based on similar users are: ")
  print("")
  filtered_movie_recommendations(n_movies)

In [76]:
## create a new user row from movies they like

movies = ["The Matrix",
"The Dark Knight",
"Toy Story",
"The Avengers"]

def new_user_from_movies(movies):
    user_id_list = [-1] * len(movies)
    rating_list = [4.5] * len(movies)
    keys = refined_dataset.columns
    values = [user_id_list, movies, rating_list]
    # dic = {key: value for key in keys for value in values}
    dic = {}
    for col_index in range(len(keys)):
        dic[keys[col_index]] = values[col_index]
    return pd.DataFrame(dic)

new_user_dataaset = new_user_from_movies(movies)
new_user_dataaset.head()
# type(new_user_dataaset["movie title"][0])

new_recommender_system(new_user_dataaset, 15,15)

Movie seen by the User:
['The Matrix', 'The Dark Knight', 'Toy Story', 'The Avengers']

Top 15 users who are very much similar to the User- -1 are: 
 
1 . User: 467 separated by distance of 0.9098737347810836
2 . User: 311 separated by distance of 0.9173625836791792
3 . User: 516 separated by distance of 0.919967980787191
4 . User: 602 separated by distance of 0.9399037689838747
5 . User: 659 separated by distance of 0.9400718706586568
6 . User: 472 separated by distance of 0.97028974759977
7 . User: 547 separated by distance of 0.9851330903193194
8 . User: 564 separated by distance of 0.986993228212468
9 . User: 444 separated by distance of 1.0
10 . User: 445 separated by distance of 1.0
11 . User: 446 separated by distance of 1.0
12 . User: 447 separated by distance of 1.0
13 . User: 448 separated by distance of 1.0
14 . User: 450 separated by distance of 1.0
15 . User: 454 separated by distance of 1.0


Movies recommended based on similar users are: 

['Monsoon Wedding', 'Terminator