In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
column_names1 = ['user id','movie id','rating','timestamp']
dtype = {'userId':int, 'movieId': int, 'rating': float, 'timestamp': str}
dataset = pd.read_csv('movie_lens_dataset/ratings_small.csv', dtype=dtype)
dataset.columns = column_names1
dataset.head()
# len(dataset["movie id"].unique())

# column_names1 = ['user id','movie id','rating','timestamp']
# dataset = pd.read_csv('movie_lens_dataset/u.data', sep='\t',header=None,names=column_names1)
# dataset.head() 

Unnamed: 0,user id,movie id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
column_names2 = ["movie id", "movie title"]
movie_dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')
movie_dataset = movie_dataset[["id", "title"]].rename(columns={"id": "movie id", "title": "movie title"})
movie_dataset.head()

# d = 'movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
# column_names2 = d.split(' | ')
# items_dataset = pd.read_csv('movie_lens_dataset/u.item', sep='|',header=None,names=column_names2,encoding='latin-1')
# items_dataset['movie title'] = items_dataset['movie title'].str.slice(0,-7)
# movie_dataset = items_dataset[['movie id','movie title']]
# movie_dataset.head()


  movie_dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')


Unnamed: 0,movie id,movie title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [37]:
# len(items_dataset.groupby(by=column_names2[1:])),len(movie_dataset)

## Merging required datasets

In [5]:
merged_dataset = pd.merge(dataset, movie_dataset, how='inner', on='movie id')
merged_dataset.head()
# merged_dataset.info()


Unnamed: 0,user id,movie id,rating,timestamp,movie title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III


In [6]:
# Example of a multiple rating scenario by an user to a specific movie:
merged_dataset[(merged_dataset['movie title'] == 'Chasing Amy') & (merged_dataset['user id'] == 894)]

Unnamed: 0,user id,movie id,rating,timestamp,movie title


In [7]:
# Merge rows with the same movie title and user id by aggregating them to their mean
merged_dataset["rating"] = pd.to_numeric(merged_dataset["rating"], downcast="float")
refined_dataset = merged_dataset.groupby(by=['user id','movie title'], as_index=False).agg({"rating":"mean"}).iloc[1:]
refined_dataset.head()

# refined_dataset = merged_dataset.groupby(by=['user id','movie title'], as_index=False).agg({"rating":"mean"}).iloc[1:]
# refined_dataset.head()


Unnamed: 0,user id,movie title,rating
1,1,Confidentially Yours,2.5
2,1,Greed,1.0
3,1,Jay and Silent Bob Strike Back,2.0
4,1,My Tutor,2.0
5,1,Rocky III,2.5


In [8]:
refined_dataset[refined_dataset["movie title"].str.contains("catch me") ]

Unnamed: 0,user id,movie title,rating


In [9]:
with open("./pickle/refined_dataset.pickle", "wb") as handle:
    pickle.dump(refined_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("refined_dataset saved!")

refined_dataset saved!


## Training KNN model to build collaborative recommender

In [10]:
# Reshaping model in such a way that each user has n-dimensional rating space where n is total number of movies
user_to_movie_df = refined_dataset.pivot(
    index='user id',
     columns='movie title',
      values='rating').fillna(0)

user_to_movie_df.head()

movie title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,Â¡Three Amigos!,Ã nos amours,Ãdipussi,Åaban OÄlu Åaban
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
with open("./pickle/user_to_movie.pickle", "wb") as handle:
    pickle.dump(user_to_movie_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("User_to_movie_df saved!")

User_to_movie_df saved!


In [12]:
# transform matrix to scipy sparse matrix
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
user_to_movie_sparse_df

<671x2658 sparse matrix of type '<class 'numpy.float32'>'
	with 42719 stored elements in Compressed Sparse Row format>

In [13]:
# Fitting KNN model to scipy sparse matrix

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

In [14]:
# Giving Input as User id, Number of similar Users to be considered, Number of top movie we want to recommend

def new_recommender_system(user_df, n_similar_users, n_movies): #, user_to_movie_df, knn_model):
  
  print("Movie seen by the User:")
  print(list(user_df["movie title"]))
  print("")
  user_id = -1

  # def get_similar_users(user, user_to_movie_df, knn_model, n = 5):
  def get_similar_users(n = 5):
    movies = list(user_df["movie title"])

    knn_input_array = np.array([4.5 if col in movies else 0 for col in user_to_movie_df.columns])
    
    knn_input = np.asarray([knn_input_array])
    
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    
    print("Top",n,"users who are very much similar to the User-",user_id, "are: ")
    print(" ")

    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    print("")
    return indices.flatten()[1:] + 1, distances.flatten()[1:]


  def filtered_movie_recommendations(n = 10):
  
    first_zero_index = np.where(mean_rating_list == 0)[0][-1]
    sortd_index = np.argsort(mean_rating_list)[::-1]
    sortd_index = sortd_index[:list(sortd_index).index(first_zero_index)]
    n = min(len(sortd_index),n)
    # movies_watched = list(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])
    movies_watched = list(user_df["movie title"])
    filtered_movie_list = list(movies_list[sortd_index])
    count = 0
    final_movie_list = []
    for i in filtered_movie_list:
      if i not in movies_watched:
        count+=1
        final_movie_list.append(i)
      if count == n:
        break
    if count == 0:
      print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
    else:
      print(final_movie_list)

  similar_user_list, distance_list = get_similar_users(n_similar_users)
  weightage_list = distance_list/np.sum(distance_list)
  mov_rtngs_sim_users = user_to_movie_df.values[similar_user_list]
  movies_list = user_to_movie_df.columns
  weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
  new_rating_matrix = weightage_list*mov_rtngs_sim_users
  mean_rating_list = new_rating_matrix.sum(axis =0)
  print("")
  print("Movies recommended based on similar users are: ")
  print("")
  filtered_movie_recommendations(n_movies)

In [15]:
## create a new user row from movies they like

movies = ["The Matrix",
"The Dark Knight",
"Toy Story",
"The Avengers"]

def new_user_from_movies(movies):
    user_id_list = [-1] * len(movies)
    rating_list = [4.5] * len(movies)
    keys = refined_dataset.columns
    values = [user_id_list, movies, rating_list]
    # dic = {key: value for key in keys for value in values}
    dic = {}
    for col_index in range(len(keys)):
        dic[keys[col_index]] = values[col_index]
    return pd.DataFrame(dic)

new_user_dataaset = new_user_from_movies(movies)
new_user_dataaset.head()
# type(new_user_dataaset["movie title"][0])

new_recommender_system(new_user_dataaset, 5,15)

Movie seen by the User:
['The Matrix', 'The Dark Knight', 'Toy Story', 'The Avengers']

Top 5 users who are very much similar to the User- -1 are: 
 
1 . User: 467 separated by distance of 0.9098737347810836
2 . User: 311 separated by distance of 0.9173625836791792
3 . User: 516 separated by distance of 0.919967980787191
4 . User: 602 separated by distance of 0.9399037689838747
5 . User: 659 separated by distance of 0.9400718706586568


Movies recommended based on similar users are: 

['Monsoon Wedding', 'Silent Hill', 'The 39 Steps', 'Once Were Warriors', 'Rain Man', 'Men in Black II', 'Terminator 3: Rise of the Machines', 'To Kill a Mockingbird', 'Three Colors: Red', 'Ariel', 'A Nightmare on Elm Street', 'Reservoir Dogs', 'Judgment Night', 'Grill Point', 'Sissi']


In [16]:
# load pickle file

user_to_movie_df = pd.read_pickle('pickle/user_to_movie.pickle')
refined_dataset = pd.read_pickle('pickle/refined_dataset.pickle')
dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')

# transform matrix to scipy sparse matrix

user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)

# Fitting KNN model to scipy sparse matrix

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

## create a new user row from movies they like

# movies = ['Air Bud', 'Air Force One', 'Beverly Hills Ninja', 'Booty Call', 'Bulletproof', 'Conspiracy Theory', 'Fargo', 'Jack', 'Jungle2Jungle', 'Liar Liar', 'Love Jones', "McHale's Navy", 'Men in Black', 'Mimic', 'Money Talks', 'Murder at 1600', 'Private Parts', 'Scream', 'Thin Line Between Love and Hate, A']

def new_user_from_movies(movies):
    user_id_list = [-1] * len(movies)
    rating_list = [4.5] * len(movies)
    keys = refined_dataset.columns
    values = [user_id_list, movies, rating_list]
    # dic = {key: value for key in keys for value in values}
    dic = {}
    for col_index in range(len(keys)):
        dic[keys[col_index]] = values[col_index]
    # print(pd.DataFrame(dic))
    return pd.DataFrame(dic)

#  Giving Input as User id, Number of similar Users to be considered, Number of top movie we want to recommend

def new_recommender_system(user_df, n_similar_users, n_movies): #, user_to_movie_df, knn_model):
  
  print("Movie seen by the User:")
  print(list(user_df["movie title"]))
  print("")
  user_id = -1

  # def get_similar_users(user, user_to_movie_df, knn_model, n = 5):
  def get_similar_users(n = 5):
    movies = list(user_df["movie title"])
    print(movies)
    knn_input_array = np.array([4.5 if col in movies else 0 for col in user_to_movie_df.columns])
    
    knn_input = np.asarray([knn_input_array])
    
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    
    print("Top",n,"users who are very much similar to the User-",user_id, "are: ")
    print(" ")

    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    print("")
    return indices.flatten()[1:] + 1, distances.flatten()[1:]


  def filtered_movie_recommendations(n = 10):
  
    first_zero_index = np.where(mean_rating_list == 0)[0][-1]
    sortd_index = np.argsort(mean_rating_list)[::-1]
    sortd_index = sortd_index[:list(sortd_index).index(first_zero_index)]
    n = min(len(sortd_index),n)
    # movies_watched = list(refined_dataset[refined_dataset['user id'] == user_id]['movie title'])
    movies_watched = list(user_df["movie title"])
    filtered_movie_list = list(movies_list[sortd_index])
    count = 0
    final_movie_list = []
    for i in filtered_movie_list:
      if i not in movies_watched:
        count+=1
        final_movie_list.append(i)
      if count == n:
        break
    if count == 0:
      print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
    else:
      return final_movie_list

  similar_user_list, distance_list = get_similar_users(n_similar_users)
  weightage_list = distance_list/np.sum(distance_list)
  mov_rtngs_sim_users = user_to_movie_df.values[similar_user_list]
  movies_list = user_to_movie_df.columns
  weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
  new_rating_matrix = weightage_list*mov_rtngs_sim_users
  mean_rating_list = new_rating_matrix.sum(axis =0)
  print("")
  print("Movies recommended based on similar users are: ")
  print("")
  return filtered_movie_recommendations(n_movies)    
  
def collaborative_recommender(movies):
    new_user_dataset = new_user_from_movies(movies)
    # new_user_dataset.head()
    titles = new_recommender_system(new_user_dataset, 5,15)
    return dataset[dataset["title"].isin(titles)][["title", "poster_path", "release_date"]]

    # # new_user_dataset.head()
    # titles = new_recommender_system(new_user_dataset, 15,15)
    # return dataset[dataset["title"].isin(titles)][["title", "poster_path", "release_date"]]

test_movies = ["The Devil Wears Prada",
"Mean Girls",
"Sex and the City",]

# new_user_dataset = new_user_from_movies(test_movies)


# print("Recommended movies: ", new_recommender_system(new_user_dataset, 15, 15))
# print("Recommended movies1: ", collaborative_recommender(test_movies))

out_df = collaborative_recommender(test_movies)
out_df
# title, poster_path


Movie seen by the User:
['The Devil Wears Prada', 'Mean Girls', 'Sex and the City']

['The Devil Wears Prada', 'Mean Girls', 'Sex and the City']
Top 5 users who are very much similar to the User- -1 are: 
 
1 . User: 210 separated by distance of 0.815916910936624
2 . User: 421 separated by distance of 0.8611579730998753
3 . User: 124 separated by distance of 0.8670355009370933
4 . User: 127 separated by distance of 0.8694417580332267
5 . User: 649 separated by distance of 0.8790165203760432


Movies recommended based on similar users are: 



  dataset = pd.read_csv('movie_lens_dataset/movies_metadata_processed.csv', encoding='latin-1')


Unnamed: 0,title,poster_path,release_date
884,2001: A Space Odyssey,/90T7b2LIrL07ndYQBmSm09yqVEH.jpg,1968-04-10
1148,To Kill a Mockingbird,/gQg6sPYfNTUlf8wEtydzWl09RyR.jpg,1962-12-25
1160,Psycho,/81d8oyEFgj7FlxJqSDXWr8JH8kV.jpg,1960-06-16
1619,Titanic,/kHXEpyfl6zqn8a6YuozZUujufXf.jpg,1997-11-18
2041,Rope,/8JUyhb3j4d4KMPizrPUyDcYfovY.jpg,1948-08-23
2110,Say Anything...,/oVlRVSZx5IpQCC6uymBYpP1fggB.jpg,1989-04-14
3328,Solaris,/nsGLKlwEOtqatz8yRdxOlAw5utr.jpg,1972-03-20
4920,Monsoon Wedding,/2LysDwqhK0GlGtQqjPCiDeiBoqU.jpg,2001-08-30
5232,Men in Black II,/qWjRfBwr4VculczswwojXgoU0mq.jpg,2002-07-03
6700,The Passion of Joan of Arc,/5HL0dEJfd7PF0eRiKz8BiNfe8Tf.jpg,1928-04-21
