<a href="https://colab.research.google.com/github/vaibhatt/ACM-Mentor-assignments/blob/master/movie_recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [None]:
#downloads movie lens dataset
!wget http://files.grouplens.org/datasets/movielens/ml-latest.zip
!unzip ml-latest.zip

--2020-08-02 12:33:31--  http://files.grouplens.org/datasets/movielens/ml-latest.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 277113433 (264M) [application/zip]
Saving to: ‘ml-latest.zip’


2020-08-02 12:33:49 (15.8 MB/s) - ‘ml-latest.zip’ saved [277113433/277113433]

Archive:  ml-latest.zip
   creating: ml-latest/
  inflating: ml-latest/links.csv     
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/README.txt    
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


In [None]:
# Paths are relative to the working directory, which is where the download cell
# unpacks ml-latest/; this avoids depending on the working directory being /content.
ratings_df = pd.read_csv("ml-latest/ratings.csv")
movies_df = pd.read_csv("ml-latest/movies.csv")

In [None]:
#merge movies_df and ratings_df
#creates a column mean_rating containing mean rating for each title
#creates a column rate_count containing number of ratings for each title
#returns a dataframe containing movies having rate_count more than passed threshold
def process_df(movies_df,ratings_df,thresh = 10000):
  """Join movies with their ratings and keep only frequently rated titles.

  Args:
    movies_df: DataFrame with at least `movieId` and `title` columns.
    ratings_df: DataFrame with at least `movieId`, `userId` and `rating` columns.
    thresh: a title is kept only when its number of ratings is strictly
      greater than this value.

  Returns:
    DataFrame with one row per (movie, rating) pair from the join, plus two
    per-title columns: `mean_rating` and `rate_count`.

  Note:
    Both statistics are grouped by `title`, not `movieId`, so rows of different
    movieIds that share the same title are pooled together.
  """
  df = movies_df.merge(ratings_df,on="movieId")
  # transform() broadcasts each per-title aggregate back onto the original rows,
  # so the ratings-sized frame does not have to be merged with itself twice.
  by_title = df.groupby("title")
  df = df.assign(
    mean_rating = by_title["rating"].transform("mean"),
    rate_count = by_title["userId"].transform("count"),
  )
  return df[df["rate_count"] > thresh]
  


In [None]:
# keep only titles with more than 10000 ratings (process_df's default threshold, spelled out here)
df = process_df(movies_df,ratings_df,thresh=10000)
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,mean_rating,rate_count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,4.0,1113765937,3.886649,68469
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,5.0,948885850,3.886649,68469
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5,1442169375,3.886649,68469
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,4.0,1370810063,3.886649,68469
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,4.0,1237622631,3.886649,68469
...,...,...,...,...,...,...,...,...
27254378,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,283125,5.0,1477786484,3.960429,13659
27254379,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,283163,5.0,1492645084,3.960429,13659
27254380,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,283173,3.0,1487522775,3.960429,13659
27254381,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,283183,3.5,1509584521,3.960429,13659


In [None]:
#builds a title x userId table of ratings and replaces missing entries with 0
def make_pivot(df):
  """Return a ratings table indexed by `title` with one column per `userId`.

  Cells come from the `rating` column via `pivot_table` (default aggregation);
  (title, userId) combinations with no rating are filled with 0.
  """
  return df.pivot_table(values="rating",index="title",columns="userId").fillna(0)

In [None]:
# dense title x userId frame first, then a CSR copy of its values for the neighbour model
pivot_table = make_pivot(df)
movie_matrix = csr_matrix(pivot_table.values)

In [None]:
# Brute-force nearest-neighbour index over the rows (titles) of movie_matrix.
# No metric is passed, so the estimator's default is used; the repr recorded
# below this cell shows metric='minkowski' with p=2.
model = NearestNeighbors(algorithm = "brute")
# fit() is left as the last expression so the cell displays the fitted estimator
model.fit(movie_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
#takes movie name and number of required recommendations as arguments and prints the movies recommended
def recommend(movie_name,num_recommendations):
  """Print the nearest neighbours of `movie_name` as a numbered list.

  Uses the module-level `pivot_table` (titles as index) to find the movie's
  rating row and the module-level fitted `model` to find its neighbours.
  The function only prints; it returns None.

  Args:
    movie_name: title exactly as it appears in `pivot_table.index`.
    num_recommendations: number of neighbouring titles to print.

  Raises:
    IndexError: if `movie_name` is not present in `pivot_table.index`.
  """
  matches = np.where(pivot_table.index==movie_name)[0]
  if len(matches) == 0:
    # same exception type as the bare [0][0] lookup would raise, but with a usable message
    raise IndexError(f"movie '{movie_name}' not found in pivot_table.index")
  query_pos = matches[0]
  query_row = pivot_table.iloc[query_pos,:].values.reshape(1,-1)
  # one extra neighbour is requested because the query row is itself part of the fitted data
  _,suggestion = model.kneighbors(query_row,n_neighbors=num_recommendations+1)
  # drop the query movie by position instead of assuming it is always returned first:
  # a movie with an identical rating row ties at distance 0 and may be ordered ahead of it
  suggestion_list = [i for i in suggestion[0] if i != query_pos][:num_recommendations]
  print(f"{num_recommendations} movie recommendations for movie '{movie_name}' are:")
  for rank,i in enumerate(suggestion_list,start=1):
    print(f"{rank}. {pivot_table.index[i]}")

In [None]:
# look the movie up by title rather than by position 165 in pivot_table.index:
# the position shifts whenever the dataset snapshot or the rating threshold changes
recommend("Curious Case of Benjamin Button, The (2008)",10)

10 movie recommendations for movie 'Curious Case of Benjamin Button, The (2008)' are:
1. Transformers (2007)
2. Day After Tomorrow, The (2004)
3. War of the Worlds (2005)
4. Ocean's Twelve (2004)
5. Batman & Robin (1997)
6. Lara Croft: Tomb Raider (2001)
7. Terminal, The (2004)
8. Troy (2004)
9. Fast and the Furious, The (2001)
10. Planet of the Apes (2001)
