# MOVIE RECOMMENDATION using POPULARITY BASED FILTERING

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df_rating = pd.read_csv('D:/Internship/Content_Based_Filtering/ratings.csv', low_memory=False) #Rating file csv
df_movies = pd.read_csv('D:/Internship/Content_Based_Filtering/movies.csv', low_memory=False)  #Movies file csv

In [4]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
df_movie_ratings = pd.merge(df_movies, df_rating, on = 'movieId')   #merging rating and movies files
df_movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [59]:
#getting mean rating for the movies
ratings_mean_count = pd.DataFrame(df_movie_ratings.groupby('title')['rating'].mean())
#getting the count of users who have rated the movie
ratings_mean_count['rating_counts'] = pd.DataFrame(df_movie_ratings.groupby('title')['rating'].count())

In [60]:
#sorting the movies based on rating and count of votes
recommended_movie = pd.DataFrame(ratings_mean_count.sort_values(by = ["rating","rating_counts"], ascending = False))
#get top 10 movie recommendation based on popularity
recommended_movie.head(10)

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Belle époque (1992),5.0,2
Come and See (Idi i smotri) (1985),5.0,2
Enter the Void (2009),5.0,2
Heidi Fleiss: Hollywood Madam (1995),5.0,2
Jonah Who Will Be 25 in the Year 2000 (Jonas qui aura 25 ans en l'an 2000) (1976),5.0,2
Lamerica (1994),5.0,2
Lesson Faust (1994),5.0,2
'Salem's Lot (2004),5.0,1
12 Angry Men (1997),5.0,1
12 Chairs (1976),5.0,1


# MOVIE RECOMMENDATION using CONTENT BASED FILTERING

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [147]:
df_tags = pd.read_csv('D:/Internship/Content_Based_Filtering/tags.csv', low_memory=False)  #Tags file csv

In [148]:
movie_data = pd.merge(df_movies, df_tags, on = 'movieId')  #merging movies and tags files
movie_data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [149]:
#removing duplicate tags
movie_data = movie_data.drop_duplicates(subset='tag')
movie_data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932
5,2,Jumanji (1995),Adventure|Children|Fantasy,62,Robin Williams,1528843907


In [153]:
#concatenating multiple tags for same titles
movie_data['tag'] = movie_data.groupby('title')['tag'].transform(lambda x : ' '.join(x))
movie_data.head(10)

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar fun,1139045764
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,pixar fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy magic board game Robin Williams game,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy magic board game Robin Williams game,1528843932
5,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy magic board game Robin Williams game,1528843907
6,2,Jumanji (1995),Adventure|Children|Fantasy,474,fantasy magic board game Robin Williams game,1137375552
7,3,Grumpier Old Men (1995),Comedy|Romance,289,moldy old,1143424860
8,3,Grumpier Old Men (1995),Comedy|Romance,289,moldy old,1143424860
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy remake,1137373903
10,5,Father of the Bride Part II (1995),Comedy,474,pregnancy remake,1137373903


In [154]:
movie_data = movie_data.drop_duplicates(subset='tag')
movie_data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar fun,1139045764
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy magic board game Robin Williams game,1528843929
7,3,Grumpier Old Men (1995),Comedy|Romance,289,moldy old,1143424860
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy remake,1137373903
12,11,"American President, The (1995)",Comedy|Drama|Romance,474,politics president,1137374904


In [155]:
#removing '|' from genres column and replacing with a blank space
movie_data = movie_data.replace('\|',' ',regex=True).astype(str)
movie_data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336,pixar fun,1139045764
3,2,Jumanji (1995),Adventure Children Fantasy,62,fantasy magic board game Robin Williams game,1528843929
7,3,Grumpier Old Men (1995),Comedy Romance,289,moldy old,1143424860
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy remake,1137373903
12,11,"American President, The (1995)",Comedy Drama Romance,474,politics president,1137374904


In [156]:
#Concatenating genres and tag for every movie
movie_data['text'] = movie_data['genres']+" "+movie_data['tag']
movie_data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,text
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336,pixar fun,1139045764,Adventure Animation Children Comedy Fantasy pi...
3,2,Jumanji (1995),Adventure Children Fantasy,62,fantasy magic board game Robin Williams game,1528843929,Adventure Children Fantasy fantasy magic board...
7,3,Grumpier Old Men (1995),Comedy Romance,289,moldy old,1143424860,Comedy Romance moldy old
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy remake,1137373903,Comedy pregnancy remake
12,11,"American President, The (1995)",Comedy Drama Romance,474,politics president,1137374904,Comedy Drama Romance politics president


In [158]:
# convert TEXT column data to matrix to be able to compute similarity
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=0.0001)
tfidf_matrix = tfidf.fit_transform(movie_data['text'])

print(tfidf_matrix.shape)

(805, 4291)


In [161]:
# compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# create reverse title indices series
indices = pd.Series(movie_data.index, index=movie_data['title'])

In [167]:
def get_recommendation(title):
    # get index of given title
    ind = indices[title]
    
    # get similarity scores along with indices
    sim_scores = list(enumerate(cosine_sim[ind]))
    
    # sort the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get top 10 similarity scores
    sim_scores = sim_scores[1:11]
    
    # get movie indices
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    
    # get movie titles
    movies = movie_data.iloc[movie_indices]['title']
    
    return pd.DataFrame({'Recommended Movie':movies, 'Similarity Score':movie_scores})

In [168]:
n = input()
get_recommendation(n)

Toy Story (1995)


Unnamed: 0,Recommended Movie,Similarity Score
1271,"Bug's Life, A (1998)",0.592555
1527,Toy Story 2 (1999),0.427832
1816,Shrek (2001),0.406555
554,Space Jam (1996),0.387362
3380,The Lego Movie (2014),0.361274
2506,Shrek 2 (2004),0.357712
1204,"Lord of the Rings, The (1978)",0.350297
1216,Watership Down (1978),0.348897
1482,Who Framed Roger Rabbit? (1988),0.348242
1217,"Secret of NIMH, The (1982)",0.316334


In [169]:
n = input()
get_recommendation(n)

American President, The (1995)


Unnamed: 0,Recommended Movie,Similarity Score
3086,District 9 (2009),0.329033
2832,"Prestige, The (2006)",0.292839
2850,"Invisible, The (2007)",0.282986
407,Star Trek: Generations (1994),0.279975
979,Star Trek VI: The Undiscovered Country (1991),0.279731
2814,"Illusionist, The (2006)",0.267631
3245,The Hunger Games (2012),0.263647
3015,Moon (2009),0.259003
1103,"X-Files: Fight the Future, The (1998)",0.256022
101,Congo (1995),0.253982
