In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import PCA
from ast import literal_eval

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'sans-serif' 
plt.rcParams['font.serif'] = 'Ubuntu' 
plt.rcParams['font.monospace'] = 'Ubuntu Mono' 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.labelsize'] = 12 
plt.rcParams['axes.labelweight'] = 'bold' 
plt.rcParams['axes.titlesize'] = 12 
plt.rcParams['xtick.labelsize'] = 12 
plt.rcParams['ytick.labelsize'] = 12 
plt.rcParams['legend.fontsize'] = 12 
plt.rcParams['figure.titlesize'] = 12 
plt.rcParams['image.cmap'] = 'jet' 
plt.rcParams['image.interpolation'] = 'none' 
plt.rcParams['figure.figsize'] = (12, 10) 
plt.rcParams['axes.grid']=True
plt.rcParams['lines.linewidth'] = 2 
plt.rcParams['lines.markersize'] = 8
colors = ['xkcd:pale orange', 'xkcd:sea blue', 'xkcd:pale red', 'xkcd:sage green', 'xkcd:terra cotta', 'xkcd:dull purple', 'xkcd:teal', 'xkcd: goldenrod', 'xkcd:cadet blue',
'xkcd:scarlet']

In [2]:
df_movies = pd.read_csv('datasets/to_use/movies_cleaned.csv')

In [3]:
df_movies.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df_movies

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,...,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,2.888450e+07,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']",...,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1.006234e+09,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...",...,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,8.368481e+08,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],...,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,1.012097e+08,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']",...,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,2.139288e+08,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...",...,0,0,0,0,0,0,0,296,super_high,tier 2 low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,Doggiewoggiez! Poochiewoochiez! (2012),60.0,6.8,182.0,2171.6,,United States,"['Tim Allen,Michael Badalucco,Jim Belushi,Matt...",['nan'],['nan'],...,0,0,0,0,0,0,1,120134,super_low,indie
22692,Bobbikins (1959),90.0,5.0,182.0,,,United Kingdom,"[""Shirley Jones,Max Bygraves,Billie Whitelaw,B...",['Robert Day'],['Oscar Brodney'],...,0,0,0,0,0,0,0,113682,super_low,
22693,Vallen (2001),120.0,5.8,182.0,,,Belgium,"['Lee Williams,Emma Thomas,Alice Krige,Lydia C...",['Hans Herbots'],"['Hans Herbots,John Paul Chapple,Anne Provoost']",...,0,0,0,0,0,0,1,150064,super_low,
22694,Exclusive Story (1936),90.0,6.0,182.0,,,United States,"[""Franchot Tone,Madge Evans,Stuart Erwin,Josep...",['George B. Seitz'],"['Michael Fessier,Martin Mooney']",...,0,0,0,0,0,1,0,150944,super_low,


In [5]:
df_movies.isnull().sum()

title                     0
runtime                 232
imdb_rating               0
num_of_rating             0
budget                14488
                      ...  
Not Rated                 0
certificate_nan           0
movieId                   0
num_of_rating_bins        0
budget_bins           14488
Length: 94, dtype: int64

In [6]:
# drop those without runtime FOR NOW. leave budget
# df_movies = df_movies[df_movies['runtime'].notna()]

In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_movies.head())

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,producer,composer,cinematographer,editor,year,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan,(Banned),13+,15,16+,18+,ADV,ADV16,AO,All,Approved,E,E10+,F,G,GA,GP,K-A,M,M/PG,M18,MA-13,MA-17,NC-16,NC-17,NC16,Open,PG,PG-13,PG13,Passed,R,R(A),R21,T,TV-13,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,28884500.0,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']","['Liz Glotzer,David V. Lester,Niki Marvin']",['Thomas Newman'],['Roger Deakins'],['Richard Francis-Bruce'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1006234000.0,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...","['Kevin de la Noy,Jordan Goldberg,Philip Lee,B...","['James Newton Howard,Hans Zimmer']",['Wally Pfister'],['Lee Smith'],2000.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,836848100.0,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],"['Zakaria Alaoui,John Bernard,Chris Brigham,Jo...",['Hans Zimmer'],['Wally Pfister'],['Lee Smith'],2000.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,101209700.0,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']","['Ross Grayson Bell,Ceán Chaffin,John S. Dorse...","['Dust Brothers,John King,Michael Simpson']",['Jeff Cronenweth'],['James Haygood'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,213928800.0,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...","['Lawrence Bender,Danny DeVito,Richard N. Glad...",['nan'],['Andrzej Sekula'],['Sally Menke'],1990.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,296,super_high,tier 2 low


In [8]:
df_ratings = pd.read_csv('datasets/to_use/ratings_cleaned.csv')

In [9]:
df_ratings.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,147413,1,3.5
1,103254,1,3.0
2,5320,1,5.0
3,1317,1,3.0
4,29725,1,3.5
...,...,...,...
13524164,39183,208941,3.0
13524165,92412,208943,2.0
13524166,84238,209041,3.0
13524167,15152,209053,3.5


# START OF ALGO

In [11]:
# Getting the rating given by a user to a movie.
def get_rating(userId,movieId):
    return (df_ratings.loc[(df_ratings.userId==userId) & (df_ratings.movieId == movieId),'rating'].iloc[0])
# Getting the list of all movie ids the specified user has rated.
def get_movieids(userId):
    return (df_ratings.loc[(df_ratings.userId==userId),'movieId'].tolist())
# Getting the movie titles against the movie id.
def get_movie_title(movieId):
    return (df_movies.loc[(df_movies.movieId == movieId),'title'].iloc[0])
# Getting the movie id against the movie title.
def get_movie_id(title):
    return (df_movies.loc[(df_movies.title == title),'movieId'].iloc[0])


In [12]:
print(get_rating(147413,1))
print(get_movieids(147413))
print(get_movie_title(1))


3.5
[1, 16, 19, 25, 36, 39, 44, 50, 70, 104, 111, 173, 208, 216, 223, 253, 296, 318, 353, 356, 357, 364, 367, 376, 377, 416, 431, 434, 441, 454, 475, 480, 485, 489, 500, 527, 539, 541, 543, 553, 555, 588, 589, 590, 592, 593, 597, 628, 697, 708, 802, 832, 858, 903, 1027, 1036, 1073, 1089, 1090, 1092, 1097, 1127, 1136, 1193, 1197, 1207, 1210, 1214, 1220, 1222, 1230, 1234, 1240, 1242, 1246, 1250, 1258, 1259, 1266, 1270, 1274, 1276, 1291, 1302, 1307, 1343, 1344, 1345, 1357, 1366, 1370, 1391, 1393, 1425, 1466, 1485, 1527, 1584, 1590, 1625, 1644, 1645, 1681, 1682, 1721, 1729, 1748, 1831, 1923, 1955, 1961, 1967, 1968, 1997, 2001, 2011, 2012, 2028, 2054, 2100, 2108, 2115, 2118, 2193, 2194, 2294, 2352, 2371, 2396, 2407, 2420, 2502, 2505, 2571, 2572, 2580, 2605, 2617, 2618, 2628, 2699, 2717, 2719, 2762, 2770, 2797, 2858, 2881, 2890, 2916, 2918, 2950, 2959, 2997, 3005, 3006, 3039, 3101, 3147, 3156, 3178, 3253, 3274, 3301, 3317, 3386, 3408, 3418, 3471, 3481, 3499, 3507, 3524, 3552, 3556, 3698, 375

Step 1: Filter out the movies which are rated by user X and get the genres only

Step 2: Get weighted genres for this user

Step 3: Get a recommendation (user-reviewed also included)

Step 4: Filter the reviewed movies

In [34]:
def get_recomm(userId):
    
    #STEP 1
    
    movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
    user_movies = df_movies[df_movies['movieId'].isin(movie_id_df['movieId'].tolist())]
    user_movies.set_index('movieId', inplace=True)
    print(user_movies)
    user_movies = user_movies.iloc[:,15:44] #columns 15-43 are the genres one hot encoded
    print('----------- START OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies)
    print('----------- END OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies.shape)
    
    #STEP 2
    rating_df=df_ratings.loc[df_ratings['userId']==userId][['movieId','rating']]
    rating_df.set_index('movieId', inplace=True)
#     print(rating_df)
#     print(rating_df.shape)
    #Dot produt to get weights
    userProfile = user_movies.transpose().dot(rating_df)
    print('----------- START OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    print(userProfile)
    print('----------- END OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    
    #STEP 3
    
    movie_genres = df_movies.copy()
    movie_genres.set_index('movieId', inplace=True)
#     print(movie_genres)
    movie_genres = movie_genres.iloc[:,15:44]
#     print(movie_genres.transpose())
    
    recommendation_array=movie_genres.dot(userProfile)/(userProfile.sum())
    print('----------- START OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
    print(recommendation_array)
    print('----------- END OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
#     recommendation_series=pd.Series(recommendation_array).sort_values(ascending=False)
#     recommendation_keys=[keys for keys, value in recommendation_series.items() if value>=0.5 ]'
    recommendation_array.sort_values(by='rating', ascending=False, inplace=True)
    print(recommendation_array.head(50)) # LOOK AT THIS LINE IF YOU WANNA SEE THE PROBLEM (movies with the same genre are given the exact same score)
#   return top 50 recommended
    recommendation_array = recommendation_array.head(50)
#     print(recommendation_array.index.values.tolist())
    rec_movies=df_movies.loc[df_movies['movieId'].isin(recommendation_array.index.values.tolist())]
    print('----------- START OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    print(rec_movies)
    print('----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    
    #STEP 4
    
    user_reviews = df_ratings[df_ratings['userId'] == userId]
    movies_reviews = np.array(user_reviews['movieId'])    
    
    def get_movie_names(movie_ids):
#         '''
#         INPUT
#         movie_ids - a list of movie_ids
#         OUTPUT
#         movies - a list of movie names associated with the movie_ids    
#         '''
        movie_lst = list(df_movies[df_movies['movieId'].isin(movie_ids)]['title'])   
        return movie_lst
    
    movie_names = np.array(get_movie_names(movies_reviews))
    rec_movies=np.array(rec_movies['title'])
    recs=np.setdiff1d(rec_movies, movie_names)
    
    print('----------- DA GRAND REVEAL ----------------------')
    print(recs.size)
    print('50 RECOMMENDED MOVIES FOR USER', userId, ":")
    return recs


print(get_recomm(6969))

                                     title  runtime  imdb_rating  \
movieId                                                            
356                    Forrest Gump (1994)    150.0          8.8   
858                   The Godfather (1972)    180.0          9.2   
593        The Silence of the Lambs (1991)    120.0          8.6   
527                Schindler's List (1993)      NaN          9.0   
1270             Back to the Future (1985)    120.0          8.5   
...                                    ...      ...          ...   
60       The Indian in the Cupboard (1995)    120.0          6.0   
1683          The Wings of the Dove (1997)    120.0          7.1   
1082                  The Candidate (1972)    120.0          7.0   
1014                      Pollyanna (1960)    150.0          7.4   
853                           Dingo (1991)    120.0          6.9   

         num_of_rating      budget  worldwide_gross  \
movieId                                               
356  

In [35]:
print(get_recomm(1))

                                                     title  runtime  \
movieId                                                               
296                                    Pulp Fiction (1994)    180.0   
5952          The Lord of the Rings: The Two Towers (2002)    180.0   
6539     Pirates of the Caribbean: The Curse of the Bla...    150.0   
6377                                   Finding Nemo (2003)    120.0   
7361          Eternal Sunshine of the Spotless Mind (2004)    120.0   
3949                            Requiem for a Dream (2000)    120.0   
2011                     Back to the Future Part II (1989)    120.0   
8360                                        Shrek 2 (2004)    120.0   
6711                            Lost in Translation (2003)    120.0   
2012                    Back to the Future Part III (1990)    120.0   
1250                   The Bridge on the River Kwai (1957)    180.0   
2161                          The NeverEnding Story (1984)    120.0   
1217  

In [36]:
print(get_recomm(2))

                                                     title  runtime  \
movieId                                                               
318                        The Shawshank Redemption (1994)    150.0   
356                                    Forrest Gump (1994)    150.0   
2571                                     The Matrix (1999)    150.0   
4993     The Lord of the Rings: The Fellowship of the R...    180.0   
858                                   The Godfather (1972)    180.0   
...                                                    ...      ...   
534                                     Shadowlands (1993)    150.0   
8010                               The Power of One (1992)    150.0   
4535                       The Man from Snowy River (1982)    120.0   
1465                                       Rosewood (1997)    150.0   
6311                       The Other Side of Heaven (2001)    120.0   

         imdb_rating  num_of_rating       budget  worldwide_gross  \
movieId

In [37]:
print(get_recomm(3))

                                                  title  runtime  imdb_rating  \
movieId                                                                         
318                     The Shawshank Redemption (1994)    150.0          9.3   
58559                            The Dark Knight (2008)    180.0          9.0   
79132                                  Inception (2010)    150.0          8.8   
2959                                  Fight Club (1999)    150.0          8.8   
296                                 Pulp Fiction (1994)    180.0          8.9   
...                                                 ...      ...          ...   
72701                                  Planet 51 (2009)    120.0          6.0   
3993                                      Quills (2000)    150.0          7.3   
5004                                   The Party (1968)    120.0          7.4   
6482     Dumb and Dumberer: When Harry Met Lloyd (2003)     90.0          3.4   
3991                        

In [19]:
your_movie = get_movie_id('Pulp Fiction (1994)')
print(your_movie)

print(get_recomm(your_movie))

296
                                             title  runtime  imdb_rating  \
movieId                                                                    
318                The Shawshank Redemption (1994)    150.0          9.3   
58559                       The Dark Knight (2008)    180.0          9.0   
79132                             Inception (2010)    150.0          8.8   
2959                             Fight Club (1999)    150.0          8.8   
296                            Pulp Fiction (1994)    180.0          8.9   
...                                            ...      ...          ...   
98491                              Paperman (2012)     30.0          8.2   
47999                            Jesus Camp (2006)     90.0          7.4   
56563                             Helvetica (2007)     90.0          7.2   
8208                       The Razor's Edge (1946)    150.0          7.3   
85259    Winnie the Pooh and the Honey Tree (1966)     30.0          8.0   

       