In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import PCA
from ast import literal_eval

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'sans-serif' 
plt.rcParams['font.serif'] = 'Ubuntu' 
plt.rcParams['font.monospace'] = 'Ubuntu Mono' 
plt.rcParams['font.size'] = 14 
plt.rcParams['axes.labelsize'] = 12 
plt.rcParams['axes.labelweight'] = 'bold' 
plt.rcParams['axes.titlesize'] = 12 
plt.rcParams['xtick.labelsize'] = 12 
plt.rcParams['ytick.labelsize'] = 12 
plt.rcParams['legend.fontsize'] = 12 
plt.rcParams['figure.titlesize'] = 12 
plt.rcParams['image.cmap'] = 'jet' 
plt.rcParams['image.interpolation'] = 'none' 
plt.rcParams['figure.figsize'] = (12, 10) 
plt.rcParams['axes.grid']=True
plt.rcParams['lines.linewidth'] = 2 
plt.rcParams['lines.markersize'] = 8
colors = ['xkcd:pale orange', 'xkcd:sea blue', 'xkcd:pale red', 'xkcd:sage green', 'xkcd:terra cotta', 'xkcd:dull purple', 'xkcd:teal', 'xkcd: goldenrod', 'xkcd:cadet blue',
'xkcd:scarlet']

In [2]:
df_movies = pd.read_csv('datasets/to_use/movies_cleaned.csv')

In [3]:
df_movies.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df_movies

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,...,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,2.888450e+07,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']",...,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1.006234e+09,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...",...,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,8.368481e+08,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],...,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,1.012097e+08,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']",...,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,2.139288e+08,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...",...,0,0,0,0,0,0,0,296,super_high,tier 2 low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,Doggiewoggiez! Poochiewoochiez! (2012),60.0,6.8,182.0,2171.6,,United States,"['Tim Allen,Michael Badalucco,Jim Belushi,Matt...",['nan'],['nan'],...,0,0,0,0,0,0,1,120134,super_low,indie
22692,Bobbikins (1959),90.0,5.0,182.0,,,United Kingdom,"[""Shirley Jones,Max Bygraves,Billie Whitelaw,B...",['Robert Day'],['Oscar Brodney'],...,0,0,0,0,0,0,0,113682,super_low,
22693,Vallen (2001),120.0,5.8,182.0,,,Belgium,"['Lee Williams,Emma Thomas,Alice Krige,Lydia C...",['Hans Herbots'],"['Hans Herbots,John Paul Chapple,Anne Provoost']",...,0,0,0,0,0,0,1,150064,super_low,
22694,Exclusive Story (1936),90.0,6.0,182.0,,,United States,"[""Franchot Tone,Madge Evans,Stuart Erwin,Josep...",['George B. Seitz'],"['Michael Fessier,Martin Mooney']",...,0,0,0,0,0,1,0,150944,super_low,


In [5]:
df_movies_original = df_movies.copy()

In [6]:
df_movies.isnull().sum()

title                     0
runtime                 232
imdb_rating               0
num_of_rating             0
budget                14488
                      ...  
Not Rated                 0
certificate_nan           0
movieId                   0
num_of_rating_bins        0
budget_bins           14488
Length: 94, dtype: int64

In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_movies.head())

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,producer,composer,cinematographer,editor,year,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan,(Banned),13+,15,16+,18+,ADV,ADV16,AO,All,Approved,E,E10+,F,G,GA,GP,K-A,M,M/PG,M18,MA-13,MA-17,NC-16,NC-17,NC16,Open,PG,PG-13,PG13,Passed,R,R(A),R21,T,TV-13,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,28884500.0,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']","['Liz Glotzer,David V. Lester,Niki Marvin']",['Thomas Newman'],['Roger Deakins'],['Richard Francis-Bruce'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1006234000.0,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...","['Kevin de la Noy,Jordan Goldberg,Philip Lee,B...","['James Newton Howard,Hans Zimmer']",['Wally Pfister'],['Lee Smith'],2000.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,836848100.0,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],"['Zakaria Alaoui,John Bernard,Chris Brigham,Jo...",['Hans Zimmer'],['Wally Pfister'],['Lee Smith'],2000.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,101209700.0,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']","['Ross Grayson Bell,Ceán Chaffin,John S. Dorse...","['Dust Brothers,John King,Michael Simpson']",['Jeff Cronenweth'],['James Haygood'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,213928800.0,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...","['Lawrence Bender,Danny DeVito,Richard N. Glad...",['nan'],['Andrzej Sekula'],['Sally Menke'],1990.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,296,super_high,tier 2 low


In [8]:
df_ratings = pd.read_csv('datasets/to_use/ratings_cleaned.csv')

In [9]:
df_ratings.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,147413,1,3.5
1,103254,1,3.0
2,5320,1,5.0
3,1317,1,3.0
4,29725,1,3.5
...,...,...,...
13524164,39183,208941,3.0
13524165,92412,208943,2.0
13524166,84238,209041,3.0
13524167,15152,209053,3.5


# START OF ALGO: GENRE ONLY

In [11]:
# Getting the rating given by a user to a movie.
def get_rating(userId,movieId):
    return (df_ratings.loc[(df_ratings.userId==userId) & (df_ratings.movieId == movieId),'rating'].iloc[0])
# Getting the list of all movie ids the specified user has rated.
def get_movieids(userId):
    return (df_ratings.loc[(df_ratings.userId==userId),'movieId'].tolist())
# Getting the movie titles against the movie id.
def get_movie_title(movieId):
    return (df_movies_original.loc[(df_movies_original.movieId == movieId),'title'].iloc[0])
# Getting the movie id against the movie title.
def get_movie_id(title):
    return (df_movies_original.loc[(df_movies_original.title == title),'movieId'].iloc[0])


In [12]:
print(get_rating(147413,1))
print(get_movieids(147413))
print(get_movie_title(1))


3.5
[1, 16, 19, 25, 36, 39, 44, 50, 70, 104, 111, 173, 208, 216, 223, 253, 296, 318, 353, 356, 357, 364, 367, 376, 377, 416, 431, 434, 441, 454, 475, 480, 485, 489, 500, 527, 539, 541, 543, 553, 555, 588, 589, 590, 592, 593, 597, 628, 697, 708, 802, 832, 858, 903, 1027, 1036, 1073, 1089, 1090, 1092, 1097, 1127, 1136, 1193, 1197, 1207, 1210, 1214, 1220, 1222, 1230, 1234, 1240, 1242, 1246, 1250, 1258, 1259, 1266, 1270, 1274, 1276, 1291, 1302, 1307, 1343, 1344, 1345, 1357, 1366, 1370, 1391, 1393, 1425, 1466, 1485, 1527, 1584, 1590, 1625, 1644, 1645, 1681, 1682, 1721, 1729, 1748, 1831, 1923, 1955, 1961, 1967, 1968, 1997, 2001, 2011, 2012, 2028, 2054, 2100, 2108, 2115, 2118, 2193, 2194, 2294, 2352, 2371, 2396, 2407, 2420, 2502, 2505, 2571, 2572, 2580, 2605, 2617, 2618, 2628, 2699, 2717, 2719, 2762, 2770, 2797, 2858, 2881, 2890, 2916, 2918, 2950, 2959, 2997, 3005, 3006, 3039, 3101, 3147, 3156, 3178, 3253, 3274, 3301, 3317, 3386, 3408, 3418, 3471, 3481, 3499, 3507, 3524, 3552, 3556, 3698, 375

In [13]:
df_movies_original = df_movies.copy()

In [14]:
df_movies = df_movies_original[['movieId']]

In [15]:
genres_columns = df_movies_original.iloc[:,15:44]
df_movies = df_movies.join(genres_columns)

In [16]:
df_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan
0,318,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,58559,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,79132,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2959,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,296,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,120134,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22692,113682,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22693,150064,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
22694,150944,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Step 1: Filter out the movies which are rated by user X and get the genres only

Step 2: Get weighted genres for this user

Step 3: Get a recommendation (user-reviewed also included)

Step 4: Filter the reviewed movies

In [17]:
def get_recomm(userId):
    
    #STEP 1
    
    movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
    user_movies = df_movies[df_movies['movieId'].isin(movie_id_df['movieId'].tolist())]
    user_movies.set_index('movieId', inplace=True)
    print(user_movies)
#     CHANGE LINE BELOW DEPENDING ON VARIABLES YOU WANNA INCLUDE
#     user_movies = user_movies.iloc[:,15:44] #columns 15-43 are the genres one hot encoded
    print('----------- START OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies)
    print('----------- END OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies.shape)
    
    #STEP 2
    rating_df=df_ratings.loc[df_ratings['userId']==userId][['movieId','rating']]
    rating_df.set_index('movieId', inplace=True)
#     print(rating_df)
#     print(rating_df.shape)
    #Dot produt to get weights
    userProfile = user_movies.transpose().dot(rating_df)
    print('----------- START OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    print(userProfile)
    print('----------- END OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    
    #STEP 3
    
    movie_genres = df_movies.copy()
    movie_genres.set_index('movieId', inplace=True)
#     print(movie_genres)
#     movie_genres = movie_genres.iloc[:,15:44]
#     print(movie_genres.transpose())
    
    recommendation_array=movie_genres.dot(userProfile)/(userProfile.sum())
    print('----------- START OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
    print(recommendation_array)
    print('----------- END OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
#     recommendation_series=pd.Series(recommendation_array).sort_values(ascending=False)
#     recommendation_keys=[keys for keys, value in recommendation_series.items() if value>=0.5 ]'
    recommendation_array.sort_values(by='rating', ascending=False, inplace=True)
    print(recommendation_array.head(50)) # LOOK AT THIS LINE IF YOU WANNA SEE THE PROBLEM (movies with the same genre are given the exact same score)
#   return top 50 recommended
    recommendation_array = recommendation_array.head(50)
#     print(recommendation_array.index.values.tolist())
    rec_movies=df_movies_original.loc[df_movies_original['movieId'].isin(recommendation_array.index.values.tolist())]
    print('----------- START OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    print(rec_movies)
    print('----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    
    #STEP 4
    
    user_reviews = df_ratings[df_ratings['userId'] == userId]
    movies_reviews = np.array(user_reviews['movieId'])    
    
    def get_movie_names(movie_ids):
#         '''
#         INPUT
#         movie_ids - a list of movie_ids
#         OUTPUT
#         movies - a list of movie names associated with the movie_ids    
#         '''
        movie_list = list(df_movies_original[df_movies_original['movieId'].isin(movie_ids)]['title'])   
        return movie_list
    
    movie_names = np.array(get_movie_names(movies_reviews))
    rec_movies=np.array(rec_movies['title'])
    recs=np.setdiff1d(rec_movies, movie_names)
    
    print('----------- DA GRAND REVEAL ----------------------')
    print(recs.size)
    print('50 RECOMMENDED MOVIES FOR USER', userId, ":")
    return recs


print(get_recomm(1))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
296           0      0          0          0          0       0      1   
5952          1      0          1          0          0       0      0   
6539          1      0          1          0          0       0      0   
6377          0      0          1          1          0       1      0   
7361          0      0          0          0          0       0      0   
3949          0      0          0          0          0       0      0   
2011          0      0          1          0          0       1      0   
8360          0      0          1          1          0       1      0   
6711          0      0          0          0          0       1      0   
2012          0      0          1          0          0       1      0   
1250          0      0          1          0          0       0      0   
2161          0      0          1     

In [18]:
print(get_recomm(1))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
296           0      0          0          0          0       0      1   
5952          1      0          1          0          0       0      0   
6539          1      0          1          0          0       0      0   
6377          0      0          1          1          0       1      0   
7361          0      0          0          0          0       0      0   
3949          0      0          0          0          0       0      0   
2011          0      0          1          0          0       1      0   
8360          0      0          1          1          0       1      0   
6711          0      0          0          0          0       1      0   
2012          0      0          1          0          0       1      0   
1250          0      0          1          0          0       0      0   
2161          0      0          1     

In [19]:
print(get_recomm(2))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
318           0      0          0          0          0       0      0   
356           0      0          0          0          0       0      0   
2571          1      0          0          0          0       0      0   
4993          1      0          1          0          0       0      0   
858           0      0          0          0          0       0      1   
...         ...    ...        ...        ...        ...     ...    ...   
534           0      0          0          0          1       0      0   
8010          0      0          0          0          0       0      0   
4535          0      0          1          0          0       0      0   
1465          1      0          0          0          0       0      0   
6311          0      0          1          0          1       0      0   

         Documentary  Drama  Family  

In [20]:
print(get_recomm(3))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
318           0      0          0          0          0       0      0   
58559         1      0          0          0          0       0      1   
79132         1      0          1          0          0       0      0   
2959          0      0          0          0          0       0      0   
296           0      0          0          0          0       0      1   
...         ...    ...        ...        ...        ...     ...    ...   
72701         0      0          1          1          0       1      0   
3993          0      0          0          0          1       0      0   
5004          0      0          0          0          0       1      0   
6482          0      0          0          0          0       1      0   
3991          0      0          1          0          0       1      0   

         Documentary  Drama  Family  

In [21]:
your_movie = get_movie_id('Pulp Fiction (1994)')
print(your_movie)

print(get_recomm(your_movie))

296
         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
318           0      0          0          0          0       0      0   
58559         1      0          0          0          0       0      1   
79132         1      0          1          0          0       0      0   
2959          0      0          0          0          0       0      0   
296           0      0          0          0          0       0      1   
...         ...    ...        ...        ...        ...     ...    ...   
98491         0      0          0          1          0       1      0   
47999         0      0          0          0          0       0      0   
56563         0      0          0          0          0       0      0   
8208          0      0          0          0          0       0      0   
85259         0      0          0          1          0       1      0   

         Documentary  Drama  Fami

----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------
----------- DA GRAND REVEAL ----------------------
50
50 RECOMMENDED MOVIES FOR USER 296 :
['Aanandam (2016)' 'Adventures in the Sin Bin (2012)' 'Aferim! (2015)'
 'Almost Famous (2000)' 'Away We Go (2009)' 'Bass Ackwards (2010)'
 'Beverly Hills Chihuahua (2008)' 'Born to Be Wild (1995)' 'Buddy (1997)'
 'Camelot (1967)' 'Candleshoe (1977)' 'Casanova (2005)'
 'Charlie & Boots (2009)' 'College Road Trip (2008)'
 'Davy Crockett and the River Pirates (1956)' 'Dreams Come True (1984)'
 'For the Love of Benji (1977)' 'Girls Trip (2017)' 'Gormenghast (2000)'
 'Grimm (2003)' 'Hunt for the Wilderpeople (2016)' 'Joseph Andrews (1977)'
 'Josh and S.A.M. (1993)' 'Karwaan (2018)' 'Little Big Man (1970)'
 "Nim's Island (2008)" 'North 24 Kaatham (2013)'
 'Once Upon a Honeymoon (1942)' 'Os Saltimbancos Trapalhões (1981)'
 'Race the Sun (1996)' 'Riffraff (1947)' 'Rocca verändert die Welt (2019)'
 'Romance in Manhattan (

In [22]:
df_movies_original[df_movies_original['title'] == 'Amazons and Gladiators (2001)'].loc[:,'movieId']

12222    185361
Name: movieId, dtype: int64

# now we add budget and num of rating into the mix

In [23]:
df_movies = pd.read_csv('datasets/to_use/movies_cleaned.csv')

In [24]:
df_movies.drop('Unnamed: 0', axis=1, inplace=True)

In [25]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_movies.head())

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,producer,composer,cinematographer,editor,year,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan,(Banned),13+,15,16+,18+,ADV,ADV16,AO,All,Approved,E,E10+,F,G,GA,GP,K-A,M,M/PG,M18,MA-13,MA-17,NC-16,NC-17,NC16,Open,PG,PG-13,PG13,Passed,R,R(A),R21,T,TV-13,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,28884500.0,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']","['Liz Glotzer,David V. Lester,Niki Marvin']",['Thomas Newman'],['Roger Deakins'],['Richard Francis-Bruce'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1006234000.0,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...","['Kevin de la Noy,Jordan Goldberg,Philip Lee,B...","['James Newton Howard,Hans Zimmer']",['Wally Pfister'],['Lee Smith'],2000.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,836848100.0,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],"['Zakaria Alaoui,John Bernard,Chris Brigham,Jo...",['Hans Zimmer'],['Wally Pfister'],['Lee Smith'],2000.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,101209700.0,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']","['Ross Grayson Bell,Ceán Chaffin,John S. Dorse...","['Dust Brothers,John King,Michael Simpson']",['Jeff Cronenweth'],['James Haygood'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,213928800.0,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...","['Lawrence Bender,Danny DeVito,Richard N. Glad...",['nan'],['Andrzej Sekula'],['Sally Menke'],1990.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,296,super_high,tier 2 low


In [26]:
df_ratings = pd.read_csv('datasets/to_use/ratings_cleaned.csv')

In [27]:
df_ratings.drop('Unnamed: 0', axis=1, inplace=True)

adding budget and num of rating to df_movies

In [28]:
df_movies_original = df_movies.copy()

In [29]:
df_movies_original

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,...,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,2.888450e+07,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']",...,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1.006234e+09,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...",...,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,8.368481e+08,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],...,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,1.012097e+08,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']",...,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,2.139288e+08,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...",...,0,0,0,0,0,0,0,296,super_high,tier 2 low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,Doggiewoggiez! Poochiewoochiez! (2012),60.0,6.8,182.0,2171.6,,United States,"['Tim Allen,Michael Badalucco,Jim Belushi,Matt...",['nan'],['nan'],...,0,0,0,0,0,0,1,120134,super_low,indie
22692,Bobbikins (1959),90.0,5.0,182.0,,,United Kingdom,"[""Shirley Jones,Max Bygraves,Billie Whitelaw,B...",['Robert Day'],['Oscar Brodney'],...,0,0,0,0,0,0,0,113682,super_low,
22693,Vallen (2001),120.0,5.8,182.0,,,Belgium,"['Lee Williams,Emma Thomas,Alice Krige,Lydia C...",['Hans Herbots'],"['Hans Herbots,John Paul Chapple,Anne Provoost']",...,0,0,0,0,0,0,1,150064,super_low,
22694,Exclusive Story (1936),90.0,6.0,182.0,,,United States,"[""Franchot Tone,Madge Evans,Stuart Erwin,Josep...",['George B. Seitz'],"['Michael Fessier,Martin Mooney']",...,0,0,0,0,0,1,0,150944,super_low,


In [30]:
df_movies = df_movies_original[['movieId']]

In [31]:
genres_columns = df_movies_original.iloc[:,15:44]
print(genres_columns.head())
df_movies = df_movies.join(genres_columns)

   Action  Adult  Adventure  Animation  Biography  Comedy  Crime  Documentary  \
0       0      0          0          0          0       0      0            0   
1       1      0          0          0          0       0      1            0   
2       1      0          1          0          0       0      0            0   
3       0      0          0          0          0       0      0            0   
4       0      0          0          0          0       0      1            0   

   Drama  Family  ...  Reality-TV  Romance  Sci-Fi  Short  Sport  Talk-Show  \
0      1       0  ...           0        0       0      0      0          0   
1      1       0  ...           0        0       0      0      0          0   
2      0       0  ...           0        0       1      0      0          0   
3      1       0  ...           0        0       0      0      0          0   
4      1       0  ...           0        0       0      0      0          0   

   Thriller  War  Western  genre_nan  

In [32]:
num_rating_columns = pd.get_dummies(df_movies_original.num_of_rating_bins, prefix='num_rating')
print(num_rating_columns.head())
df_movies = df_movies.join(num_rating_columns)

   num_rating_high  num_rating_low  num_rating_medium  num_rating_super_high  \
0                0               0                  0                      1   
1                0               0                  0                      1   
2                0               0                  0                      1   
3                0               0                  0                      1   
4                0               0                  0                      1   

   num_rating_super_low  
0                     0  
1                     0  
2                     0  
3                     0  
4                     0  


In [33]:
budget_columns = pd.get_dummies(df_movies_original.budget_bins, prefix='budget')
print(budget_columns.head())
df_movies = df_movies.join(budget_columns)

   budget_high end  budget_indie  budget_medium  budget_micro budget  \
0                0             0              1                    0   
1                1             0              0                    0   
2                1             0              0                    0   
3                1             0              0                    0   
4                0             0              0                    0   

   budget_tier 1 low  budget_tier 2 low  budget_tier 3 low  
0                  0                  0                  0  
1                  0                  0                  0  
2                  0                  0                  0  
3                  0                  0                  0  
4                  0                  1                  0  


In [34]:
df_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,num_rating_medium,num_rating_super_high,num_rating_super_low,budget_high end,budget_indie,budget_medium,budget_micro budget,budget_tier 1 low,budget_tier 2 low,budget_tier 3 low
0,318,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
1,58559,1,0,0,0,0,0,1,0,1,...,0,1,0,1,0,0,0,0,0,0
2,79132,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,2959,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4,296,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,120134,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
22692,113682,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22693,150064,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
22694,150944,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [35]:
df_ratings = df_ratings.sort_values(by=['userId'])

In [36]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
10623028,1,31956,3.5
5716526,1,2161,3.5
5429904,1,2012,2.5
8611552,1,4703,4.0
8012996,1,3949,5.0


In [37]:
df_ratings[df_ratings.userId == 1]

Unnamed: 0,userId,movieId,rating
10623028,1,31956,3.5
5716526,1,2161,3.5
5429904,1,2012,2.5
8611552,1,4703,4.0
8012996,1,3949,5.0
10028476,1,7327,3.5
9524192,1,6377,4.0
9294655,1,5952,4.0
3693178,1,1217,3.5
9661731,1,6711,5.0


In [38]:
def get_recomm(userId):
    
    #STEP 1
    
    movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
    user_movies = df_movies[df_movies['movieId'].isin(movie_id_df['movieId'].tolist())]
    user_movies.set_index('movieId', inplace=True)
    print(user_movies)
#     CHANGE LINE BELOW DEPENDING ON VARIABLES YOU WANNA INCLUDE
#     user_movies = user_movies.iloc[:,15:44] #columns 15-43 are the genres one hot encoded
    print('----------- START OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies)
    print('----------- END OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies.shape)
    
    #STEP 2
    rating_df=df_ratings.loc[df_ratings['userId']==userId][['movieId','rating']]
    rating_df.set_index('movieId', inplace=True)
#     print(rating_df)
#     print(rating_df.shape)
    #Dot produt to get weights
    userProfile = user_movies.transpose().dot(rating_df)
    print('----------- START OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    print(userProfile)
    print('----------- END OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    
    #STEP 3
    
    movie_genres = df_movies.copy()
    movie_genres.set_index('movieId', inplace=True)
#     print(movie_genres)
#     movie_genres = movie_genres.iloc[:,15:44]
#     print(movie_genres.transpose())
    
    recommendation_array=movie_genres.dot(userProfile)/(userProfile.sum())
    print('----------- START OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
    print(recommendation_array)
    print('----------- END OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
#     recommendation_series=pd.Series(recommendation_array).sort_values(ascending=False)
#     recommendation_keys=[keys for keys, value in recommendation_series.items() if value>=0.5 ]'
    recommendation_array.sort_values(by='rating', ascending=False, inplace=True)
    print(recommendation_array.head(50)) # LOOK AT THIS LINE IF YOU WANNA SEE THE PROBLEM (movies with the same genre are given the exact same score)
#   return top 50 recommended
    recommendation_array = recommendation_array.head(50)
#     print(recommendation_array.index.values.tolist())
    rec_movies=df_movies_original.loc[df_movies_original['movieId'].isin(recommendation_array.index.values.tolist())]
    print('----------- START OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    print(rec_movies)
    print('----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    
    #STEP 4
    
    user_reviews = df_ratings[df_ratings['userId'] == userId]
    movies_reviews = np.array(user_reviews['movieId'])    
    
    def get_movie_names(movie_ids):
#         '''
#         INPUT
#         movie_ids - a list of movie_ids
#         OUTPUT
#         movies - a list of movie names associated with the movie_ids    
#         '''
        movie_list = list(df_movies_original[df_movies_original['movieId'].isin(movie_ids)]['title'])   
        return movie_list
    
    movie_names = np.array(get_movie_names(movies_reviews))
    rec_movies=np.array(rec_movies['title'])
    recs=np.setdiff1d(rec_movies, movie_names)
    
    print('----------- DA GRAND REVEAL ----------------------')
    print(recs.size)
    print('50 RECOMMENDED MOVIES FOR USER', userId, ":")
    return recs


print(get_recomm(1))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
296           0      0          0          0          0       0      1   
5952          1      0          1          0          0       0      0   
6539          1      0          1          0          0       0      0   
6377          0      0          1          1          0       1      0   
7361          0      0          0          0          0       0      0   
3949          0      0          0          0          0       0      0   
2011          0      0          1          0          0       1      0   
8360          0      0          1          1          0       1      0   
6711          0      0          0          0          0       1      0   
2012          0      0          1          0          0       1      0   
1250          0      0          1          0          0       0      0   
2161          0      0          1     

# trying with custom movie ratings

In [39]:
df_ratings = pd.read_csv('datasets/to_use/ratings_custom1.csv')

In [40]:
print(get_recomm(696969))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
318           0      0          0          0          0       0      0   
296           0      0          0          0          0       0      1   
858           0      0          0          0          0       0      1   
68157         0      0          1          0          0       0      0   
106782        0      0          0          0          1       1      1   
89745         1      0          1          0          0       0      0   
72998         1      0          1          0          0       0      0   
112556        0      0          0          0          0       0      0   
91500         1      0          1          0          0       0      0   
122892        1      0          1          0          0       0      0   
86332         1      0          1          0          0       0      0   
111           0      0          0     

In [41]:
df_ratings = pd.read_csv('datasets/to_use/ratings_custom2.csv')

In [42]:
print(get_recomm(696969))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
5989          0      0          0          0          1       0      1   
122892        1      0          1          0          0       0      0   
134130        0      0          1          0          0       0      0   
86332         1      0          1          0          0       0      0   
110102        1      0          1          0          0       0      0   
78499         0      0          1          1          0       1      0   
122916        1      0          1          0          0       1      0   
122922        1      0          1          0          0       0      0   
47099         0      0          0          0          1       0      0   
115617        1      0          1          1          0       0      0   
551           0      0          0          1          0       0      0   
2273          1      0          0     

[50 rows x 94 columns]
----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------
----------- DA GRAND REVEAL ----------------------
48
50 RECOMMENDED MOVIES FOR USER 696969 :
['10,000 BC (2008)' 'Almost Famous (2000)' 'Anastasia (1997)'
 'Ant-Man and the Wasp (2018)' 'Austin Powers in Goldmember (2002)'
 'Australia (2008)' 'Big Fish (2003)' "Charlie's Angels (2000)"
 "Charlie's Angels: Full Throttle (2003)" 'Cold Mountain (2003)'
 'Dawn of the Planet of the Apes (2014)' 'Deadpool (2016)'
 'Get Smart (2008)' 'Gladiator (2000)' 'Guardians of the Galaxy (2014)'
 'Interstellar (2014)' 'Jumanji: Welcome to the Jungle (2017)'
 'King Arthur: Legend of the Sword (2017)' 'King Kong (2005)'
 'Kingsman: The Golden Circle (2017)' 'Knight and Day (2010)'
 'Last Action Hero (1993)'
 'Master and Commander: The Far Side of the World (2003)'
 'Men in Black: International (2019)' 'Okja (2017)' 'R.I.P.D. (2013)'
 'Rat Race (2001)' 'Robin Hood (2010)'
 'Robin Hood: Prince of Thi

# now we add worldwide_gross (continuous)

In [43]:
df_movies = pd.read_csv('datasets/to_use/movies_cleaned.csv')

In [44]:
df_movies.drop('Unnamed: 0', axis=1, inplace=True)

In [45]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_movies.head())

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,producer,composer,cinematographer,editor,year,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan,(Banned),13+,15,16+,18+,ADV,ADV16,AO,All,Approved,E,E10+,F,G,GA,GP,K-A,M,M/PG,M18,MA-13,MA-17,NC-16,NC-17,NC16,Open,PG,PG-13,PG13,Passed,R,R(A),R21,T,TV-13,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,28884500.0,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']","['Liz Glotzer,David V. Lester,Niki Marvin']",['Thomas Newman'],['Roger Deakins'],['Richard Francis-Bruce'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1006234000.0,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...","['Kevin de la Noy,Jordan Goldberg,Philip Lee,B...","['James Newton Howard,Hans Zimmer']",['Wally Pfister'],['Lee Smith'],2000.0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,836848100.0,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],"['Zakaria Alaoui,John Bernard,Chris Brigham,Jo...",['Hans Zimmer'],['Wally Pfister'],['Lee Smith'],2000.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,101209700.0,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']","['Ross Grayson Bell,Ceán Chaffin,John S. Dorse...","['Dust Brothers,John King,Michael Simpson']",['Jeff Cronenweth'],['James Haygood'],1990.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,213928800.0,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...","['Lawrence Bender,Danny DeVito,Richard N. Glad...",['nan'],['Andrzej Sekula'],['Sally Menke'],1990.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,296,super_high,tier 2 low


In [46]:
df_ratings = pd.read_csv('datasets/to_use/ratings_cleaned.csv')

In [47]:
df_ratings.drop('Unnamed: 0', axis=1, inplace=True)

adding budget and num of rating to df_movies

In [48]:
df_movies_original = df_movies.copy()

In [49]:
df_movies_original

Unnamed: 0,title,runtime,imdb_rating,num_of_rating,budget,worldwide_gross,origin,cast,director,writer,...,TV-Y7,TV-Y7-FV,U/A,Unrated,X,Not Rated,certificate_nan,movieId,num_of_rating_bins,budget_bins
0,The Shawshank Redemption (1994),150.0,9.3,2700000.0,27145000.0,2.888450e+07,United States,"['Tim Robbins,Morgan Freeman,Bob Gunton,Willia...",['Frank Darabont'],"['Stephen King,Frank Darabont']",...,0,0,0,0,0,0,0,318,super_high,medium
1,The Dark Knight (2008),180.0,9.0,2700000.0,200873000.0,1.006234e+09,"United States,United Kingdom","[""Christian Bale,Heath Ledger,Aaron Eckhart,Mi...",['Christopher Nolan'],"['Jonathan Nolan,Christopher Nolan,Christopher...",...,0,0,0,0,0,0,0,58559,super_high,high end
2,Inception (2010),150.0,8.8,2400000.0,173728000.0,8.368481e+08,"United States,United Kingdom","['Leonardo DiCaprio,Joseph Gordon-Levitt,Ellio...",['Christopher Nolan'],['Christopher Nolan'],...,0,0,0,0,0,0,0,79132,super_high,high end
3,Fight Club (1999),150.0,8.8,2100000.0,68405400.0,1.012097e+08,"Germany,United States","[""Edward Norton,Brad Pitt,Meat Loaf,Zach Greni...",['David Fincher'],"['Chuck Palahniuk,Jim Uhls']",...,0,0,0,0,0,0,0,2959,super_high,high end
4,Pulp Fiction (1994),180.0,8.9,2100000.0,8686400.0,2.139288e+08,United States,"['Tim Roth,Amanda Plummer,Laura Lovelace,John ...",['Quentin Tarantino'],"['Quentin Tarantino,Roger Avary,Quentin Tarant...",...,0,0,0,0,0,0,0,296,super_high,tier 2 low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,Doggiewoggiez! Poochiewoochiez! (2012),60.0,6.8,182.0,2171.6,,United States,"['Tim Allen,Michael Badalucco,Jim Belushi,Matt...",['nan'],['nan'],...,0,0,0,0,0,0,1,120134,super_low,indie
22692,Bobbikins (1959),90.0,5.0,182.0,,,United Kingdom,"[""Shirley Jones,Max Bygraves,Billie Whitelaw,B...",['Robert Day'],['Oscar Brodney'],...,0,0,0,0,0,0,0,113682,super_low,
22693,Vallen (2001),120.0,5.8,182.0,,,Belgium,"['Lee Williams,Emma Thomas,Alice Krige,Lydia C...",['Hans Herbots'],"['Hans Herbots,John Paul Chapple,Anne Provoost']",...,0,0,0,0,0,0,1,150064,super_low,
22694,Exclusive Story (1936),90.0,6.0,182.0,,,United States,"[""Franchot Tone,Madge Evans,Stuart Erwin,Josep...",['George B. Seitz'],"['Michael Fessier,Martin Mooney']",...,0,0,0,0,0,1,0,150944,super_low,


In [50]:
df_movies = df_movies_original[['movieId']]

In [51]:
genres_columns = df_movies_original.iloc[:,15:44]
print(genres_columns.head())
df_movies = df_movies.join(genres_columns)

   Action  Adult  Adventure  Animation  Biography  Comedy  Crime  Documentary  \
0       0      0          0          0          0       0      0            0   
1       1      0          0          0          0       0      1            0   
2       1      0          1          0          0       0      0            0   
3       0      0          0          0          0       0      0            0   
4       0      0          0          0          0       0      1            0   

   Drama  Family  ...  Reality-TV  Romance  Sci-Fi  Short  Sport  Talk-Show  \
0      1       0  ...           0        0       0      0      0          0   
1      1       0  ...           0        0       0      0      0          0   
2      0       0  ...           0        0       1      0      0          0   
3      1       0  ...           0        0       0      0      0          0   
4      1       0  ...           0        0       0      0      0          0   

   Thriller  War  Western  genre_nan  

In [52]:
num_rating_columns = pd.get_dummies(df_movies_original.num_of_rating_bins, prefix='num_rating')
print(num_rating_columns.head())
df_movies = df_movies.join(num_rating_columns)

   num_rating_high  num_rating_low  num_rating_medium  num_rating_super_high  \
0                0               0                  0                      1   
1                0               0                  0                      1   
2                0               0                  0                      1   
3                0               0                  0                      1   
4                0               0                  0                      1   

   num_rating_super_low  
0                     0  
1                     0  
2                     0  
3                     0  
4                     0  


In [53]:
budget_columns = pd.get_dummies(df_movies_original.budget_bins, prefix='budget')
print(budget_columns.head())
df_movies = df_movies.join(budget_columns)

   budget_high end  budget_indie  budget_medium  budget_micro budget  \
0                0             0              1                    0   
1                1             0              0                    0   
2                1             0              0                    0   
3                1             0              0                    0   
4                0             0              0                    0   

   budget_tier 1 low  budget_tier 2 low  budget_tier 3 low  
0                  0                  0                  0  
1                  0                  0                  0  
2                  0                  0                  0  
3                  0                  0                  0  
4                  0                  1                  0  


In [54]:
df_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,num_rating_medium,num_rating_super_high,num_rating_super_low,budget_high end,budget_indie,budget_medium,budget_micro budget,budget_tier 1 low,budget_tier 2 low,budget_tier 3 low
0,318,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
1,58559,1,0,0,0,0,0,1,0,1,...,0,1,0,1,0,0,0,0,0,0
2,79132,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,2959,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
4,296,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,120134,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
22692,113682,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22693,150064,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
22694,150944,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


adding worlwide_gross to df_movies

In [55]:
worldwide_gross_column = df_movies_original[['worldwide_gross']]
worldwide_gross_column.head()

Unnamed: 0,worldwide_gross
0,28884500.0
1,1006234000.0
2,836848100.0
3,101209700.0
4,213928800.0


In [56]:
# need to standardise to 0 - 1
from sklearn.preprocessing import MinMaxScaler

In [57]:
scaler = MinMaxScaler()

df_movies[['worldwide_gross']] = scaler.fit_transform(worldwide_gross_column[['worldwide_gross']])

In [58]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_movies.head())

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,genre_nan,num_rating_high,num_rating_low,num_rating_medium,num_rating_super_high,num_rating_super_low,budget_high end,budget_indie,budget_medium,budget_micro budget,budget_tier 1 low,budget_tier 2 low,budget_tier 3 low,worldwide_gross
0,318,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0.009882
1,58559,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.344257
2,79132,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.286306
3,2959,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.034626
4,296,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0.07319


In [59]:
print(get_recomm(1))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
296           0      0          0          0          0       0      1   
5952          1      0          1          0          0       0      0   
6539          1      0          1          0          0       0      0   
6377          0      0          1          1          0       1      0   
7361          0      0          0          0          0       0      0   
3949          0      0          0          0          0       0      0   
2011          0      0          1          0          0       1      0   
8360          0      0          1          1          0       1      0   
6711          0      0          0          0          0       1      0   
2012          0      0          1          0          0       1      0   
1250          0      0          1          0          0       0      0   
2161          0      0          1     

----------- DA GRAND REVEAL ----------------------
49
50 RECOMMENDED MOVIES FOR USER 1 :
['10,000 BC (2008)' '50 First Dates (2004)' 'Almost Famous (2000)'
 'Anastasia (1997)' 'Apocalypto (2006)' 'Australia (2008)'
 'Big Fish (2003)' 'Chef (2014)' 'Click (2006)' 'Cold Mountain (2003)'
 'Dawn of the Planet of the Apes (2014)' 'Dogma (1999)' 'Gladiator (2000)'
 'Hanna (2011)' 'Hugo (2011)' 'Inglourious Basterds (2009)'
 'Interstellar (2014)' 'Jerry Maguire (1996)'
 'King Arthur: Legend of the Sword (2017)' 'King Kong (2005)'
 'Master and Commander: The Far Side of the World (2003)'
 "Miss Peregrine's Home for Peculiar Children (2016)" 'Nebraska (2013)'
 'Okja (2017)' 'Robin Hood (2010)' 'Robin Hood: Prince of Thieves (1991)'
 'Sex and the City (2008)' "Something's Gotta Give (2003)"
 'Star Trek: First Contact (1996)' 'The Book of Eli (2010)'
 'The Break-Up (2006)' 'The Count of Monte Cristo (2002)'
 'The Darjeeling Limited (2007)' 'The Happening (2008)' 'The Host (2013)'
 'The Last of th

In [60]:
print(get_recomm(2))

         Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
movieId                                                                  
318           0      0          0          0          0       0      0   
356           0      0          0          0          0       0      0   
2571          1      0          0          0          0       0      0   
4993          1      0          1          0          0       0      0   
858           0      0          0          0          0       0      1   
...         ...    ...        ...        ...        ...     ...    ...   
534           0      0          0          0          1       0      0   
8010          0      0          0          0          0       0      0   
4535          0      0          1          0          0       0      0   
1465          1      0          0          0          0       0      0   
6311          0      0          1          0          1       0      0   

         Documentary  Drama  Family  

# splitting df_ratings to train-test

In [72]:
from sklearn.model_selection import train_test_split

In [63]:
df_movies

Unnamed: 0,movieId,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,...,num_rating_super_high,num_rating_super_low,budget_high end,budget_indie,budget_medium,budget_micro budget,budget_tier 1 low,budget_tier 2 low,budget_tier 3 low,worldwide_gross
0,318,0,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0.009882
1,58559,1,0,0,0,0,0,1,0,1,...,1,0,1,0,0,0,0,0,0,0.344257
2,79132,1,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0.286306
3,2959,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0.034626
4,296,0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0.073190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22691,120134,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,
22692,113682,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,
22693,150064,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,
22694,150944,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,


In [106]:
print(df_movies[df_movies['movieId'] == 1217])

     movieId  Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
978     1217       1      0          0          0          0       0      0   

     Documentary  Drama  ...  num_rating_super_high  num_rating_super_low  \
978            0      1  ...                      1                     0   

     budget_high end  budget_indie  budget_medium  budget_micro budget  \
978                0             0              0                    0   

     budget_tier 1 low  budget_tier 2 low  budget_tier 3 low  worldwide_gross  
978                  0                  0                  1         0.001425  

[1 rows x 43 columns]


In [130]:
def get_recomm(userId):
    
    #STEP 1
    
    movie_id_df = df_ratings.loc[df_ratings['userId']==userId]
    
    # TRYING TRAIN TEST SPLIT HERE
    ratings_train, ratings_test = train_test_split(movie_id_df, test_size = 0.2, random_state = 69)
    print('----------- START OF TRAIN DATASET ----------------------')
    print(ratings_train)
    print('----------- END OF TRAIN DATASET ----------------------')
    print()
    print('----------- START OF TEST DATASET ----------------------')
    print(ratings_test)
    print('----------- END OF TEST DATASET ----------------------')
    print()
    
    user_movies = df_movies[df_movies['movieId'].isin(ratings_train['movieId'].tolist())]
    user_movies.set_index('movieId', inplace=True)
#     print(user_movies)
#     CHANGE LINE BELOW DEPENDING ON VARIABLES YOU WANNA INCLUDE
#     user_movies = user_movies.iloc[:,15:44] #columns 15-43 are the genres one hot encoded
    print('----------- START OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    print(user_movies)
    print(user_movies.shape)
    print('----------- END OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------')
    
    #STEP 2
#     rating_df=df_ratings.loc[df_ratings['userId']==userId][['movieId','rating']]
    rating_df = ratings_train[['movieId','rating']]
    rating_df.set_index('movieId', inplace=True)
    print('----------- START OF ALL RATINGS BY USER X ----------------------')
    print(rating_df)
    print(rating_df.shape)
    print('----------- END OF ALL RATINGS BY USER X ----------------------')
        
    #Dot produt to get weights
    
    userProfile = user_movies.transpose().dot(rating_df)
    print('----------- START OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    print(userProfile)
    print('----------- END OF WEIGHTED GENRE PREFERENCE OF USER X ----------------------')
    
    #STEP 3
    
    movie_genres = df_movies.copy()
    movie_genres.set_index('movieId', inplace=True)
#     print(movie_genres)
#     movie_genres = movie_genres.iloc[:,15:44]
#     print(movie_genres.transpose())
    
    recommendation_array=movie_genres.dot(userProfile)/(userProfile.sum())
    
#     STANDARDISE RANGE SO IT BECOME 0 - 5
    scaler = MinMaxScaler(feature_range=(0, 5))
#     print(recommendation_array[['rating']].min())
#     print(recommendation_array[['rating']].max())
    recommendation_array[['rating']] = scaler.fit_transform(recommendation_array[['rating']])
    
    print('----------- START OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
    print(recommendation_array)
    print('----------- END OF RECOMMENDATION SCORE OF EVERY MOVIE FOR USER X ----------------------')
    recommendation_array_full = recommendation_array.copy()
#     recommendation_series=pd.Series(recommendation_array).sort_values(ascending=False)
#     recommendation_keys=[keys for keys, value in recommendation_series.items() if value>=0.5 ]'
    recommendation_array.sort_values(by='rating', ascending=False, inplace=True)
    print(recommendation_array.head(50)) # LOOK AT THIS LINE IF YOU WANNA SEE THE PROBLEM (movies with the same genre are given the exact same score)
#   return top 50 recommended
    recommendation_array = recommendation_array.head(50)
#     print(recommendation_array.index.values.tolist())
    rec_movies=df_movies_original.loc[df_movies_original['movieId'].isin(recommendation_array.index.values.tolist())]
    print('----------- START OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    print(rec_movies)
    print('----------- END OF DIRTY TOP 50 RECOMMENDED MOVIES FOR USER X ----------------------')
    
    #STEP 4
    
    user_reviews = df_ratings[df_ratings['userId'] == userId]
    movies_reviews = np.array(user_reviews['movieId'])    
    
    def get_movie_names(movie_ids):
#         '''
#         INPUT
#         movie_ids - a list of movie_ids
#         OUTPUT
#         movies - a list of movie names associated with the movie_ids    
#         '''
        movie_list = list(df_movies_original[df_movies_original['movieId'].isin(movie_ids)]['title'])   
        return movie_list
    
    movie_names = np.array(get_movie_names(movies_reviews))
    rec_movies=np.array(rec_movies['title'])
    recs=np.setdiff1d(rec_movies, movie_names)
    
    print('----------- DA GRAND REVEAL ----------------------')
    print(recs.size)
    print('50 RECOMMENDED MOVIES FOR USER', userId, ":")
#     return recs
    print(recs)

    #STEP 5: USING THE TEST DATASET
    
    print('----------- START OF TEST DATASET ----------------------')
    ratings_test.sort_values(by='movieId', ascending=False, inplace=True)
    ratings_test.drop('userId', axis=1, inplace=True)
    print(ratings_test)
    print('----------- END OF TEST DATASET ----------------------')
    print()
    
    test_movies_list = ratings_test['movieId'].tolist()
    print(test_movies_list)
#     print(recommendation_array_full)
    ratings_pred = recommendation_array_full.copy()
    ratings_pred.reset_index(inplace=True)
    ratings_pred = ratings_pred[ratings_pred['movieId'].isin(test_movies_list)]
    
    print('----------- START OF PREDICTED VALUES OF TEST DATASET ----------------------')
    ratings_pred.sort_values(by='movieId', ascending=False, inplace=True)
    print(ratings_pred)
    print('----------- END OF PREDICTED VALUES OF TEST DATASET ----------------------')
    
    ratings_test = ratings_test['rating'].to_numpy()
#     print(ratings_test)
    
    ratings_pred = ratings_pred['rating'].to_numpy()
#     print(ratings_pred)
    
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import r2_score

    print()
    print('----------- FINAL RECCOMS AND VALIDATION ----------------------')
    mse = mean_squared_error(ratings_test, ratings_pred)
    rmse = np.sqrt(mse)
    print('rmse: ', rmse)
    r_squared = r2_score(ratings_test, ratings_pred)
    print('r squared: ', r_squared)
    print('50 recommended movies for User ', userId, ":")
    return recs

print(get_recomm(1))

----------- START OF TRAIN DATASET ----------------------
          userId  movieId  rating
5429904        1     2012     2.5
981192         1      296     5.0
10028476       1     7327     3.5
10041499       1     7361     5.0
8012996        1     3949     5.0
10227875       1     8360     4.0
5716526        1     2161     3.5
2510218        1      665     5.0
5418991        1     2011     2.5
6359813        1     2573     4.0
9294655        1     5952     4.0
10623028       1    31956     3.5
8611552        1     4703     4.0
9524192        1     6377     4.0
----------- END OF TRAIN DATASET ----------------------

----------- START OF TEST DATASET ----------------------
         userId  movieId  rating
3693178       1     1217     3.5
3927757       1     1250     4.0
9661731       1     6711     5.0
9619843       1     6539     3.5
----------- END OF TEST DATASET ----------------------

----------- START OF ALL MOVIES THAT HAVE BEEN RATED BY USER X ----------------------
         Ac

In [131]:
print(get_recomm(2))

----------- START OF TRAIN DATASET ----------------------
          userId  movieId  rating
4673791        2     1587     1.0
4221288        2     1302     3.0
1144867        2      318     5.0
4842533        2     1682     4.5
6795896        2     2797     1.0
...          ...      ...     ...
6542721        2     2694     4.0
10191033       2     8010     5.0
8842567        2     4993     5.0
9052034        2     5349     5.0
6470188        2     2640     5.0

[80 rows x 3 columns]
----------- END OF TRAIN DATASET ----------------------

----------- START OF TEST DATASET ----------------------
          userId  movieId  rating
8762608        2     4886     5.0
4427282        2     1393     4.0
2106729        2      588     2.0
9817684        2     6947     4.0
4543325        2     1485     3.0
1668171        2      480     2.0
4190051        2     1291     5.0
10173728       2     7624     3.5
8068478        2     3994     3.0
10709938       2    33660     5.0
10477795       2     89