In [1]:
import pandas as pd
import re
from ast import literal_eval
import warnings

In [2]:
ratings = pd.read_csv('ml-latest/ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [4]:
movies = pd.read_csv('ml-latest/movies.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
mv_tags = pd.read_csv('ml-latest/genome-scores.csv')

In [7]:
mv_tags.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


In [8]:
mv_tags_desc = pd.read_csv('ml-latest/genome-tags.csv')

In [9]:
mv_tags_desc.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [10]:
#removing duplicates by title, if found 
movies.drop_duplicates(subset = 'title',keep='first',inplace=True)

In [11]:
#extracting year
movies['year'] =movies['title'].str.extract('.*\((.*)\).*',expand = False)
movies['year'] = movies['year'].str.strip()

In [12]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [13]:
#getting count of people who voted for each movie
votes = ratings[['movieId','rating']].groupby('movieId', as_index=False).sum()
movies['total_votes'] = votes['rating']

In [14]:
#getting average rating for each movie
average_rating = ratings[['movieId','rating']].groupby('movieId', as_index=False).mean()
movies['rating'] = average_rating['rating']

In [15]:
movies.head()

Unnamed: 0,movieId,title,genres,year,total_votes,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,266115.0,3.886649
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,88122.0,3.246583
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,49466.5,3.173981
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,8592.0,2.87454
4,5,Father of the Bride Part II (1995),Comedy,1995,47618.0,3.077291


In [16]:
#deleting rows having atleast one null values
movies.dropna(axis=0, how="any", inplace=True)

In [17]:
#these movies didnt had release year, so manually handled them
movies.loc[movies['movieId'] == 107434,'year'] = 2009
movies.loc[movies['movieId'] == 171749,'year'] = 2006
movies.loc[movies['movieId'] == 165821,'year'] = 2016
movies.loc[movies['movieId'] == 141628,'year'] = 1957
movies.loc[movies['movieId'] == 87442,'year'] = 2010
movies.loc[movies['movieId'] == 79607,'year'] = 1970
movies.loc[movies['movieId'] == 87442,'year'] = 2010
movies.loc[movies['movieId'] == 98063,'year'] = 1983

In [18]:
#reducing size of dataset
movies['movieId'] = movies['movieId'].astype('int32')
movies['title'] = movies['title'].astype('str')
movies['genres'] = movies['genres'].astype('str')
movies['year'] = movies['year'].astype('float32')
movies['total_votes'] = movies['total_votes'].astype('int32')
movies['rating'] = movies['rating'].astype('float32')

In [19]:
def movie_title_clean(title):
  # if ', The' or ', A' is a the end of the string, move it to the front
  # e.g. change "Illusionist, The" to "The Illusionist"
    if title[-5:] == ', The':
        title = 'The ' + title[:-5]
    elif title[-4:] == ', An':
        title = 'An ' + title[:-4]
    elif title[-3:] == ', A':
        title = 'A ' + title[:-3]
    
    return title
movies['title'] = movies['title'].apply(movie_title_clean)

In [20]:
#    v is the number of votes for the movie
#    m is the minimum votes required to be listed in the chart
#    R is the average rating of the movie
#    C is the mean vote across the whole report
C = movies['rating'].mean()
m =  movies['total_votes'].quantile(0.98)
C,m

(3.0686827, 22438.760000000017)

In [21]:
def weighted_rating(x):
    v = x['total_votes']
    R = x['rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [22]:
movies['wr'] = movies.apply(weighted_rating,axis=1)

In [23]:
movies.head()

Unnamed: 0,movieId,title,genres,year,total_votes,rating,wr
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,88122,3.246583,3.210477
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,49466,3.173981,3.141122
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,8592,2.87454,3.014927
4,5,Father of the Bride Part II (1995),Comedy,1995.0,47618,3.077291,3.074534


In [24]:
# merging mv_tags with movies on column tagId
mv_tags_denorm = mv_tags.merge(mv_tags_desc, on = 'tagId').merge(movies, on = 'movieId')

In [25]:
# for each movie, compute the relevance rank of tags so we can eventually rank order tags for each movie
mv_tags_denorm['relevance_rank'] = mv_tags_denorm.groupby("movieId")["relevance"].rank(method = "first", ascending 
                                                                                       = False).astype('int32')

In [57]:
mv_tags_denorm.relevance_rank.value_counts()

1128    13142
379     13142
373     13142
374     13142
375     13142
        ...  
746     13142
745     13142
744     13142
743     13142
1       13142
Name: relevance_rank, Length: 1128, dtype: int64

In [26]:
mv_tags_denorm.head()

Unnamed: 0,movieId,tagId,relevance,tag,title,genres,year,total_votes,rating,wr,relevance_rank
0,1,1,0.029,007,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042,877
1,1,2,0.02375,007 (series),Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042,942
2,1,3,0.05425,18th century,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042,686
3,1,4,0.06875,1920s,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042,606
4,1,5,0.16,1930s,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,266115,3.886649,3.823042,362


In [77]:
# compute median relevance score for each relevance rank
mv_tags_rank_agg = mv_tags_denorm.groupby('relevance_rank')['relevance'].median().reset_index(name
                                                                                = 'relevance_median').head(100)

In [78]:
mv_tags_rank_agg

Unnamed: 0,relevance_rank,relevance_median
0,1,0.981500
1,2,0.956500
2,3,0.929250
3,4,0.903500
4,5,0.878000
...,...,...
95,96,0.307875
96,97,0.305750
97,98,0.303750
98,99,0.301750


In [79]:
# compute percent change of median relevance score as we go down in rank
mv_tags_rank_agg['relevance_median_pct_chg'] = mv_tags_rank_agg['relevance_median'].pct_change()

In [80]:
mv_tags_rank_agg.head()

Unnamed: 0,relevance_rank,relevance_median,relevance_median_pct_chg
0,1,0.9815,
1,2,0.9565,-0.025471
2,3,0.92925,-0.028489
3,4,0.9035,-0.027711
4,5,0.878,-0.028224


In [81]:
mv_tags_list = mv_tags_denorm[mv_tags_denorm.relevance_rank <=
            100].groupby(['movieId','title','rating','wr'])['tag'].apply(lambda x: ' ,'.join(x)).reset_index()

In [82]:
mv_tags_list['tag_list'] = mv_tags_list.tag.map(lambda x: x.split(','))

In [83]:
mv_tags_list.drop('tag',1,inplace=True)

In [84]:
mv_tags_list.shape

(13142, 5)

In [27]:
mov_id  = mv_tags_list['movieId']

In [28]:
ratings1 = ratings.loc[ratings['movieId'].isin(mov_id)&(ratings['rating']>3)]

In [55]:
ratings1.shape

(17072150, 4)

In [30]:
ratings.shape

(27753444, 4)

In [72]:
pd.reset_option('display.max_colwidth')

target_movie = 'Married Life'

#target_tag_list contains the tags of target movie
target_tag_list = mv_tags_list[mv_tags_list.title.str.contains(target_movie)].tag_list.values[0]

#mv_tags_list_sim is anew table prepared from mv_tags_list with given column names
mv_tags_list_sim = mv_tags_list[['movieId','title','tag_list','rating','wr']]

#mv_tags_list_sim['jaccard_sim'] will hold the jaccard similarity of any 2 values

mv_tags_list_sim['jaccard_sim'] = mv_tags_list_sim['tag_list'].apply(lambda x:
                        len(set(x).intersection(set(target_tag_list))) / len(set(x).union(set(target_tag_list))))

print(f'Movies most similar to {target_movie} based on tags:')

#Sorting mv_tags_list_sim by jaccard similarity
mv_tags_list_sim.sort_values(by = 'jaccard_sim', ascending = False).head(10)

Movies most similar to Married Life based on tags:


Unnamed: 0,movieId,title,tag_list,rating,wr,jaccard_sim
9209,58494,Married Life (2007),"[adaptation , adapted from:book , adultery , a...",4.173756,4.055213,1.0
266,278,Miami Rhapsody (1995),"[adaptation , adapted from:book , adultery , b...",3.022477,3.062351,0.37931
4307,4782,Sidewalks of New York (2001),"[adaptation , addiction , adultery , affection...",3.17732,3.075664,0.369863
4579,5081,Birthday Girl (2001),"[adaptation , alternate endings , australia , ...",3.088038,3.07048,0.360544
1566,1770,B. Monkey (1998),"[adaptation , addiction , amy smart , based on...",2.859649,3.06278,0.360544
274,287,Nina Takes a Lover (1994),"[adaptation , adapted from:book , alternate en...",3.244012,3.076755,0.351351
9382,60950,Vicky Cristina Barcelona (2008),"[absurd , adultery , art , artist , artistic ,...",2.614286,3.066847,0.351351
7027,8841,Seeing Other People (2004),"[adaptation , addiction , adultery , bdsm , be...",3.192661,3.070576,0.342282
2818,3125,"End of the Affair, The (1999)","[adaptation , adapted from:book , adultery , a...",3.598009,3.160337,0.342282
8177,38994,Separate Lies (2005),"[adaptation , adultery , based on a book , bas...",3.0,3.068637,0.342282


In [29]:
# from surprise import Reader, Dataset, SVD

In [30]:
# Load the movielens-100k dataset
# reader = Reader(line_format='user item rating', sep=',', rating_scale = (0,5))

In [31]:
# data = Dataset.load_from_df(ratings1[['userId', 'movieId', 'rating']], reader)

In [32]:
# algo = SVD()

In [33]:
 # Retrieve the trainset.
# trainset = data.build_full_trainset()

In [34]:
# algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fdb09cceb80>

In [29]:
import pickle

In [78]:
# file = open('test.pkl','wb')
# pickle.dump(algo,file)
# file.close()

In [31]:
file = open('test.pkl','rb')
algo = pickle.load(file)
file.close()

In [34]:
algo.predict(169,302)

Prediction(uid=169, iid=302, r_ui=None, est=4.364286494940225, details={'was_impossible': False})

In [73]:
def hybrid(userId, title):
    
    target_tag_list = mv_tags_list[mv_tags_list.title.str.contains(title)].tag_list.values[0]

    mv_tags_list_sim = mv_tags_list[['movieId','title','tag_list','rating','wr']]

    mv_tags_list_sim['jaccard_sim'] = mv_tags_list_sim.tag_list.map(lambda x: 
                        len(set(x).intersection(set(target_tag_list))) / len(set(x).union(set(target_tag_list))))

    sim_scores =  mv_tags_list_sim.sort_values(by = 'jaccard_sim', ascending = False).head(10)
    
    movie_indices = [i for i in  sim_scores['movieId']]
    
    rec_mov = movies.loc[movies['movieId'].isin(movie_indices)]
    
    rec_mov['est'] =[ algo.predict(userId,x).est for x in movie_indices]
    
    rec_mov = rec_mov.sort_values('est', ascending=False)
    
    return rec_mov.head(10)

In [74]:
hybrid(165,'Married Life' )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mov['est'] =[ algo.predict(userId,x).est for x in movie_indices]


Unnamed: 0,movieId,title,genres,year,total_votes,rating,wr,est
4986,5081,Birthday Girl (2001),Drama|Romance,2001.0,2297,3.088038,3.07048,4.418494
12552,58494,Married Life (2007),Crime|Drama|Romance,2007.0,186738,4.173756,4.055213,4.394624
10511,38994,Separate Lies (2005),Drama|Romance|Thriller,2005.0,15,3.0,3.068637,4.380718
8158,8841,Seeing Other People (2004),Comedy,2004.0,348,3.192661,3.070576,4.305002
4687,4782,Sidewalks of New York (2001),Comedy|Romance,2001.0,1541,3.17732,3.075664,4.291306
12934,60950,Vicky Cristina Barcelona (2008),Comedy|Drama|Romance,2008.0,91,2.614286,3.066847,4.22839
284,287,Nina Takes a Lover (1994),Comedy|Romance,1994.0,1083,3.244012,3.076755,4.123144
1701,1770,B. Monkey (1998),Crime|Romance|Thriller,1998.0,652,2.859649,3.06278,4.105139
275,278,Miami Rhapsody (1995),Comedy,1995.0,3563,3.022477,3.062351,4.065165
3039,3125,"End of the Affair, The (1999)",Drama,1999.0,4699,3.598009,3.160337,3.940213


In [59]:
p = mv_tags_list[mv_tags_list['tag'].str.contains('Romance',case=False)]

In [61]:
p.sort_values(['wr'], ascending=[False]).head(10)

Unnamed: 0,movieId,title,rating,wr,tag,tag_list
9209,58494,Married Life (2007),4.173756,4.055213,"adaptation ,adapted from:book ,adultery ,alter...","[adaptation , adapted from:book , adultery , a..."
823,912,Casablanca (1942),4.210098,4.043084,"adultery ,affectionate ,afi 100 ,afi 100 (movi...","[adultery , affectionate , afi 100 , afi 100 (..."
1078,1198,Raiders of the Lost Ark (Indiana Jones and the...,4.120455,4.037387,"1930s ,action ,action packed ,adventure ,archa...","[1930s , action , action packed , adventure , ..."
1077,1197,"Princess Bride, The (1987)",4.124807,4.005902,"absurd ,action ,adaptation ,adapted from:book ...","[absurd , action , adaptation , adapted from:b..."
4482,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",4.117255,3.983161,"affectionate ,amazing cinematography ,art ,art...","[affectionate , amazing cinematography , art ,..."
819,908,North by Northwest (1959),4.201091,3.958711,"007 (series) ,action ,adapted from:book ,adven...","[007 (series) , action , adapted from:book , a..."
106,110,Braveheart (1995),4.008481,3.937771,"action ,action packed ,adventure ,amazing cine...","[action , action packed , adventure , amazing ..."
6382,7361,Eternal Sunshine of the Spotless Mind (2004),4.07348,3.937059,"alternate reality ,amazing cinematography ,amn...","[alternate reality , amazing cinematography , ..."
814,903,Vertigo (1958),4.119519,3.874626,"70mm ,adapted from:book ,afi 100 (movie quotes...","[70mm , adapted from:book , afi 100 (movie quo..."
1147,1270,Back to the Future (1985),3.929999,3.852188,"1950s ,1980s ,80s ,action ,adventure ,alternat...","[1950s , 1980s , 80s , action , adventure , al..."


In [62]:
p = movies[movies['genres'].str.contains('Romance',case=False)]

In [64]:
p.sort_values(['wr'], ascending=[False]).head(10)

Unnamed: 0,movieId,title,genres,year,total_votes,rating,wr
12552,58494,Married Life (2007),Crime|Drama|Romance,2007.0,186738,4.173756,4.055213
895,912,Casablanca (1942),Drama|Romance,1942.0,130913,4.210098,4.043084
2773,2858,American Beauty (1999),Drama|Romance,1999.0,250670,4.121506,4.035006
1172,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1987.0,176863,4.124807,4.005902
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1994.0,393651,4.056585,4.00331
4878,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance,2001.0,153026,4.117255,3.983161
2240,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,1997.0,112385,4.163178,3.981021
1645,1704,Good Will Hunting (1997),Drama|Romance,1997.0,174343,4.076387,3.96148
891,908,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller,1959.0,82396,4.201091,3.958711
7250,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,2004.0,142832,4.07348,3.937059
