# Set Up

In [63]:
import pandas as pd
import numpy as np
import time
import datetime

In [115]:
df_links = pd.read_csv('links.csv')
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')
df_tags = pd.read_csv('tags.csv')

# EDA

### Ratings

In [37]:
# check if user only rated on a movie once
df_ratings[df_ratings.duplicated(subset=['userId','movieId'], keep=False)]

Unnamed: 0,userId,movieId,rating,timestamp


In [40]:
df_ratings.groupby('movieId')['rating'].agg(['mean', 'count']).head()

Unnamed: 0_level_0,mean,count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.92093,215
2,3.431818,110
3,3.259615,52
4,2.357143,7
5,3.071429,49


In [39]:
df_ratings['rating'].value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

### Movies

In [56]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [55]:
df_movies.genres.value_counts()

Drama                                                1053
Comedy                                                946
Comedy|Drama                                          435
Comedy|Romance                                        363
Drama|Romance                                         349
                                                     ... 
Action|Adventure|Comedy|Crime|Mystery|Thriller          1
Children|Drama|Romance                                  1
Action|Adventure|Children|Mystery|Sci-Fi                1
Comedy|Drama|Mystery|Thriller                           1
Adventure|Animation|Children|Comedy|Drama|Fantasy       1
Name: genres, Length: 951, dtype: int64

In [62]:
genre_type = set()
for i in df_movies['genres']:
    genre_type.up(i.split('|'))
    
genre_type

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

# Top k Popular Movies

In [52]:
def getTopKMovies(df_ratings, df_movies, start_date="20170923", end_date="20180923", k=10, min_rating=4):
    start_time_stamp = time.mktime(datetime.datetime.strptime(start_date,"%Y%m%d").timetuple())
    end_time_stamp = time.mktime(datetime.datetime.strptime(end_date,"%Y%m%d").timetuple())
    df_temp = df_ratings.loc[(df_ratings['timestamp']>=start_time_stamp) & (df_ratings['timestamp']<=end_time_stamp)].groupby('movieId').agg({'rating':['count','mean']}).reset_index()
    df_temp.columns = ['movieId','Total Rating', 'Average Rating']
    df_temp = df_temp.loc[df_temp['Average Rating']>=min_rating].sort_values(by=('Total Rating'), ascending=False).head(k)
  
    df_temp = pd.merge(df_temp,df_movies,left_on='movieId',right_on='movieId')
    
    return df_temp

getTopKMovies(df_ratings, df_movies)

Unnamed: 0,movieId,Total Rating,Average Rating,title,genres
0,356,24,4.083333,Forrest Gump (1994),Comedy|Drama|Romance|War
1,79132,23,4.108696,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
2,2571,23,4.086957,"Matrix, The (1999)",Action|Sci-Fi|Thriller
3,7153,21,4.02381,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4,318,21,4.52381,"Shawshank Redemption, The (1994)",Crime|Drama
5,58559,20,4.225,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
6,2959,20,4.025,Fight Club (1999),Action|Crime|Drama|Thriller
7,122916,20,4.025,Thor: Ragnarok (2017),Action|Adventure|Sci-Fi
8,5952,20,4.15,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
9,4993,19,4.131579,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy


# One Hot Encode genres

In [97]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [104]:
ohe_genre = pd.get_dummies(df_movies['genres'].str.split('|', expand = True).stack()).groupby(level=0).sum().drop(columns=['(no genres listed)'])
df_movies = pd.concat([df_movies, ohe_genre], axis=1)
df_movies.head()

Unnamed: 0,movieId,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [227]:
def getSimilarKMovies(df_ratings, df_movies, movieId=1, k=10, min_rating=3, min_users=10):
    
    # one hot encode genre alternative
    # ohe_genre = pd.get_dummies(df_movies['genres'].str.split('|', expand = True).stack()).groupby(level=0).sum().drop(columns=['(no genres listed)'])
    df_temp = df_movies.copy()
    genre_set = set()
    for i in list(df_temp['genres'].str.split('|')):
        genre_set |= set(i)
    genre_set.remove('(no genres listed)')
    for g in genre_set:
        df_temp['genre_ind_{}'.format(g)] = df_temp['genres'].str.contains(g).astype(int)
    df_temp = df_temp[[i for i in df_temp.columns if 'genre_' in i]]
    
    np_temp = np.array(df_temp)
    np_temp = np_temp/np.sqrt(np.square(np_temp).sum(axis=1)).reshape(-1,1) #normalise
    cos_mtx = np.dot(np_temp,np_temp.T)  #cosine matrix

    mindex = df_movies.loc[df_movies['movieId']==movieId].index[0]
    df_similarity = df_movies.copy()
    df_similarity['cos_score'] = cos_mtx[mindex,:]
    
    df_temp = df_ratings.groupby('movieId').agg({'rating':['count','mean']}).reset_index()
    df_temp.columns = ['movieId','total_rating','average_rating']
    
    df_result = pd.merge(df_similarity,df_temp,on='movieId',how='left')
    
    
    df_result = df_result.loc[(df_result['total_rating']>=min_users)&(df_result['average_rating']>=min_rating)].sort_values(['cos_score', 'total_rating','average_rating'],ascending=False).head(k)
    
    return df_result

In [230]:
getSimilarKMovies(df_ratings, df_movies, movieId=1)

  np_temp = np_temp/np.sqrt(np.square(np_temp).sum(axis=1)).reshape(-1,1) #normalise


Unnamed: 0,movieId,title,genres,cos_score,total_rating,average_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,215.0,3.92093
3568,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,1.0,132.0,3.871212
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,1.0,97.0,3.860825
1706,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,1.0,45.0,3.244444
3000,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,1.0,37.0,3.716216
6486,53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy,1.0,21.0,3.02381
9430,166461,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy,1.0,10.0,3.45
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,0.912871,170.0,3.867647
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.912871,55.0,4.109091
8900,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,0.912871,43.0,3.813953
