In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')
from surprise import SVD, Reader
from surprise import Dataset
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.model_selection import cross_validate

## Importing the Dataset

In [2]:
movie_credits = pd.read_csv('credits.csv')
movie_keywords = pd.read_csv('keywords.csv')
movie_links_s = pd.read_csv('links_small.csv')
movie_metadata = pd.read_csv('movies_metadata.csv')
movie_ratings = pd.read_csv('ratings_small.csv')

## Simple recommendation system

In [3]:
movie_metadata['genres'] = movie_metadata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i[
    'name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_cnt = movie_metadata[movie_metadata['vote_count'].notnull()]['vote_count'].astype('int')

vote_avg = movie_metadata[movie_metadata['vote_average'].notnull()]['vote_average'].astype('int')

C = vote_avg.mean()
C

5.244896612406511

In [5]:
m = vote_cnt.quantile(0.95)
m

434.0

In [6]:
movie_metadata['year'] = pd.to_datetime(movie_metadata['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [7]:
mo = movie_metadata[(movie_metadata['vote_count'] >= m) & 
               (movie_metadata['vote_count'].notnull()) & 
               (movie_metadata['vote_average'].notnull())][['title', 
                                                'year', 
                                                'vote_count', 
                                                'vote_average', 
                                                'popularity', 
                                                'genres']]

mo['vote_count'] = mo['vote_count'].astype('int')
mo['vote_average'] = mo['vote_average'].astype('int')
mo.shape

(2274, 6)

* Therefore, to qualify to be considered for the chart, a movie has to have at least __434 votes__ on TMDB. 
* We also see that the __average rating__ for __a movie on TMDB__ is __5.244 on a scale of 10__. 
* Here, only __2274 movies__ are qualify to be on our chart.

In [8]:
def w_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
mo['wr'] = mo.apply(w_rating, axis=1)

In [10]:
mo = mo.sort_values('wr', ascending=False).head(250)

## Displaying the popular 5 movies

In [13]:
mo.head(5)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787


In [15]:
'''
>>> s
     a   b
one  1.  2.
two  3.  4.

>>> s.stack()
one a    1
    b    2
two a    3
    b    4
'''
z = movie_metadata.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
z.name = 'genre'
general_metadata = movie_metadata.drop('genres', axis=1).join(z)
general_metadata.head(3).transpose()

Unnamed: 0,0,0.1,0.2
adult,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ..."
budget,30000000,30000000,30000000
homepage,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story
id,862,862,862
imdb_id,tt0114709,tt0114709,tt0114709
original_language,en,en,en
original_title,Toy Story,Toy Story,Toy Story
overview,"Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ..."
popularity,21.946943,21.946943,21.946943


In [16]:
def genre_rec(genre, percentile=0.85):
    d_frames = general_metadata[general_metadata['genre'] == genre]
    vote_counts = d_frames[d_frames['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = d_frames[d_frames['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    reco = d_frames[(d_frames['vote_count'] >= m) & (d_frames['vote_count'].notnull()) & 
                   (d_frames['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    reco['vote_count'] = reco['vote_count'].astype('int')
    reco['vote_average'] = reco['vote_average'].astype('int')
    
    reco['wr'] = reco.apply(lambda x: 
                        (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C),
                        axis=1)
    reco = reco.sort_values('wr', ascending=False).head(250)
    
    return reco

## Displaying top 5 romance movies

In [17]:
genre_rec('Romance').head(5)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154


## Content based recommendation system

In [18]:
movie_links_s = movie_links_s[movie_links_s['tmdbId'].notnull()]['tmdbId'].astype('int')

In [19]:
def toINT(a):
    try:
        return int(a)
    except:
        return np.nan


In [20]:
movie_metadata['id'] = movie_metadata['id'].apply(toINT)
movie_metadata[movie_metadata['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,,NaT
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT


In [21]:
movie_metadata = movie_metadata.drop([19730, 29503, 35587])

In [22]:
movie_metadata['id'] = movie_metadata['id'].astype('int')

In [24]:
small_mov = movie_metadata[movie_metadata['id'].isin(movie_links_s)]
small_mov.shape


(9099, 25)

###  Content based recommendation system

In [25]:
small_mov['tagline'] = small_mov['tagline'].fillna('')
small_mov['description'] = small_mov['overview'] + small_mov['tagline']
small_mov['description'] = small_mov['description'].fillna('')

In [26]:
tf_vector = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_vect_matrix = tf_vector.fit_transform(small_mov['description'])

In [27]:
tfidf_vect_matrix.shape

(9099, 268124)

In [28]:
cosine_sim = linear_kernel(tfidf_vect_matrix, tfidf_vect_matrix)

In [29]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [30]:
small_mov = small_mov.reset_index()
movie_titles = small_mov['title']
movie_indices = pd.Series(small_mov.index, index=small_mov['title'])

In [33]:
def recommends(x):
    index = movie_indices[x]
    scr = list(enumerate(cosine_sim[index]))
    scr = sorted(scr, key=lambda x: x[1], reverse=True)
    scr = scr[1:31]
    mi = [i[0] for i in scr]
    return movie_titles.iloc[mi]

In [34]:
recommends('Made').head(5)

4196       Johnny Dangerously
3108       The Way of the Gun
618                   Thinner
8387               The Family
6201    The Constant Gardener
Name: title, dtype: object

In [35]:
recommends('JFK').head(5)

7242     The File on Thelma Jordon
5987    A Love Song for Bobby Long
1135      Night Falls on Manhattan
4489                         Q & A
8680             The Young Savages
Name: title, dtype: object

### Content based Recommendation System with movie description, taglines, keywords, cast, director and genres

In [36]:
movie_keywords['id'] = movie_keywords['id'].astype('int')
movie_credits['id'] = movie_credits['id'].astype('int')
movie_metadata['id'] = movie_metadata['id'].astype('int')

In [38]:
movie_metadata = movie_metadata.merge(movie_credits, on='id')
movie_metadata = movie_metadata.merge(movie_keywords, on='id')

In [41]:
small_mov = movie_metadata[movie_metadata['id'].isin(movie_links_s)]
small_mov.shape


(9219, 28)

In [42]:
small_mov['cast'] = small_mov['cast'].apply(literal_eval)
small_mov['crew'] = small_mov['crew'].apply(literal_eval)
small_mov['keywords'] = small_mov['keywords'].apply(literal_eval)
small_mov['cast_size'] = small_mov['cast'].apply(lambda x: len(x))
small_mov['crew_size'] = small_mov['crew'].apply(lambda x: len(x))

In [43]:
def dirc(a):
    for i in a:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [44]:
small_mov['director'] = small_mov['crew'].apply(dirc)
small_mov['cast'] = small_mov['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_mov['cast'] = small_mov['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
small_mov['keywords'] = small_mov['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [45]:
small_mov['cast'] = small_mov['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
small_mov['director'] = small_mov['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
small_mov['director'] = small_mov['director'].apply(lambda x: [x,x, x])

In [47]:
q = small_mov.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
q.name = 'keyword'
q = q.value_counts()
q[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [48]:
q = q[q > 1]

In [51]:
def fil(x):
    words = []
    for i in x:
        if i in q:
            words.append(i)
    return words

In [53]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [54]:
small_mov['keywords'] = small_mov['keywords'].apply(fil)
small_mov['keywords'] = small_mov['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_mov['keywords'] = small_mov['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [55]:
small_mov['soup'] = small_mov['keywords'] + small_mov['cast'] + small_mov['director'] + small_mov['genres']
small_mov['soup'] = small_mov['soup'].apply(lambda x: ' '.join(x))

In [56]:
cv = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_vect_matrix = cv.fit_transform(small_mov['soup'])

In [57]:
cosine_sim = cosine_similarity(count_vect_matrix, count_vect_matrix)

In [58]:
small_mov = small_mov.reset_index()
movie_titles = small_mov['title']
movie_indices = pd.Series(small_mov.index, index=small_mov['title'])

In [59]:
recommends('Inception').head(5)

6623             The Prestige
3381                  Memento
4145                 Insomnia
2085                Following
8031    The Dark Knight Rises
Name: title, dtype: object

#### Adding the system with Popularity and Ratings 

In [66]:
def recommends_improve(a):
    x = movie_indices[a]
    similarilities = list(enumerate(cosine_sim[x]))
    similarilities = sorted(similarilities, key=lambda x: x[1], reverse=True)
    similarilities = similarilities[1:26]
    idx = [i[0] for i in similarilities]
    
    movies = small_mov.iloc[idx][['title', 'vote_count', 'vote_average', 'year']]
    v_cnts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    v_avg = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = v_avg.mean()
    m = v_cnts.quantile(0.60)
    temp = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & 
                       (movies['vote_average'].notnull())]
    temp['vote_count'] = temp['vote_count'].astype('int')
    temp['vote_average'] = temp['vote_average'].astype('int')
    temp['wr'] = temp.apply(w_rating, axis=1)
    temp = temp.sort_values('wr', ascending=False).head(10)
    return temp

In [67]:
recommends_improve('Interstellar')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
6981,The Dark Knight,12269,8,2008,7.905871
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
8983,The Martian,7442,7,2015,6.903287
756,2001: A Space Odyssey,3075,7,1968,6.782925
8384,Oblivion,4862,6,2013,5.93812
8854,Terminator Genisys,3677,5,2015,5.025854


### Hybrid recommendation system

In [83]:
reader = Reader()
data = Dataset.load_from_df(movie_ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=5)


{'test_rmse': array([0.89712361, 0.89184355, 0.89992406, 0.89664172, 0.89977746]),
 'test_mae': array([0.69087467, 0.68303382, 0.69184999, 0.68991342, 0.69561501]),
 'fit_time': (10.015995979309082,
  10.20353889465332,
  12.074610710144043,
  10.020551443099976,
  9.938471555709839),
 'test_time': (0.6800813674926758,
  0.3853936195373535,
  0.33699965476989746,
  0.28899526596069336,
  0.2919943332672119)}

In [84]:
def toINT(x):
    try:
        return int(x)
    except:
        return np.nan

In [85]:
mov_id = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
mov_id['tmdbId'] = mov_id['tmdbId'].apply(toINT)
mov_id.columns = ['movieId', 'id']
mov_id = mov_id.merge(small_mov[['title', 'id']], on='id').set_index('title')

In [86]:
mapper = mov_id.set_index('id')

In [89]:
def hyd(Id, name):
    index = movie_indices[name]
    tmdbId = mov_id.loc[name]['id']
    movie_id = mov_id.loc[name]['movieId']
    mov_rate = list(enumerate(cosine_sim[int(index)]))
    mov_rate = sorted(mov_rate, key=lambda x: x[1], reverse=True)
    mov_rate = mov_rate[1:26]
    idx = [i[0] for i in mov_rate]
    movies = small_mov.iloc[idx][['title', 'vote_count', 'vote_average', 'release_date', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(Id, mapper.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [90]:
hyd(1, 'Aliens')

Unnamed: 0,title,vote_count,vote_average,release_date,id,est
522,Terminator 2: Judgment Day,4274.0,7.7,1991-07-01,280,3.321079
1011,The Terminator,4208.0,7.4,1984-10-26,218,3.206613
922,The Abyss,822.0,7.1,1989-08-09,2756,3.073257
987,Alien,4564.0,7.9,1979-05-25,348,3.032888
3935,Impostor,136.0,6.1,2001-12-03,4965,2.881922
7828,I Am Number Four,1606.0,5.9,2011-02-18,46529,2.829366
7498,Daybreakers,646.0,6.0,2009-01-06,19901,2.825574
6640,Déjà Vu,1519.0,6.6,2006-11-22,7551,2.77906
6967,Doomsday,374.0,5.8,2008-03-14,13460,2.778277
344,True Lies,1138.0,6.8,1994-07-14,36955,2.774729
