In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
#from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.wordnet import WordNetLemmatizer
#from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
mmd = pd. read_csv('movies_metadata.csv')

In [3]:
mmd['genres'] = mmd['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_counts = mmd[mmd['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = mmd[mmd['vote_average'].notnull()]['vote_average'].astype('int')

In [5]:
mmd['year'] = pd.to_datetime(mmd['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


In [6]:
#drop the columns we don't need
#mmd.drop(['belongs_to_collection','budget','homepage','imdb_id','runtime','poster_path','production_companies','production_countries','revenue','title','video'], axis=1, inplace=True)

In [7]:
mmd.count()

adult                    45466
belongs_to_collection     4494
budget                   45466
genres                   45466
homepage                  7782
id                       45466
imdb_id                  45449
original_language        45455
original_title           45466
overview                 44512
popularity               45461
poster_path              45080
production_companies     45463
production_countries     45463
release_date             45379
revenue                  45460
runtime                  45203
spoken_languages         45460
status                   45379
tagline                  20412
title                    45460
video                    45460
vote_average             45460
vote_count               45460
year                     45466
dtype: int64

In [9]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [10]:
#drop the rows with ?
mmd = mmd.drop([19730, 29503, 35587])

In [11]:
#test if we drop the rows successfully
mmd.count()

adult                    45463
belongs_to_collection     4491
budget                   45463
genres                   45463
homepage                  7779
id                       45463
imdb_id                  45446
original_language        45452
original_title           45463
overview                 44509
popularity               45460
poster_path              45077
production_companies     45460
production_countries     45460
release_date             45376
revenue                  45460
runtime                  45203
spoken_languages         45460
status                   45379
tagline                  20412
title                    45460
video                    45460
vote_average             45460
vote_count               45460
year                     45463
dtype: int64

In [12]:
list(mmd)

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'year']

In [13]:
# check EDA Notebook for how and why we got these indices.
mmd['id'] = mmd['id'].astype('int')


In [14]:
smd = mmd[mmd['id'].isin(links_small)]
smd.shape


(9099, 25)

In [15]:
#Movie Description Based Recommender

In [16]:
#use overview and tagline to form a discription
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [17]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [18]:
tfidf_matrix.shape

(9099, 268124)

In [19]:
# Cosine Similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [20]:
smd = smd.reset_index()
titles = smd['original_title']
indices = pd.Series(smd.index, index=smd['original_title'])

In [21]:
# Define a function to give the 10 most similar movies to the given one
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [22]:
#run the recommendation function and display the results
get_recommendations('The Godfather')

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29                    摇啊摇，摇到外婆桥
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                   8 femmes
2159              Summer of Sam
Name: original_title, dtype: object

In [23]:
get_recommendations('The Dark Knight').head()

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
Name: original_title, dtype: object

In [24]:
get_recommendations('Jumanji').head()

8889                     Pixels
8608    Guardians of the Galaxy
6392                 Stay Alive
8154             Wreck-It Ralph
3196         Dungeons & Dragons
Name: original_title, dtype: object

In [25]:
## Metadata Based Recommender

In [26]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [27]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
mmd['id'] = mmd['id'].astype('int')

In [28]:
mmd.shape

(45463, 25)

In [29]:
#merge the metadata with credits and keywords to combine cast, crew, genres and credits, all in one dataframe.

In [30]:
meta_cd = mmd.merge(credits, on='id')
meta_cd_kw = meta_cd.merge(keywords, on='id')

In [31]:
s_meta_cd_kw = meta_cd_kw[meta_cd_kw['id'].isin(links_small)]
s_meta_cd_kw.shape

(9219, 28)

In [32]:
#Crew: From the crew, we only pick the director as our feature since the others don't contribute that much to the feel of the movie.
#Cast: Lesser known actors and minor roles do not really affect people's opinion of a movie. 
#So we only select the major characters and their respective actors. We choose the top 3 actors that appear in the cast column.

In [33]:
s_meta_cd_kw['cast'] = s_meta_cd_kw['cast'].apply(literal_eval)
s_meta_cd_kw['crew'] = s_meta_cd_kw['crew'].apply(literal_eval)
s_meta_cd_kw['keywords'] = s_meta_cd_kw['keywords'].apply(literal_eval)
s_meta_cd_kw['cast_size'] = s_meta_cd_kw['cast'].apply(lambda x: len(x))
s_meta_cd_kw['crew_size'] = s_meta_cd_kw['crew'].apply(lambda x: len(x))

In [34]:
#Define a function to select the directors in crew
def get_director(x):
    
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [35]:
s_meta_cd_kw['director'] = s_meta_cd_kw['crew'].apply(get_director)

In [36]:
s_meta_cd_kw['cast'] = s_meta_cd_kw['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
s_meta_cd_kw['cast'] = s_meta_cd_kw['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [37]:
s_meta_cd_kw['keywords'] = s_meta_cd_kw['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [38]:
#Strip Spaces and Convert to Lowercase from all our features.
#Mention Director 3 times to give it more weight relative to the entire cast.

In [39]:
s_meta_cd_kw['cast'] = s_meta_cd_kw['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [40]:
s_meta_cd_kw['director'] = s_meta_cd_kw['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
s_meta_cd_kw['director'] = s_meta_cd_kw['director'].apply(lambda x: [x,x, x])

In [41]:
#keywords preprocessing

In [42]:
#calculate the frequenct counts of every keyword that appears in the dataset
s = s_meta_cd_kw.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [43]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [44]:
# remove the words with extremely low frequency, we removed with frequecy at 1
s = s[s > 1]

In [74]:
#convert words to their stems in order that they can be treated as the same.
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

NameError: name 'SnowballStemmer' is not defined

In [45]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [None]:
s_meta_cd_kw['keywords'] = s_meta_cd_kw['keywords'].apply(filter_keywords)
s_meta_cd_kw['keywords'] = s_meta_cd_kw['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
s_meta_cd_kw['keywords'] = s_meta_cd_kw['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [None]:
s_meta_cd_kw['soup'] = s_meta_cd_kw['keywords'] + s_meta_cd_kw['cast'] + s_meta_cd_kw['director'] + s_meta_cd_kw['genres']
s_meta_cd_kw['soup'] = s_meta_cd_kw['soup'].apply(lambda x: ' '.join(x))

In [None]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(s_meta_cd_kw['soup'])

In [None]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
s_meta_cd_kw = s_meta_cd_kw.reset_index()
titles = s_meta_cd_kw['original_title']
indices = pd.Series(s_meta_cd_kw.index, index=s_meta_cd_kw['original_title'])

In [None]:
# test the recommadation

In [None]:
get_recommendations('The Dark Knight').head()

In [None]:
get_recommendations('Mean Girls').head(10)

In [None]:
# popularity and ratings

In [None]:
# top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = s_meta_cd_kw.iloc[movie_indices][['original_title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [None]:
#test the function
improved_recommendations('The Dark Knight')

In [None]:
improved_recommendations('Mean Girls')