# Import libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

import warnings; warnings.simplefilter('ignore')

# Load data of movies downloaded from https://www.kaggle.com/rounakbanik/the-movies-dataset/data

In [2]:
credits = pd.read_csv('archive/credits.csv')
keywords = pd.read_csv('archive/keywords.csv')
links_small = pd.read_csv('archive/links_small.csv')
md = pd.read_csv('archive/movies_metadata.csv')
ratings = pd.read_csv('archive/ratings_small.csv')

# Understanding data

## Features

In [3]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [4]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [5]:
links_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9125 non-null   int64  
 1   imdbId   9125 non-null   int64  
 2   tmdbId   9112 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 214.0 KB


In [6]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [8]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [9]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [10]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [11]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [12]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# Simple recommendation system
## Approach:

### The Simple Recommender offers generalized recommendations to every user based on movie popularity and (sometimes) genre.

### The basic idea behind this recommender is that movies that are more popular and more critically acclaimed will have a higher probability of being liked by the average audience.

### This model does not give personalized recommendations based on the user.

# Unwrangle genre names from genres feature

In [13]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Remove outliers

In [14]:
md.vote_count.quantile(0.95)

434.0

In [15]:
md['year'] = pd.to_datetime(md.release_date, errors='coerce').dt.year # get year from release date

In [16]:
cleaned_data = md.loc[(md.vote_count>md.vote_count.quantile(0.95)) & 
       (md.vote_count.notnull()) & 
       (md.vote_average.notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]

# Calculate weighted rating

In [17]:
C = md.vote_average.mean()
m = md.vote_count.quantile(0.95)

In [18]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [19]:
cleaned_data['wr'] = cleaned_data.apply(weighted_rating, axis=1)

In [20]:
cleaned_data.sort_values('wr', ascending=False).head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
314,The Shawshank Redemption,1994.0,8358.0,8.5,51.645403,"[Drama, Crime]",8.357746
834,The Godfather,1972.0,6024.0,8.5,41.109264,"[Drama, Crime]",8.306334
12481,The Dark Knight,2008.0,12269.0,8.3,123.167259,"[Drama, Action, Crime, Thriller]",8.208376
2843,Fight Club,1999.0,9678.0,8.3,63.869599,[Drama],8.184899
292,Pulp Fiction,1994.0,8670.0,8.3,140.950236,"[Thriller, Crime]",8.172155
351,Forrest Gump,1994.0,8147.0,8.2,48.307194,"[Comedy, Drama, Romance]",8.069421
522,Schindler's List,1993.0,4436.0,8.3,41.725123,"[Drama, History, War]",8.061007
23673,Whiplash,2014.0,4376.0,8.3,64.29999,[Drama],8.058025
5481,Spirited Away,2001.0,3968.0,8.3,41.048867,"[Fantasy, Adventure, Animation, Family]",8.035598
1154,The Empire Strikes Back,1980.0,5998.0,8.2,19.470959,"[Adventure, Action, Science Fiction]",8.025793


# Get top movies genre wise

In [21]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
gen_md.head(3).transpose()

Unnamed: 0,0,0.1,0.2
adult,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ..."
budget,30000000,30000000,30000000
homepage,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story
id,862,862,862
imdb_id,tt0114709,tt0114709,tt0114709
original_language,en,en,en
original_title,Toy Story,Toy Story,Toy Story
overview,"Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ..."
popularity,21.946943,21.946943,21.946943


In [22]:
def top_list_genre_wise(genre, percentile=0.95):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & 
                   (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: 
                        (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C),
                        axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [23]:
top_list_genre_wise('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
351,Forrest Gump,1994.0,8147,8,48.307194,7.86986
10309,Dilwale Dulhania Le Jayenge,1995.0,661,9,34.457024,7.582757
876,Vertigo,1958.0,1162,8,18.20822,7.298862
40251,Your Name.,2016.0,1030,8,34.461252,7.235471
883,Some Like It Hot,1959.0,835,8,11.845107,7.117619
1132,Cinema Paradiso,1988.0,834,8,14.177005,7.116921
19901,Paperman,2012.0,734,8,7.198633,7.041055
37863,Sing Street,2016.0,669,8,10.672862,6.984338
1639,Titanic,1997.0,7770,7,26.88907,6.916316
19731,Silver Linings Playbook,2012.0,4840,7,14.488111,6.869789


# Content based recommendation system : Using movie description and taglines and TF-IDF Vectorizer

In [24]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [25]:
md['id'] = md['id'].apply(convert_int)

In [26]:
md = md[md['id'].notnull()]

In [27]:
md['id'] = md['id'].astype('int')

In [28]:
smd = md[md['id'].isin(links_small.movieId)]
smd.shape

(2840, 25)

In [29]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [30]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [31]:
# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
#indices.head(2)

In [33]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [34]:
get_recommendations('The Godfather').head(10)

230         The Godfather: Part II
2525             Gang War in Milan
1144               Queen of Hearts
1825    The Cave of the Yellow Dog
1421           Running Out of Time
399        The Godfather: Part III
948                        8 Women
1268                 Bright Future
747              Jaws: The Revenge
2184             Short Sharp Shock
Name: title, dtype: object

In [35]:
get_recommendations('The Dark Knight').head(10)

24           Batman Forever
293          Batman Returns
111                  Batman
626                     JFK
1529          Batman Begins
1295        To End All Wars
322          Batman & Robin
2583    The Eleventh Victim
1615             Cul-de-sac
1831                 Taxi 4
Name: title, dtype: object

### This is not of much use to most people as it doesn't take into considerations very important features such as cast, crew, director and genre, which determine the rating and the popularity of a movie.

## Content based Recommendation System : Using movie description, taglines, keywords, cast, director and genres with Count Vectorizer

In [36]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [37]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [39]:
smd = md[md['id'].isin(links_small.movieId)]
smd.shape

(2858, 28)

In [40]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [41]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [42]:
smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [43]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [44]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [45]:
stemmer = SnowballStemmer('english')
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [46]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [47]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [48]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [49]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [50]:
get_recommendations('The Dark Knight').head(10)

1536     Batman Begins
1716      The Prestige
920           Insomnia
759            Memento
293     Batman Returns
322     Batman & Robin
24      Batman Forever
2070       Harry Brown
507           Superman
1166      The Enforcer
Name: title, dtype: object

### The recommendations seem to have recognized other Christopher Nolan movies (due to the high weightage given to director) and put them as top recommendations.
### I enjoyed watching The Dark Knight as well as some of the other ones in the list including Batman Begins, The Prestige and The Dark Knight Rises.

In [53]:
get_recommendations('Pulp Fiction').head(10)

350            Jackie Brown
1203      Kill Bill: Vol. 2
190          Reservoir Dogs
1099      Kill Bill: Vol. 1
1805            Death Proof
957          The 51st State
691                   Shaft
906          Changing Lanes
14      From Dusk Till Dawn
1174           No Good Deed
Name: title, dtype: object

## Add Popularity and Ratings

In [54]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & 
                       (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [55]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
1716,The Prestige,4510,8,2006.0,7.790919
759,Memento,4168,8,2000.0,7.775381
1536,Batman Begins,7511,7,2005.0,6.924519
111,Batman,2145,7,1989.0,6.767469
943,The Transporter,1724,6,2002.0,5.923217
293,Batman Returns,1706,6,1992.0,5.922571
920,Insomnia,1181,6,2002.0,5.897401
507,Superman,1042,6,1978.0,5.887738
24,Batman Forever,1529,5,1995.0,5.13668
322,Batman & Robin,1447,4,1997.0,4.373366


In [56]:
improved_recommendations('Pulp Fiction')

Unnamed: 0,title,vote_count,vote_average,year,wr
190,Reservoir Dogs,3821,8,1992.0,7.757063
1099,Kill Bill: Vol. 1,5091,7,2003.0,6.891457
1203,Kill Bill: Vol. 2,4061,7,2004.0,6.866586
350,Jackie Brown,1580,7,1997.0,6.702235
135,A Time to Kill,522,7,1996.0,6.372701
1336,Ocean's Twelve,2169,6,2004.0,5.936343
14,From Dusk Till Dawn,1644,6,1996.0,5.920261
1805,Death Proof,1359,6,2007.0,5.907586
1677,Snakes on a Plane,504,5,2006.0,5.286036
938,xXx,1454,5,2002.0,5.142109


# Content based engine is only capable of suggesting movies which are close to a certain movie. It is not capable of capturing tastes and providing recommendations across genres.