In [2]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as stats
from ast import literal_eval
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate,train_test_split
import warnings
warnings.simplefilter('ignore')

SyntaxError: invalid syntax (<ipython-input-2-85dd0c6ee1f4>, line 1)

In [None]:
# Load the movies metadata CSV into `df`.
df=pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")
# Preview the first five rows.
df.head(5)

In [None]:
# Parse each `genres` cell (a stringified Python literal) and keep only the
# 'name' of every entry. Missing cells are treated as '[]'; anything that does
# not parse to a list becomes an empty list.
df['genres'] = (
    df['genres']
    .fillna('[]')
    .map(literal_eval)
    .map(lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else [])
)

In [None]:
# Drop missing values, then truncate both columns to integers.
vote_counts = df['vote_count'].dropna().astype('int')
vote_averages = df['vote_average'].dropna().astype('int')

# C: mean of the integer-truncated vote averages across all movies.
C = vote_averages.mean()
C

In [None]:
# m: the 95th-percentile vote count; used below as the minimum number of votes
# a movie needs to enter the chart.
m = vote_counts.quantile(0.95)
m

In [None]:
# Extract the release year as a string. `errors='coerce'` turns missing or
# unparseable dates into NaT, and those rows get NaN as their year.
# Use pd.notnull for the check: `x != np.nan` is always True because NaN never
# compares equal to anything, which would leave such rows with the year 'NaT'.
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [None]:
# Movies with at least `m` votes and a known average rating. `.copy()` gives an
# independent frame, so the column assignments below do not write into a slice
# of `df` (which would otherwise trigger SettingWithCopyWarning).
rated_movies = df.loc[
    (df['vote_count'] >= m) & df['vote_count'].notnull() & df['vote_average'].notnull(),
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
rated_movies['vote_count'] = rated_movies['vote_count'].astype('int')
# NOTE(review): casting to int truncates the average (e.g. 7.9 -> 7); kept as in
# the rest of the notebook, but confirm this loss of precision is intended.
rated_movies['vote_average'] = rated_movies['vote_average'].astype('int')
rated_movies.shape

So, 2274 movies qualify for the chart: these are the movies whose vote count is in the top 5%.

In [None]:
def formula_rating(s):
    """Weighted rating of one movie row.

    Blends the row's own average (`s['vote_average']`) with the mean `C`,
    weighting by the row's vote count relative to the threshold `m`:

        v / (v + m) * R  +  m / (m + v) * C

    `m` and `C` are read from the notebook's global namespace at call time.
    """
    votes = s['vote_count']
    rating = s['vote_average']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (own_weight * rating) + (prior_weight * C)

In [None]:
# Score every qualifying movie with the weighted rating, best first.
rated_movies = (
    rated_movies
    .assign(wr=rated_movies.apply(formula_rating, axis=1))
    .sort_values(by='wr', ascending=False)
)

In [None]:
rated_movies.head(10)

In [None]:
# Build one row per (movie, genre) pair. `explode` repeats the row label for
# every list element; movies with an empty genre list yield NaN, which is
# dropped so they end up with a NaN genre after the join below.
# This replaces a row-wise `apply(pd.Series)` + `stack`, which builds a Series
# object per movie and is very slow on the full metadata table.
s = df['genres'].explode().dropna()
s.name = 'genre'
gen_md = df.drop('genres', axis=1).join(s)

In [None]:
def genre_recommendations(genre, percentile=0.85):
    """Build a top chart for a single genre using the weighted-rating formula.

    Parameters
    ----------
    genre : str
        Genre name to match against the `genre` column of the global `gen_md`.
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the minimum-votes
        threshold `m`.

    Returns
    -------
    pandas.DataFrame
        Up to 250 rows with columns title, year, vote_count, vote_average,
        popularity and wr, sorted by wr in descending order. Empty when no
        movie of that genre qualifies.
    """
    df1 = gen_md[gen_md['genre'] == genre]
    vote_counts = df1['vote_count'].dropna().astype('int')
    vote_averages = df1['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # .copy() so the assignments below do not write into a slice of gen_md.
    rated_movies = df1.loc[
        (df1['vote_count'] >= m) & df1['vote_count'].notnull() & df1['vote_average'].notnull(),
        ['title', 'year', 'vote_count', 'vote_average', 'popularity'],
    ].copy()
    rated_movies['vote_count'] = rated_movies['vote_count'].astype('int')
    rated_movies['vote_average'] = rated_movies['vote_average'].astype('int')

    # Column-wise arithmetic instead of a row-wise apply: same values, and it
    # still yields a plain (empty) column when no movie qualifies.
    v = rated_movies['vote_count']
    R = rated_movies['vote_average']
    rated_movies['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return rated_movies.sort_values('wr', ascending=False).head(250)

In [None]:
genre_recommendations('Action').head(10)

We will now try to provide the user with much more personalized recommendations.

In [None]:
# Despite the name, `df_small` ends up as a Series: the non-null `tmdbId`
# values of links_small.csv, cast to int.
df_small = (
    pd.read_csv('../input/the-movies-dataset/links_small.csv')['tmdbId']
    .dropna()
    .astype('int')
)

In [None]:
# Keep only rows whose `id` parses as a number, then cast the column to int.
# Filtering on the value (rather than dropping fixed row labels) keeps this cell
# re-runnable: dropping labels that are already gone raises a KeyError.
# NOTE(review): presumably the rows with labels 19730, 29503 and 35587 are
# exactly the ones with a non-numeric id - verify against the data.
df = df[pd.to_numeric(df['id'], errors='coerce').notnull()].copy()
df['id'] = df['id'].astype('int')

In [None]:
# Restrict the metadata to the movies whose id appears in `df_small`.
# .copy() makes `sdf` an independent frame, so adding columns to it later does
# not write into a slice of `df`.
sdf = df[df['id'].isin(df_small)].copy()
sdf.shape

In [None]:
# Build the text used for content-based similarity: overview followed by tagline.
# Both parts are null-filled first so that a missing overview no longer discards
# an existing tagline, and they are joined with a space so the last word of the
# overview is not glued to the first word of the tagline.
sdf = sdf.assign(
    tagline=sdf['tagline'].fillna(''),
    description=(sdf['overview'].fillna('') + ' ' + sdf['tagline'].fillna('')).str.strip(),
)

In [None]:
# TF-IDF over unigrams and bigrams of the descriptions, English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is what
# min_df=0 meant; an integer 0 is rejected by scikit-learn's parameter validation
# in recent releases (an int min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(sdf['description'])

In [None]:
tfidf_matrix.shape

In [None]:
# Pairwise similarity between all descriptions. TfidfVectorizer L2-normalises
# each row by default (norm='l2'), so the plain dot product computed by
# linear_kernel equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Renumber the rows 0..n-1 so that row positions in `sdf` line up with the
# rows/columns of `cosine_sim` (the old labels are kept in an 'index' column).
sdf = sdf.reset_index()
# Similarity scores of the first movie. Kept as the last expression, because a
# notebook cell only displays its final expression.
cosine_sim[0]

In [None]:
# `titles`: the title column; `indices`: reverse lookup from title to row label
# in `sdf`.
titles = sdf['title']
indices = pd.Series(sdf.index, index=titles)

In [None]:
def get_recommendations(title, n=30):
    """Return the titles of the `n` movies most similar to `title`.

    Similarity is read from the global `cosine_sim` matrix; the global
    `indices` maps a title to its row and `titles` maps rows back to titles.

    Parameters
    ----------
    title : str
        Title to look up in `indices`. A KeyError is raised if it is unknown.
    n : int, default 30
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title, so the lookup returned a Series;
        # fall back to the first match instead of failing further down.
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position rather than assuming it sorts first:
    # ties in similarity could otherwise drop a different movie and recommend
    # the query itself.
    ranked = sorted(
        ((i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [i for i, _ in ranked[:n]]
    return titles.iloc[movie_indices]

In [None]:
get_recommendations('Inception').head(10)

Metadata-based recommender

In [None]:
# Load the credits and keywords tables; both are merged into `df` on `id` below.
credits = pd.read_csv('../input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('../input/the-movies-dataset/keywords.csv')

In [None]:
# Give the join key the same integer dtype in all three frames before merging.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
# `df['id']` was already cast to int in an earlier cell; repeating it is harmless.
df['id'] = df['id'].astype('int')

In [None]:
df.shape

In [None]:
# Attach cast/crew and keywords to the metadata (inner joins on `id`, the
# default for DataFrame.merge), then rebuild the small subset from the result.
# NOTE: this cell overwrites `df`; running it twice merges the tables again and
# produces suffixed duplicate columns, so re-run from the top instead.
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')
# .copy() makes `sdf` an independent frame, so the column assignments in the
# following cells do not write into a slice of `df`.
sdf = df[df['id'].isin(df_small)].copy()
sdf.shape

In [None]:
# cast, crew and keywords are stored as stringified Python literals; parse them.
sdf['cast'] = sdf['cast'].map(literal_eval)
sdf['crew'] = sdf['crew'].map(literal_eval)
sdf['keywords'] = sdf['keywords'].map(literal_eval)
# Number of entries in each movie's cast and crew.
sdf['cast_size'] = sdf['cast'].map(len)
sdf['crew_size'] = sdf['crew'].map(len)

In [None]:
def get_director(x):
    """Return the 'name' of the first entry in `x` whose 'job' is 'Director'.

    `x` is an iterable of dict-like crew entries. Entries after the first
    director are never inspected. Returns np.nan when no entry matches.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [None]:
# Director name per movie, taken from the parsed crew list (NaN when the crew
# has no entry with job 'Director').
sdf['director'] = sdf['crew'].apply(get_director)

In [None]:
# Reduce each cast list to the names of its first three entries (fewer if the
# list is shorter); anything that is not a list becomes an empty list.
sdf['cast'] = sdf['cast'].apply(
    lambda members: [member['name'] for member in members][:3] if isinstance(members, list) else []
)

In [None]:
# Keep only the 'name' of every keyword entry; non-list cells become [].
sdf['keywords'] = sdf['keywords'].map(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
# Collapse each actor name into a single lowercase token (spaces removed), so
# that first and last names are not treated as separate words later on.
sdf['cast'] = sdf['cast'].apply(
    lambda names: [name.replace(" ", "").lower() for name in names]
)

In [None]:
# Collapse the director name into a single lowercase token (spaces removed).
# Missing directors are filled with '' instead of being stringified: casting NaN
# to str would give the literal token 'nan', making every movie without a
# director look as if it shared one.
sdf['director'] = sdf['director'].fillna('').apply(lambda x: str.lower(x.replace(" ", "")))
# Repeat the token three times to give the director more weight in the soup;
# movies without a director contribute nothing.
sdf['director'] = sdf['director'].apply(lambda x: [x, x, x] if x else [])

In [None]:
# Flatten the per-movie keyword lists into one long Series (one keyword per
# row, indexed by the movie's row label). Empty lists yield NaN and are dropped.
# `explode` replaces a much slower row-wise `apply(pd.Series)` + `stack`.
s = sdf['keywords'].explode().dropna()
s.name = 'keyword'

In [None]:
# Replace `s` with a frequency table: index = keyword, value = occurrence count.
s = s.value_counts()
# Show the first five entries.
s[:5]

In [None]:
# Keep only keywords that occur more than once.
s = s[s>1]

A stemmer reduces words to their base or root form.

In [None]:
# English Snowball stemmer, applied to the keywords below.
stemmer = SnowballStemmer('english')
# Quick demonstration on a plural word.
stemmer.stem('cats')

In [None]:
def filter_keywords(x):
    """Return the items of `x` that are present in the global `s`, in order.

    `s` is read at call time. When `s` is a pandas Series, `in` tests
    membership in its index (the keyword labels), not in its values.
    """
    return [keyword for keyword in x if keyword in s]

In [None]:
# Per movie: drop keywords rejected by filter_keywords, stem what is left, then
# strip spaces and lowercase so each keyword becomes a single token.
sdf['keywords'] = sdf['keywords'].apply(
    lambda raw: [
        stemmer.stem(keyword).replace(" ", "").lower()
        for keyword in filter_keywords(raw)
    ]
)

In [None]:
# Concatenate the four token lists per movie and join them into one
# space-separated string for the vectorizer.
sdf['soup'] = (
    sdf['keywords'] + sdf['cast'] + sdf['director'] + sdf['genres']
).apply(' '.join)

In [None]:
# Raw term counts over unigrams and bigrams of the metadata soup.
# min_df=1 keeps every term that occurs in at least one document, which is what
# min_df=0 meant; an integer 0 is rejected by scikit-learn's parameter validation
# in recent releases (an int min_df must be >= 1).
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(sdf['soup'])

In [None]:
# Pairwise cosine similarity between the count vectors of all movies.
# This overwrites the description-based `cosine_sim` computed earlier, so the
# recommender functions use the metadata similarity from here on.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Renumber rows 0..n-1 to match the rows of the new `cosine_sim`, then rebuild
# the title -> row lookup and the title column used by the recommenders.
sdf = sdf.reset_index()
indices = pd.Series(sdf.index, index=sdf['title'])
titles = sdf['title']

In [None]:
get_recommendations('Memento').head(10)

    To take ratings and popularity into account:
    Take the 25 most similar movies from the similarity scores above, keep the top 40% by vote count (i.e. 10 movies), and rank them by weighted rating.
   
   

In [None]:

def improved_recommendations(title):
    """Recommend up to 10 well-rated movies among the 25 most similar to `title`.

    Steps:
      1. take the 25 movies most similar to `title` according to the global
         `cosine_sim` (looked up through the global `indices` and `sdf`);
      2. keep those whose vote count reaches the 60th percentile of the
         candidates' vote counts (`m`);
      3. score them with the weighted rating, using `m` and the candidates'
         mean rating `C`, and return the 10 best.

    The weighted rating is computed here from the local `m` and `C`; the
    notebook-level `formula_rating` reads the global `m`/`C` of the overall
    chart and would ignore the candidate-specific values.

    Returns a DataFrame with columns title, vote_count, vote_average, year, wr.
    Raises KeyError if `title` is not in `indices`.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title; use the first match.
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank all other movies by similarity; the query itself is excluded by
    # position rather than by assuming it sorts first.
    ranked = sorted(
        ((i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [i for i, _ in ranked[:25]]

    movies = sdf.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies['vote_count'].dropna().astype('int')
    vote_averages = movies['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    # .copy() so the assignments below do not write into a slice of `movies`.
    rated_movies = movies[
        (movies['vote_count'] >= m)
        & movies['vote_count'].notnull()
        & movies['vote_average'].notnull()
    ].copy()
    rated_movies['vote_count'] = rated_movies['vote_count'].astype('int')
    rated_movies['vote_average'] = rated_movies['vote_average'].astype('int')

    v = rated_movies['vote_count']
    R = rated_movies['vote_average']
    rated_movies['wr'] = (v / (v + m) * R) + (m / (m + v) * C)
    return rated_movies.sort_values('wr', ascending=False).head(10)

In [None]:
improved_recommendations('The Shawshank Redemption')

Collaborative filtering

In [None]:
# Surprise reader declaring the rating scale as 1..5.
# NOTE(review): confirm that the minimum and maximum of the ratings file loaded
# below actually match this scale.
reader = Reader(rating_scale=(1, 5))

In [None]:
# Load the user ratings; the userId, movieId and rating columns are used below.
ratings = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
ratings.head()

In [None]:
# Wrap the (userId, movieId, rating) columns in a Surprise Dataset, using the
# rating scale declared by `reader`.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [None]:
# Seed the random number generators so the reported scores are reproducible:
# - `random_state` fixes the SVD factor initialisation;
# - the cross-validation folds are drawn from numpy's global RNG when no
#   random_state is given, hence the explicit np.random.seed.
np.random.seed(42)
svd = SVD(random_state=42)
# 5-fold cross-validation reporting RMSE and MAE.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

In [None]:

# Refit the model on all available ratings (no hold-out split).
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
ratings[ratings['userId'] == 5]

In [None]:
svd.predict(5, 150)

Hybrid Recommendation

In [None]:
# Build a hybrid recommender that combines the techniques covered above.

In [None]:
def convert_int(x):
    """Convert `x` to int, returning np.nan when it cannot be converted.

    Only conversion failures are turned into NaN: TypeError (e.g. None),
    ValueError (e.g. a non-numeric string or a float NaN) and OverflowError
    (a float infinity). Anything else, such as KeyboardInterrupt, propagates
    instead of being silently swallowed.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [None]:
# Title-indexed lookup table with two columns: `movieId` (the id used in the
# ratings data) and `id` (the TMDB id, renamed from `tmdbId`). Only movies
# present in `sdf` survive the merge.
id_map = (
    pd.read_csv('../input/the-movies-dataset/links_small.csv')[['movieId', 'tmdbId']]
    .assign(tmdbId=lambda links: links['tmdbId'].apply(convert_int))
    .rename(columns={'tmdbId': 'id'})
    .merge(sdf[['title', 'id']], on='id')
    .set_index('title')
)

In [None]:
# The same mapping re-indexed by `id` (the TMDB id), used for id -> movieId lookups.
indices_map = id_map.set_index('id')
# Show the columns of the title-indexed table.
id_map.columns

In [None]:
def hybrid(userId, title):
    """Recommend 10 movies similar to `title`, ranked by the user's predicted rating.

    The 50 movies most similar to `title` (global `cosine_sim`, looked up
    through `indices` and `sdf`) are scored with the fitted global `svd` model
    for `userId`; the 10 with the highest estimated rating are returned.

    Parameters
    ----------
    userId : int
        User id as it appears in the ratings data.
    title : str
        Movie title; a KeyError is raised if it is not in `indices`.

    Returns
    -------
    pandas.DataFrame
        Columns title, vote_count, vote_average, year, id, est; sorted by
        `est` in descending order.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several rows share this title; use the first match (int() on a
        # multi-element Series would raise).
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank all other movies by similarity; the query itself is excluded by
    # position rather than by assuming it sorts first.
    ranked = sorted(
        ((i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [i for i, _ in ranked[:50]]
    movies = sdf.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']].copy()

    def _estimate(tmdb_id):
        # Translate the TMDB id into the ratings movieId. If the id occurs more
        # than once in indices_map the lookup returns several values; take the
        # first so that predict() always receives a scalar.
        movie_id = np.atleast_1d(indices_map.loc[tmdb_id, 'movieId'])[0]
        return svd.predict(userId, movie_id).est

    movies['est'] = movies['id'].apply(_estimate)
    return movies.sort_values('est', ascending=False).head(10)

In [None]:
hybrid(1, 'Pirates of the Caribbean: The Curse of the Black Pearl')

In [None]:
hybrid(5, 'Pirates of the Caribbean: The Curse of the Black Pearl')