In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the two raw TMDB tables from the local data directory.
movies_path = 'data/tmdb_5000_movies.csv'
credits_path = 'data/tmdb_5000_credits.csv'
movies = pd.read_csv(movies_path)
credits = pd.read_csv(credits_path)

# Relabel the four credits columns by position so its key column is named
# 'id', the column the later merge with `movies` joins on.
credits.columns = ['id', 'title', 'cast', 'crew']
credits.columns

In [None]:
# Inner-join movie metadata with credits on the shared 'id' key. Both frames
# carry a 'title' column, so pandas suffixes them as 'title_x' / 'title_y'.
df = pd.merge(movies, credits, on='id')
df

In [None]:
# Mean rating over all movies. In the weighted score computed further down it
# is the prior that sparsely-rated movies are pulled toward.
m = df['vote_average'].mean()
m

In [None]:
# Vote-count cutoff: the 70th percentile of `vote_count`. It is used both to
# filter the movies that get ranked and as the weight of the prior mean `m`
# in the weighted score.
c = df['vote_count'].quantile(0.7)
c

In [None]:
# Keep only the movies whose vote count reaches the 70th-percentile cutoff `c`.
# The mask is applied first and the (smaller) result is copied, so `filt_df`
# is an independent frame.
enough_votes = df['vote_count'] >= c
filt_df = df.loc[enough_votes].copy()
filt_df

In [None]:
# IMDB weighted rating: blend each movie's own average with the global mean
# `m`, trusting the movie's own average more as its vote count grows past `c`.
counts = filt_df['vote_count']
avg = filt_df['vote_average']
own_weight = counts / (counts + c)
prior_weight = c / (c + counts)
filt_df['score'] = (own_weight * avg) + (prior_weight * m)


In [None]:
# Rank by weighted score, best first, and preview the top five.
filt_df = filt_df.sort_values('score', ascending=False)
filt_df.head(5)

# Recommending similar movies based on description
Using TF-IDF on each movie's overview, independently of ratings

In [None]:
# The merge left two title columns ('title_x' from movies, 'title_y' from
# credits). Keep the movies one under the plain name 'title' and drop both
# suffixed columns.
# The guard makes the cell safe to re-run: once 'title_x' is gone there is
# nothing left to do, instead of failing with a KeyError. A new frame is bound
# to `df` rather than mutating it in place.
if 'title_x' in df.columns:
    df = df.assign(title=df['title_x']).drop(columns=['title_x', 'title_y'])
df

In [None]:
# Keep only movies that have an overview to vectorise.
# The index is reset so that row labels equal row positions: the similarity
# matrices built from this frame are addressed by position, so a lookup that
# goes through the index label must land on the same row. reset_index also
# returns a new frame, so columns assigned to `vect_df` later are not written
# into a slice of `df`.
vect_df = df[df['overview'].notna()].reset_index(drop=True)
vect_df.shape

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix of the overviews (one row per movie in `vect_df`), with
# English stop words removed.
overviews = vect_df['overview']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_mat = tfidf.fit_transform(overviews)
tfidf_mat.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Called with a single matrix, cosine_similarity compares its rows with each
# other, giving the movie-by-movie similarity of the overviews.
cos_sim = cosine_similarity(tfidf_mat)

In [None]:
def recommend(title, cos_sim=cos_sim):
    """Return the titles of the ten movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in `vect_df['title']`; the first match is used.
    cos_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order follows the row
        order of `vect_df`.

    Returns
    -------
    pandas.Series
        Up to ten titles, most similar first, excluding the queried movie.

    Raises
    ------
    IndexError
        If no row of `vect_df` has this title.
    """
    # Use the row *position*, not the index label: the similarity matrix is
    # positional, and labels differ from positions whenever rows were
    # filtered out of `vect_df` without resetting its index.
    matches = np.flatnonzero((vect_df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'no movie titled {title!r} in vect_df')
    pos = matches[0]

    scores = np.asarray(cos_sim[pos])
    # Stable sort on the negated scores: descending order, ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query explicitly instead of assuming it is ranked first
    # (another row can tie with it at the top score).
    best = ranked[ranked != pos][:10]
    return vect_df.iloc[best]['title']

recommend('The Shawshank Redemption')

In [None]:
# Second spot check of the overview-based recommender.
recommend('The Avengers')

# Recommend based on keywords, genres, cast, and director
A step further: compare movies by their metadata instead of their overview text

In [None]:
# List the available columns before choosing which metadata fields to parse.
vect_df.columns

In [None]:
import ast


def _parse_literal(value):
    """Parse a stringified Python literal; leave an already-parsed list as is.

    The pass-through for lists makes this cell safe to re-run: calling
    `ast.literal_eval` on a value that is already a list raises ValueError.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


# These columns hold lists of dicts serialised as text; turn them into objects.
# `assign` binds a new frame to `vect_df` instead of writing columns into what
# may be a slice of `df`.
literal_cols = ['genres', 'keywords', 'cast', 'crew']
vect_df = vect_df.assign(
    **{col: vect_df[col].apply(_parse_literal) for col in literal_cols}
)

In [None]:
def extract_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; the 'job' and 'name' keys are read.

    Returns
    -------
    str or float
        The director's name, or `np.nan` when no entry has the job 'Director'.
    """
    for member in x:
        # .get: an entry without a 'job' key is skipped rather than raising.
        if member.get('job') == 'Director':
            return member['name']
    # `np.nan` is the supported spelling; the `np.NaN` alias was removed in
    # NumPy 2.0.
    return np.nan

# One director name (or NaN) per movie, taken from its parsed crew list.
vect_df['director'] = vect_df['crew'].map(extract_director)

In [None]:
def extract_top(x):
    """Return the 'name' of the first four entries of `x`, in order.

    Every entry's 'name' is read before truncating, so an entry without that
    key raises KeyError wherever it sits in the list. Fewer than four entries
    are returned unchanged in number.
    """
    all_names = [entry['name'] for entry in x]
    return all_names[:4]

# Add a '<column>_top' column holding the first few names of each list column.
feats = ['cast', 'genres', 'keywords']
for source_col in feats:
    vect_df[source_col + '_top'] = vect_df[source_col].map(extract_top)

In [None]:
# Confirm the new 'director' and '*_top' columns are present.
vect_df.columns

In [None]:
# Work on an independent (deep) copy so the cleaning below leaves `vect_df` untouched.
df_feats = vect_df.copy(deep=True)

In [None]:
def clean_data(x):
    """Lower-case a name (or list of names) and strip the spaces out of it.

    Removing spaces turns each multi-word name into a single token.

    Parameters
    ----------
    x : list of str, str, None or NaN
        A list is cleaned element-wise; a missing value (None or any float
        NaN) becomes the empty string.

    Returns
    -------
    list of str or str
    """
    if isinstance(x, list):
        return [item.replace(' ', '').lower() for item in x]
    # Detect missing values by value rather than by identity with `np.NaN`:
    # that alias was removed in NumPy 2.0, and an identity test misses NaN
    # objects other than the NumPy singleton. `x != x` is true only for NaN.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    return x.replace(' ', '').lower()

# Normalise every feature column that feeds the combined text.
feats = ['director', 'genres_top', 'cast_top', 'keywords_top']
for feature_col in feats:
    df_feats[feature_col] = df_feats[feature_col].map(clean_data)


In [None]:
def combine(x):
    """Build one space-separated string from a row's keywords, cast, genres and director.

    `x` is indexed by 'keywords_top', 'cast_top', 'genres_top' (lists of
    strings) and 'director' (a string). An empty part still contributes its
    separator, so the result can contain consecutive spaces.
    """
    parts = [
        ' '.join(x['keywords_top']),
        ' '.join(x['cast_top']),
        ' '.join(x['genres_top']),
        x['director'],
    ]
    return ' '.join(parts)
# Apply `combine` row by row (axis=1) to get one metadata string per movie.
df_feats['combined_feat'] = df_feats.apply(combine, axis=1)

In [None]:
# Preview a few combined strings to check the cleaning and joining.
df_feats['combined_feat'].head(3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token counts over each movie's combined metadata string.
combined_text = df_feats['combined_feat']
count = CountVectorizer(stop_words='english')
count_mat = count.fit_transform(combined_text)

In [None]:
# Called with a single matrix, cosine_similarity compares its rows with each
# other; the result is then plugged into the same recommender as before.
sim_small = cosine_similarity(count_mat)
recommend('The Avengers', cos_sim=sim_small)

# Recommend based on both previous concepts
Very simply take the closest items across both similarity matrices. The results will tend toward the keywords/cast/crew similarity, because those feature strings are much shorter than the overviews.

In [None]:
def combined_recommend(title, cos_sim=cos_sim, sim_small=sim_small):
    """Return the ten movies closest to `title` under either similarity matrix.

    Each candidate is scored by the larger of its two similarities to the
    query, so a movie that is close in overview text or in metadata can be
    recommended, and each movie appears at most once.

    Parameters
    ----------
    title : str
        Exact title to look up in `vect_df['title']`; the first match is used.
    cos_sim, sim_small : 2-D array-like
        Pairwise similarity matrices whose row/column order follows the row
        order of `vect_df`.

    Returns
    -------
    pandas.Series
        Up to ten titles, best score first, excluding the queried movie.

    Raises
    ------
    IndexError
        If no row of `vect_df` has this title.
    """
    # Use the row *position*, not the index label: both matrices are
    # positional, and labels differ from positions whenever rows were
    # filtered out of `vect_df` without resetting its index.
    matches = np.flatnonzero((vect_df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f'no movie titled {title!r} in vect_df')
    pos = matches[0]

    # Element-wise maximum instead of concatenating the two score lists:
    # concatenation ranks every movie twice, so the results could repeat a
    # movie and still contain the query itself.
    best_score = np.maximum(np.asarray(cos_sim[pos]), np.asarray(sim_small[pos]))
    # Stable sort on the negated scores: descending order, ties in row order.
    ranked = np.argsort(-best_score, kind='stable')
    top = ranked[ranked != pos][:10]
    return vect_df.iloc[top]['title']

combined_recommend('The Avengers')

# Working with user info
Collaborative filtering

In [None]:
# Load the per-user ratings table and preview its first ten rows.
ratings_path = 'data/ratings.csv'
user_df = pd.read_csv(ratings_path)
user_df.head(10)

In [None]:
# Keep an untouched deep copy of the raw ratings, then report the table size.
user_df_copy = user_df.copy(deep=True)
user_df.shape

In [None]:
# Features are the (movieId, userId) pair; the target is the rating given.
x = user_df.loc[:, ['movieId', 'userId']]
y = user_df.loc[:, 'rating']

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): movieId and userId are identifiers, so the distance between
# two id pairs has no obvious meaning; treat this as a rough baseline only.
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(x, y)

# Query with the same column names the model was fitted on. A bare nested
# list carries no feature names, which scikit-learn warns about, and hides
# which value is the movie (302) and which the user (1).
query = pd.DataFrame([[302, 1]], columns=x.columns)
knn.predict(query)

In [None]:
from surprise import Reader, Dataset, SVD

# Surprise reads the frame positionally as (user id, item id, rating), so the
# columns must be passed in that order. With movieId first, movies were
# treated as users and users as items.
ratings = user_df[['userId', 'movieId', 'rating']]

# Take the rating scale from the data instead of relying on Reader's default
# of (1, 5), which would be wrong for ratings outside that range.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
data = Dataset.load_from_df(ratings, reader)

In [None]:
from surprise.model_selection import cross_validate

# SVD starts from randomly initialised factors; fixing the seed (42 is an
# arbitrary choice) makes the fitted model, and every prediction below,
# reproducible across kernel restarts.
svd = SVD(random_state=42)

# Fit on every available rating (no hold-out split).
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# Estimated rating for one pair of raw ids. Surprise's `predict` takes the
# user id first and the item id second.
svd.predict(1, 302)

# Predict for specific user

In [None]:
# Name the movie key like the ratings table does, so the two can be merged on it.
df_feats = df_feats.rename(columns={'id': 'movieId'})
df_feats.shape, user_df.shape

In [None]:
# Left join: every rating is kept; ratings for movies missing from `df_feats`
# get NaN in all the movie columns.
combined_df = pd.merge(user_df, df_feats, how='left', on='movieId')
combined_df.shape

In [None]:
# Drop only the ratings that found no matching movie (no 'title' after the
# left join). A bare dropna() would also discard every rating whose movie has
# a missing value in any other, unrelated column.
combined_df_clean = combined_df.dropna(subset=['title'])
combined_df_clean.shape

In [None]:
# Preview the ratings that were matched to a movie.
combined_df_clean.head()

In [None]:
def recommend_for(userId, combdf, cos_sim=cos_sim, sim_small=sim_small):
    """Recommend movies similar to the one a user rated highest.

    Parameters
    ----------
    userId : int
        Value matched against `combdf.userId`.
    combdf : pandas.DataFrame
        Ratings joined with movie data; the 'userId', 'rating' and 'title'
        columns are read.
    cos_sim, sim_small : 2-D array-like
        Similarity matrices handed on to `combined_recommend`.

    Returns
    -------
    pandas.Series
        Whatever `combined_recommend` returns for the user's top-rated title.

    Raises
    ------
    IndexError
        If `combdf` holds no rating by this user.
    """
    user_data = combdf[combdf.userId == userId]
    print(f'User {userId} has rated {user_data.shape[0]} movies')
    if user_data.empty:
        # Same exception type as the bare `.iloc[0]` would raise, with a
        # message that says what is actually wrong.
        raise IndexError(f'user {userId} has no rated movies in the given frame')
    top_rated = user_data.sort_values('rating', ascending=False).iloc[0]['title']
    # Forward the matrices the caller passed; previously they were accepted
    # but ignored, so the module-level defaults were always used.
    return combined_recommend(top_rated, cos_sim=cos_sim, sim_small=sim_small)
recommend_for(1, combined_df_clean)