In [None]:
# Importing pandas, numpy and matplotlib pyplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the TMDB movie metadata and the TMDB credits CSV files into two DataFrames.
# Both paths are relative to the notebook's working directory.
tmdb_movies_df = pd.read_csv('tmdb-movie-metadata/tmdb_5000_movies.csv')
tmdb_credits_df = pd.read_csv('tmdb-movie-metadata/tmdb_5000_credits.csv')

# Demographic Filtering System

In [None]:
# Round the popularity column to 2 decimal places (keeps the bar labels below short)
tmdb_movies_df['popularity'] = tmdb_movies_df['popularity'].round(2)

In [None]:
# Horizontal bar chart of the 10 most popular movies, most popular at the top
pop = tmdb_movies_df.sort_values('popularity', ascending=False)
top10_titles = pop['title'].head(10)
top10_popularity = pop['popularity'].head(10)

plt.figure(figsize=(10, 4))
plt.barh(top10_titles, top10_popularity, align='center', color='red')
# barh draws the first entry at the bottom; flip the axis so it ends up on top
plt.gca().invert_yaxis()
# Write each popularity value at the end of its bar
for bar_position, bar_value in enumerate(top10_popularity):
    plt.text(bar_value, bar_position, str(bar_value))
plt.xlabel("Popularity")
plt.title("Popular Movies")

In [None]:
# Mean of the vote_average column over all movies
vmean = tmdb_movies_df['vote_average'].mean()

# 95th percentile of the vote_count column: the minimum number of votes a
# movie needs in order to be kept
vquant = tmdb_movies_df['vote_count'].quantile(0.95)

# Independent copy containing only the movies that reach that vote count
has_enough_votes = tmdb_movies_df['vote_count'] >= vquant
top_movies = tmdb_movies_df.loc[has_enough_votes].copy()

In [None]:
# Weighted rating based on the IMDB formula
def score(x, m=vquant, vmean=vmean):
    """Return the weighted rating of a single movie.

    The value is computed as

        v / (v + m) * R  +  m / (m + v) * vmean

    where ``v`` is the movie's vote count and ``R`` its average vote.

    Parameters
    ----------
    x : mapping or pandas.Series
        One movie record; must provide the keys 'vote_count' and
        'vote_average'.
    m : number, optional
        Vote-count threshold used as the weight of the global mean.
        Defaults to the value ``vquant`` had when this function was defined.
    vmean : number, optional
        Mean rating the score is pulled towards.  Defaults to the value the
        global ``vmean`` had when this function was defined.

    Returns
    -------
    number
        The weighted rating of the movie.
    """
    v = x['vote_count']
    R = x['vote_average']
    # Use the ``m`` argument: the previous body read the global ``vquant``
    # here, so a threshold passed by the caller was silently ignored.
    return (v / (v + m) * R) + (m / (m + v) * vmean)

In [None]:
# Score every qualifying movie with score() and keep 2 decimal places
top_movies['score'] = top_movies.apply(score, axis=1).round(2)

# Order the movies from the highest to the lowest score
top_movies = top_movies.sort_values('score', ascending=False)

# Show the five best-scoring movies
top_movies[['title', 'vote_count', 'vote_average', 'score']].head()

In [None]:
# Horizontal bar chart of the 10 best-scoring movies, best score at the top
pop = top_movies.sort_values('score', ascending=False)
top10_titles = pop['title'].head(10)
top10_scores = pop['score'].head(10)

plt.figure(figsize=(10, 4))
plt.barh(top10_titles, top10_scores, align='center', color='blue')
# barh draws the first entry at the bottom; flip the axis so it ends up on top
plt.gca().invert_yaxis()
# Write each score at the end of its bar
for bar_position, bar_value in enumerate(top10_scores):
    plt.text(bar_value, bar_position, str(bar_value))
plt.xlabel("Score")
plt.title("Highest Rated Movies")

# Content-Based Recommender System

In [None]:
# Importing literal_eval to parse stringified JSON-like columns into Python objects
from ast import literal_eval

In [None]:
# Data Preprocessing to convert data into required format

In [None]:
# Replace missing overviews with empty strings so that every row of the
# overview column holds a string
tmdb_movies_df['overview'] = tmdb_movies_df['overview'].fillna('')

In [None]:
# Flatten the keywords and genres columns into space-separated token strings
# and combine them into a single Key_Gen column

def _names_to_tokens(entries):
    """Turn a list of ``{'name': ...}`` dicts into one token string.

    Each name has its inner spaces removed (so a multi-word name becomes a
    single token) and is lower-cased; every token is prefixed by one space.
    An empty list gives an empty string.
    """
    return ''.join(' ' + entry['name'].replace(" ", "").lower() for entry in entries)


# Convert the keywords and genres columns from their string form into Python
# objects.  Values that are already parsed are left untouched, so running this
# cell a second time does not fail inside literal_eval.
for column in ('keywords', 'genres'):
    tmdb_movies_df[column] = tmdb_movies_df[column].apply(
        lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

# One token string per movie, in row order
Keywords_list = [_names_to_tokens(entries) for entries in tmdb_movies_df['keywords']]
Keywords_series = pd.Series(Keywords_list, name='New_Keywords')

genres_list = [_names_to_tokens(entries) for entries in tmdb_movies_df['genres']]
Genres_series = pd.Series(genres_list, name='New_Genres')

# Attach both token columns to the movie table (joined on the row index)
New_movie_df = tmdb_movies_df.join(Keywords_series).join(Genres_series)

# Modified keywords and genres combined into a single column; every token
# string starts with a space, so plain concatenation keeps tokens separated
New_movie_df['Key_Gen'] = New_movie_df['New_Keywords'] + New_movie_df['New_Genres']

In [None]:
# Extract the first 4 cast members and the director(s) of every movie, then
# combine both into a single Cast_Director column

def _squash_names(names):
    """Join person names into one token string.

    Each name has its spaces removed and is lower-cased, and every token is
    prefixed by one space.  No names gives an empty string.
    """
    return ''.join(' ' + name.replace(" ", "").lower() for name in names)


# Convert the cast and crew columns from their string form into Python
# objects.  Values that are already parsed are left untouched, so running this
# cell a second time does not fail inside literal_eval.
for column in ('cast', 'crew'):
    tmdb_credits_df[column] = tmdb_credits_df[column].apply(
        lambda raw: literal_eval(raw) if isinstance(raw, str) else raw)

# First 4 listed cast members of each movie
cast_list = [_squash_names(member['name'] for member in cast[:4])
             for cast in tmdb_credits_df['cast']]
Cast_series = pd.Series(cast_list, name='Top_Cast')

# Every crew member whose job is exactly 'Director'
director_list = [_squash_names(member['name'] for member in crew
                               if member['job'] == 'Director')
                 for crew in tmdb_credits_df['crew']]
director_series = pd.Series(director_list, name='Director')

# Both series carry a fresh 0..n-1 index, so they are matched to the movie
# table by that index.  Column assignment aligns exactly like the previous
# left join did, but can be repeated without a "columns overlap" error.
# NOTE(review): this assumes tmdb_credits_df lists the movies in the same row
# order as New_movie_df - confirm, or merge on a shared id column instead.
New_movie_df['Top_Cast'] = Cast_series
New_movie_df['Director'] = director_series

# Director(s) and top actors combined into a single column
New_movie_df['Cast_Director'] = New_movie_df['Director'] + New_movie_df['Top_Cast']

In [None]:
# Creating new column by combining Keywords, generes, top 4 actors and director of the movie
New_movie_df['Combined'] = New_movie_df['Key_Gen'] + New_movie_df['Cast_Director']

In [None]:
# Now designing content based recommender systems with different combinations

In [None]:
# Importing TfidfVectorizer, CountVectorizer and cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Look up a movie's similarity row and return the titles of the 10 most similar movies
def movie_recommend(mtitle, cosSim):
    """Return the titles of the (up to) 10 movies most similar to ``mtitle``.

    Reads two notebook-level globals: ``index_series`` (movie title -> row
    number) and ``tmdb_movies_df`` (source of the returned titles).

    Parameters
    ----------
    mtitle : str
        Title of the movie to find recommendations for.
    cosSim : 2-D array-like
        Pairwise similarity matrix; ``cosSim[i][j]`` is the similarity between
        the movies in rows ``i`` and ``j``.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``mtitle`` is not present in ``index_series``.
    """
    indx = index_series[mtitle]
    # Several movies can share one title; the lookup then returns a Series of
    # row numbers instead of a scalar.  Use the first match in that case.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]
    indx = int(indx)

    # Exclude the query movie by its row number rather than by assuming it is
    # the first entry after sorting: with tied similarities (identical feature
    # strings, or an all-zero feature vector) it is not guaranteed to sort first.
    scores_similarity = [(row, similarity)
                         for row, similarity in enumerate(cosSim[indx])
                         if row != indx]
    scores_similarity.sort(key=lambda pair: pair[1], reverse=True)

    mindices = [row for row, _ in scores_similarity[:10]]
    return tmdb_movies_df['title'].iloc[mindices]

In [None]:
# TF-IDF matrix of the overview column, with English stop words removed
tfidfvec = TfidfVectorizer(stop_words='english')
tfidf_mtx = tfidfvec.fit_transform(tmdb_movies_df['overview'])

# Cosine similarity of every overview vector with every other one
# (with a single argument the matrix is compared against itself)
cosSim_1 = cosine_similarity(tfidf_mtx)

# Lookup from movie title to the row label of the movie dataframe
movie_titles = tmdb_movies_df['title']
index_series = pd.Series(tmdb_movies_df.index, index=movie_titles)

In [None]:
# Recommend movies similar to 'Interstellar' based on the plot text
# (similarity computed from the overview column)
movie_recommend('Interstellar', cosSim_1)

In [None]:
# TF-IDF matrix of the Key_Gen column (keywords and genres of each movie)
tfidf_1 = TfidfVectorizer(stop_words='english')
tfidf_mtx_1 = tfidf_1.fit_transform(New_movie_df['Key_Gen'])

# Cosine similarity computed from the TF-IDF matrix
cosSim_2 = cosine_similarity(tfidf_mtx_1, tfidf_mtx_1)

# Renumber the rows 0..n-1 and rebuild the title -> row lookup.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column; without it every run of this cell adds another column and
# a later run fails because the generated column name already exists.
New_movie_df = New_movie_df.reset_index(drop=True)
index_series = pd.Series(New_movie_df.index, index=New_movie_df['title'])

In [None]:
# Recommend movies similar to 'Interstellar' based on keywords and genres
# (similarity computed from the Key_Gen column)
movie_recommend('Interstellar', cosSim_2)

In [None]:
# Token-count matrix of the Cast_Director column (director and top 4 actors of each movie)
count_vector = CountVectorizer(stop_words='english')
count_vector_matrix = count_vector.fit_transform(New_movie_df['Cast_Director'])

# Cosine similarity computed from the count matrix
cosSim_3 = cosine_similarity(count_vector_matrix, count_vector_matrix)

# Renumber the rows 0..n-1 and rebuild the title -> row lookup.
# drop=True discards the old index instead of inserting it as an extra
# column; without it repeated resets keep adding columns and eventually fail
# because the generated column name already exists.
New_movie_df = New_movie_df.reset_index(drop=True)
index_series = pd.Series(New_movie_df.index, index=New_movie_df['title'])

In [None]:
# Recommend movies similar to 'Interstellar' based on cast (top 4 actors)
# and director (similarity computed from the Cast_Director column)
movie_recommend('Interstellar', cosSim_3)

In [None]:
# Token-count matrix of the Combined column
# (keywords, genres, director and top 4 actors of each movie)
count_vector_1 = CountVectorizer(stop_words='english')
count_vector_matrix_1 = count_vector_1.fit_transform(New_movie_df['Combined'])

# Cosine similarity of every count vector with every other one
# (with a single argument the matrix is compared against itself)
cosSim_4 = cosine_similarity(count_vector_matrix_1)

In [None]:
# Recommend movies similar to 'Interstellar' based on the Combined column
# (keywords, genres, top 4 actors and director of the movie)
movie_recommend('Interstellar', cosSim_4)

# Collaborative Filtering System

In [None]:
# Load only the id, title and genres columns of the movie file and expose
# the id column under the name movieId
movies_df = pd.read_csv(
    'tmdb-movie-metadata/tmdb_5000_movies.csv',
    usecols=['id', 'title', 'genres'],
).rename(columns={"id": "movieId"})
movies_df.head()

In [None]:
# Parse each stringified genres value and keep only the genre names,
# so every row ends up holding a plain list of names
movies_df['genres'] = movies_df['genres'].apply(
    lambda raw: [genre['name'] for genre in literal_eval(raw)])

In [None]:
# Read the ratings file; later cells use its userId, movieId and rating columns
ratings_df = pd.read_csv('the-movies-dataset/ratings_small.csv')
ratings_df.head()

In [None]:
# Ratings table with one row per userId and one column per movieId;
# user/movie combinations without a rating are filled with 0
rating_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
rating_matrix = rating_matrix.fillna(0)
rating_matrix.head()

In [None]:
# Convert the ratings table to a plain NumPy array.
# DataFrame.as_matrix() no longer exists in current pandas releases;
# .values returns the same array and is available in old and new versions alike.
rating_matrix = rating_matrix.values
rating_matrix.shape

In [None]:
# Centre each user's ratings: subtract the mean of that user's row.
# The mean is kept as a column vector so it can be added back to the
# predictions later on.
ratings_mean_each_user = np.mean(rating_matrix, axis=1).reshape(-1, 1)
rating_matrix = rating_matrix - ratings_mean_each_user
rating_matrix

In [None]:
# importing svds to perform manual svds (for matrix factorization)
from scipy.sparse.linalg import svds
# Factorize the centred rating matrix, computing k = 50 singular values/vectors
U, sigma, Vt = svds(rating_matrix , k = 50)
# Turn the singular values into a diagonal matrix so that
# U, sigma and Vt can be multiplied back together
sigma = np.diag(sigma)

In [None]:
# Multiply U, sigma and Vt back together and add each user's mean again;
# the result is an array of predicted ratings for all users
low_rank_ratings = U.dot(sigma).dot(Vt)
predicted_ratingsSVD = low_rank_ratings + ratings_mean_each_user.reshape(-1, 1)

In [None]:
# Temporary userId x movieId pivot of the ratings (missing ratings filled with 0);
# only its movieId column labels are needed, to name the prediction columns
temp_df = ratings_df.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)

In [None]:
# Wrap the array of predicted ratings in a DataFrame whose columns are the movieIds.
# No index is passed, so the rows are numbered 0..n-1 rather than labelled by userId.
predicted_ratingsSVD_DF = pd.DataFrame(predicted_ratingsSVD, columns = temp_df.columns)
predicted_ratingsSVD_DF.head()

In [None]:
# Predicted ratings for user 10, sorted in descending order
userid = 10
# predicted_ratingsSVD_DF is numbered 0..n-1 and its rows follow the user
# order of the pivoted ratings table, so look the row position up in
# temp_df's userId index.  `userid - 1` is only correct when the user ids
# are exactly 1..n without gaps; get_loc also raises KeyError for an
# unknown user instead of silently returning another user's row.
user_row = temp_df.index.get_loc(userid)
predicted_ratingsUser10 = predicted_ratingsSVD_DF.iloc[user_row].sort_values(ascending=False)
predicted_ratingsUser10

In [None]:
# All rows of the ratings dataframe that belong to user 10
user_data = ratings_df[ratings_df['userId'] == 10]

In [None]:
# Movies rated by user 10, enriched with title and genres, best rating first.
# The left merge keeps every rating; ratings without a match in movies_df
# get missing title/genres values.
# NOTE(review): movies_df.movieId is the 'id' column of the TMDB file, while
# ratings_df.movieId comes from a different dataset - confirm that both use
# the same id scheme, otherwise this merge pairs unrelated movies.
movies_rated_df = user_data.merge(movies_df, how='left', on='movieId')
movies_rated_df = movies_rated_df.sort_values(['rating'], ascending=False)

In [None]:
# Drop the rated movies that have missing values, i.e. those for which the
# merge found no title/genres in movies_df
movies_rated_df = movies_rated_df.dropna()

# Top 10 movies already rated by user 10 (the frame is sorted by rating)
movies_rated_df.head(10)

In [None]:
# Recommend movies to the user with userid = 10

# Movies that are not among the user's already-rated movies
unrated_movies_df = movies_df[~movies_df['movieId'].isin(movies_rated_df['movieId'])]

# Predicted ratings as a two-column frame (movieId, PredictedRatingNormalized).
# Naming the series directly avoids renaming a column called 9 (userid - 1):
# that hard-coded key only matches user 10, and for any other user the column
# would keep its numeric name and the later sort would fail.
predicted_ratings_df = predicted_ratingsUser10.rename('PredictedRatingNormalized').reset_index()

# Keep only the unrated movies for which a prediction exists
collaborative_recommendations = unrated_movies_df.merge(predicted_ratings_df, on='movieId')

In [None]:
# Order the recommendations for user 10 by predicted rating, highest first.
# The result is only assigned here; nothing is displayed by this cell.
collaborative_recommendations = collaborative_recommendations.sort_values('PredictedRatingNormalized', ascending=False)

In [None]:
# Importing Reader, Dataset, SVD and for evaluation cross_validate
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
# Create a Reader with its default settings
# NOTE(review): no rating scale is passed, so the library default applies -
# confirm that it covers the range of values in ratings_df['rating'].
reader = Reader()

# Build a surprise Dataset from the userId, movieId and rating columns
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [None]:
# Instantiate the SVD algorithm of the surprise package with its default parameters
svd = SVD()

In [None]:
# Build a training set from the whole dataset (no held-out part) and fit the model on it
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# First few ratings given by user 10, for comparison with the prediction below
ratings_df[ratings_df['userId'] == 10].head()

In [None]:
# Predict the rating of the user with userid = 10 for the movie with movieId = 1994
svd.predict(10, 1994)

In [None]:
# Evaluate the SVD algorithm with 5-fold cross-validation, reporting the RMSE;
# verbose=True prints the per-fold results
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)