## Genre-based Recommender System

1. Find how many different genres are present.
2. Build a dictionary of genres (keys = genre names, values = the movies that belong to each genre).
3. Sort the movies by their IMDB weighted-rating score.

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval  # evaluate strings containing Python code in the current Python environment
from nltk.stem.snowball import SnowballStemmer # Removing stem words
from sklearn.feature_extraction.text import CountVectorizer  # To convert text to numerical data
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
import seaborn as sns
import networkx as nx

import warnings  # disable python warnings
warnings.filterwarnings("ignore")

In [3]:
# Read every CSV used by the notebook from the local "dataset" folder

# Movie metadata; low_memory=False makes pandas read the file in one pass
movies_data = pd.read_csv("dataset/movies_metadata.csv", low_memory=False)

# Per-movie side tables
keywords = pd.read_csv('dataset/keywords.csv')
credits = pd.read_csv('dataset/credits.csv')

# User ratings and the id link table
ratings = pd.read_csv("dataset/ratings_small.csv")
links_small = pd.read_csv('dataset/links_small.csv')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
movies_data.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [5]:
# Count the missing values per column in each loaded dataframe

# Author's note: of movies_data only genres, id, vote_average and vote_count are used
for frame in (movies_data, links_small, ratings, keywords, credits):
    print(frame.isnull().sum(), '\n')

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64 

movieId     0
imdbId      0
tmdbId     13
dtype: int64 

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64 

id          0
keywords    0
dtype: int64 

cast    0
crew    0
id      0
dtype: int64 



In [6]:
# Discard the movies_data rows that lack a vote_average or a vote_count

required_columns = ['vote_average', 'vote_count']
movies_data = movies_data.dropna(subset=required_columns)
print(movies_data.isnull().sum(), '\n')

adult                        0
belongs_to_collection    40970
budget                       0
genres                       0
homepage                 37682
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   0
poster_path                383
production_companies         0
production_countries         0
release_date                84
revenue                      0
runtime                    257
spoken_languages             0
status                      81
tagline                  25048
title                        0
video                        0
vote_average                 0
vote_count                   0
dtype: int64 



In [8]:
# Weighted rating
def weighted_rating(v, R, min_votes=None, mean_vote=None):

    '''

    Calculate the weighted rating of a movie using the IMDB formula

        score = (v / (v + m)) * R + (m / (m + v)) * C

    Only arithmetic operators are used, so v and R may be scalars or
    whole pandas Series (the result is then computed element-wise).

    Parameters: v (int): vote count
                R (float): vote average
                min_votes (float, optional): the threshold m of the formula;
                    when omitted, the notebook-level variable m is used
                mean_vote (float, optional): the mean vote C of the formula;
                    when omitted, the notebook-level variable C is used
    Returns: (float) IMDB score

    '''
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing two-argument calls keep working unchanged.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C
    return ((v/(v+min_votes)) * R) + ((min_votes/(min_votes+v)) * mean_vote)



C = movies_data['vote_average'].mean()         # mean vote across all data
m = movies_data['vote_count'].quantile(0.95)   # vote-count threshold: the 95th percentile of vote_count

# Keep only the movies whose vote count is at least m
top_movies = movies_data.loc[movies_data['vote_count'] >= m].copy()
# NOTE: reset_index() without drop=True keeps the old row labels in an 'index' column
top_movies = top_movies.reset_index()

# Compute the score for all rows at once. weighted_rating only uses arithmetic,
# so it works element-wise on the two columns. This replaces the per-row chained
# assignment (top_movies['score'][i] = ...), which is not guaranteed to write
# into the frame and left the column with object dtype.
top_movies['score'] = weighted_rating(top_movies['vote_count'], top_movies['vote_average'])

# Sort movies in descending order of score and renumber the rows 0..n-1
top_movies = top_movies.sort_values('score', ascending=False)
# NOTE: 'index' already exists, so this second reset_index() adds a 'level_0' column
top_movies = top_movies.reset_index()

# Top 20 movies by weighted score
t1 = top_movies[['title', 'score']].head(20)

print(t1)

                                            title     score
0                        The Shawshank Redemption  8.357746
1                                   The Godfather  8.306334
2                                 The Dark Knight  8.208376
3                                      Fight Club  8.184899
4                                    Pulp Fiction  8.172155
5                                    Forrest Gump  8.069421
6                                Schindler's List  8.061007
7                                        Whiplash  8.058025
8                                   Spirited Away  8.035598
9                         The Empire Strikes Back  8.025793
10                                      Inception  8.025763
11                              Life Is Beautiful  8.014521
12                               The Intouchables  8.008265
13                                   Interstellar  8.007315
14                         The Godfather: Part II  7.997846
15  The Lord of the Rings: The Return of

In [9]:
# Simple recommender based on genres

def _parse_genres(raw):
    '''Return the list of genre dicts, parsing the stringified form only when needed.'''
    # Re-running this cell must not feed an already-parsed list to literal_eval
    return literal_eval(raw) if isinstance(raw, str) else raw


# Converting the stringified genre lists into real lists of dicts
top_movies['genres'] = top_movies['genres'].apply(_parse_genres)

# One pass over the movies: for every genre name, collect the row positions of
# the movies carrying it, in top_movies order.
genre_rows = defaultdict(list)
for pos, movie_genres in enumerate(top_movies['genres']):
    for genre in movie_genres:
        genre_rows[genre['name']].append(pos)

# Exhaustive set of genre names found in top_movies
genres = set(genre_rows)

# Map of genre name -> dataframe of the movies in that genre. Each frame is cut
# out of top_movies in a single .iloc call, which keeps the row order and index
# labels and avoids DataFrame.append (removed in pandas 2.0, quadratic in a loop).
genres_based = {name: top_movies.iloc[rows] for name, rows in genre_rows.items()}

In [10]:
# Bar chart of how many movies are held for each genre

# Number of rows stored per genre, following the iteration order of `genres`
cnt = [len(genres_based[genre_name]) for genre_name in genres]

# Two-column dataframe pairing every genre with its movie count
genre_cnt = pd.DataFrame(
    {'genres': list(genres), 'count': cnt},
    columns=['genres', 'count'],
)

fig = px.bar(genre_cnt, x='genres', y='count')
fig.show()

In [17]:
def genres_based_rcmnd(name):

    '''

    Look up the movies stored for a genre and return the first 10 of them.

    The rows come back in the order they have in genres_based[name]; no
    sorting is done here.

    Parameters: name (string): Name of the genre

    Returns: (Dataframe) title, vote_count, vote_average and score of the
             first 10 movies, or None when the genre is unknown

    '''

    if name in genres:
        shown_columns = ['title', 'vote_count', 'vote_average', 'score']
        return genres_based[name][shown_columns].head(10)
    return None

# Show the recommendations for two sample genres
comedy_picks = genres_based_rcmnd("Comedy")
print("Movie Recommendation for Comedy")
print(comedy_picks, "\n")

scifi_picks = genres_based_rcmnd("Science Fiction")
print("Movie Recommendation for Science Fiction")
print(scifi_picks)

Movie Recommendation for Comedy
                          title  vote_count  vote_average     score
5                  Forrest Gump      8147.0           8.2  8.069421
11            Life Is Beautiful      3643.0           8.3  8.014521
12             The Intouchables      5410.0           8.2  8.008265
30           Back to the Future      6239.0           8.0  7.845092
35     The Grand Budapest Hotel      4644.0           8.0  7.796436
41      The Wolf of Wall Street      6768.0           7.9  7.762497
42                   Inside Out      6737.0           7.9  7.761902
52  Dilwale Dulhania Le Jayenge       661.0           9.1  7.720002
57                   La La Land      4745.0           7.9  7.708786
62                           Up      7048.0           7.8  7.673443 

Movie Recommendation for Science Fiction
                      title  vote_count  vote_average     score
9   The Empire Strikes Back      5998.0           8.2  8.025793
10                Inception     14075.0          