In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


In [2]:
%matplotlib inline
import pickle
from ast import literal_eval
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

# Preprocessing The Dataset

In [3]:
# Load the movie metadata and turn the stringified genre records into plain lists of genre names
md = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv')
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Inputs of the weighted rating: C is the mean of the integer-truncated vote averages,
# m is the 95th-percentile vote count a movie must reach to qualify
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)

# Release year as a string; missing or unparseable dates become NaN.
# pd.notnull is required here: `x != np.nan` is True for every value (NaN never compares
# equal to anything), so missing dates would otherwise be stored as the string 'NaT'.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)

# Movies with at least m votes and a known average; .copy() so the casts below
# act on an independent frame rather than on a selection of md
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

In [4]:
# Cast/crew credits and plot keywords; both are joined to the metadata on the 'id' column
credits = pd.read_csv('../input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('../input/the-movies-dataset/keywords.csv')

# Integer tmdbId values of the small links subset (rows without a tmdbId are ignored)
links_small = pd.read_csv('../input/the-movies-dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

# NOTE(review): hard-coded row labels -- presumably the rows whose 'id' cannot be cast to int
# below; confirm against the dataset. Re-running this cell after the merges drops other rows.
md = md.drop([19730, 29503, 35587])

md['id'] = md['id'].astype('int')

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

# .copy() gives smd its own data: assigning columns to a bare selection of md triggers
# pandas' SettingWithCopyWarning (silenced by the global warnings filter in this notebook)
smd = md[md['id'].isin(links_small)].copy()

# cast, crew and keywords are stored as stringified Python literals; parse them
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].map(len)
smd['crew_size'] = smd['crew'].map(len)

# Generating Insights

## Retrieving Top Rated Movies

In [5]:
def weighted_rating(x):
    """Return the weighted rating of one movie row.

    The row's own ``vote_average`` is blended with the mean rating ``C``:
    the larger ``vote_count`` is relative to the threshold ``m``, the more
    the row's own average counts.  ``m`` and ``C`` are read from module scope.
    """
    votes, rating = x['vote_count'], x['vote_average']
    own_part = votes / (votes + m) * rating
    prior_part = m / (m + votes) * C
    return own_part + prior_part

# Score every qualifying movie, then keep the 500 highest-scoring ones, best first
qualified = (
    qualified
    .assign(wr=qualified.apply(weighted_rating, axis=1))
    .sort_values('wr', ascending=False)
    .head(500)
)

## Retrieving Genres

In [6]:
# One row per (movie, genre) pair. Series.explode expands each genre list into rows that keep
# the movie's index label (an empty list yields NaN, dropped here), replacing the row-wise
# apply(pd.Series).stack(), which built one Series per movie. Joining on the index then
# repeats a movie once per genre; movies without genres keep a single row with a NaN genre.
s = md['genres'].explode().dropna()
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

## Top 300 Rated Movies

In [7]:
def get_top_50_movies(genre, percentile=0.80, top_n=300):
    """Rank the movies of one genre by weighted rating.

    Despite its name the function returns up to ``top_n`` rows (300 by
    default); callers trim the result further, e.g. with ``.head(50)``.

    Parameters
    ----------
    genre : str
        Genre name, compared for exact equality with ``gen_md['genre']``
        (``gen_md`` is read from module scope).
    percentile : float, default 0.80
        Quantile of the genre's vote counts that a movie must reach to be ranked.
    top_n : int, default 300
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, year, vote_count, vote_average, popularity, wr``,
        sorted by ``wr`` in descending order.  A genre that matches no movie
        gives an empty frame with the same columns.
    """
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df.loc[df['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = df.loc[df['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    eligible = (df['vote_count'] >= m) & df['vote_count'].notnull() & df['vote_average'].notnull()
    # .copy() so the column assignments below do not target a selection of gen_md
    qualified = df.loc[eligible, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Same formula as the former row-wise apply, evaluated column-wise so that it
    # still yields a (zero-length) column when no movie qualifies
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(top_n)

## Genre-Wise Top Movies Generation

In [17]:
# Ask for a genre and show its 50 best-rated movies.
# The genre is matched by exact equality, so surrounding whitespace typed by the user is stripped.
genre = input('Enter the name of the genre to get the top 50 rated movies').strip()
get_top_50_movies(genre).head(50)

Enter the name of the genre to get the top 50 rated movies Science Fiction


Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15651,Inception,2010,14075,8,29.108149,7.967874
23076,Interstellar,2014,11187,8,32.213481,7.959689
256,Star Wars,1977,6778,8,42.149697,7.934026
1246,Back to the Future,1985,6239,8,25.778509,7.928458
1175,The Empire Strikes Back,1980,5998,8,19.470959,7.925653
1184,A Clockwork Orange,1971,3432,8,17.112594,7.872361
1921,Metropolis,1927,666,8,14.487867,7.438533
14723,Avatar,2009,12114,7,185.070892,6.97479
18008,The Avengers,2012,12000,7,89.887648,6.974553
23948,Guardians of the Galaxy,2014,10014,7,53.291601,6.96958


## User Preference - Content Based Movie Recommender

In [9]:
def get_recommendations_for_multiple_movies(movie_list, top_n=4):
    """Recommend movies similar to any of the given titles.

    For every title in ``movie_list`` that is found in the module-level
    ``indices`` lookup, the ``top_n`` most similar *other* movies are taken
    from the similarity matrix stored in ``CosineVal.pkl``.  Titles that are
    not found are skipped.

    Parameters
    ----------
    movie_list : iterable of str
        Titles to find neighbours for.
    top_n : int, default 4
        Number of neighbours collected per title.

    Returns
    -------
    list
        Recommended titles (looked up in module-level ``titles``) in the
        order they were collected, each listed once.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # CosineVal.pkl that was written by this notebook.
    with open('CosineVal.pkl', 'rb') as file:
        cosine_sim = pickle.load(file)

    recommended_movies = []

    for movie_title in movie_list:
        idx = indices.get(movie_title, None)
        if idx is None:
            continue

        # A title shared by several movies makes the lookup return every
        # matching position; the first one is used.
        own_idx = int(np.asarray(idx).ravel()[0])

        ranked = sorted(enumerate(cosine_sim[own_idx]), key=lambda pair: pair[1], reverse=True)
        # Exclude the query movie by position instead of assuming it sorts
        # first: a movie with an identical feature vector ties with it and
        # may come earlier, which used to return the query movie itself.
        movie_indices = [pos for pos, _ in ranked if pos != own_idx][:top_n]
        recommended_movies.extend(titles.iloc[movie_indices])

    # dict keys keep first-seen order, so this removes duplicates without reordering
    return list(dict.fromkeys(recommended_movies))

In [10]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director', or NaN if there is none."""
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

# Preparing The Content Soup

In [11]:
# Director of each movie (NaN when the crew lists none)
smd['director'] = smd['crew'].apply(get_director)

# Names of at most the first three listed cast members
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3])

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Collapse every name into one lowercase token without spaces
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# The director token is listed three times (presumably to weigh more than a single cast
# member in the count vectors). A missing director contributes no token at all: casting
# NaN to str would give every such movie the same 'nan' tokens and make them look alike.
smd['director'] = smd['director'].apply(lambda x: [str.lower(x.replace(" ", ""))] * 3 if isinstance(x, str) else [])

# Occurrence count of every keyword across all movies; only keywords seen more than once are kept.
# Series.explode yields one row per keyword (NaN for empty lists, ignored by value_counts).
s = smd['keywords'].explode()
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [12]:
def filter_keywords(x):
    """Return the entries of ``x`` that are contained in the module-level ``s``, in their original order."""
    return [word for word in x if word in s]

## Stemming, Removing Stopwords and Generating Similarity Coefficients

In [13]:
stemmer = SnowballStemmer('english')

# Keep only the keywords retained in `s`, stem them, and collapse each into one lowercase token
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# The "soup" is one space-separated string of keyword, cast, director and genre tokens per movie
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

# min_df=1 keeps every term, exactly as min_df=0 did, but is also accepted by scikit-learn
# releases that validate parameters and reject an integer min_df below 1
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

# Pairwise cosine similarity between the term-count vectors of all movies
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Give smd a fresh 0..n-1 index so that positions in cosine_sim, titles and indices line up
smd = smd.reset_index()

titles = smd['title']
# Lookup from title to row position (a duplicated title maps to several positions)
indices = pd.Series(smd.index, index=smd['title'])

# Exporting Data for External Use

In [14]:
import pickle
# Serialise the similarity matrix to CosineVal.pkl (the file is created or overwritten)
with open('CosineVal.pkl', 'wb') as pkl_out:
    pickle.dump(cosine_sim, pkl_out)

In [15]:
# Write the prepared smd frame (index included) to ShortenedMetaData.csv, relative to the working directory
smd.to_csv("ShortenedMetaData.csv")

# Result Generation

In [16]:
# Example: request five recommendations for a single title
get_recommendations_for_multiple_movies(['3 Idiots'], 5)

['Loser',
 'The Favor',
 'An Ideal Husband',
 'Educating Rita',
 'Sleeping Dogs Lie']