In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import flask

In [2]:
pwd

'c:\\Users\\Vivek.Sasikumar\\Downloads\\github_trial'

In [None]:
# Load the movie metadata table.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of movies_metadata.csv before running.
# `parse_dates=True` only asks pandas to parse the index (no index column is read
# here) and `infer_datetime_format` is deprecated in pandas 2.x, so both are dropped.
metadata = pd.read_csv('/Users/viveksasikumar/Downloads/movies_metadata.csv',
                       low_memory=False)

In [3]:
# Preview the first rows of the metadata table (needs the load cell to have run first).
metadata.head()

NameError: name 'metadata' is not defined

In [None]:
# Preview only the title, popularity, revenue and voting columns.
metadata[['original_title','popularity','revenue','vote_average', 'vote_count']].head()

In [None]:
# List every column available in the metadata table.
metadata.columns

In [None]:
# Visualise missing values: one heatmap cell per (row, column), coloured by isnull().
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(metadata.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)

In [None]:
# Treat missing ratings as 0. The result is assigned back instead of calling
# fillna(inplace=True) on the selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update `metadata` (recent pandas
# warns about it, and copy-on-write ignores it).
metadata['vote_average'] = metadata['vote_average'].fillna(value=0)

In [None]:
# Distribution of vote_average (histogram with a KDE overlay).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the
# installed version before re-running.
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(metadata['vote_average'], kde=True, color='orange', ax=ax)

In [None]:
# Number of movies whose vote_average is exactly 0.
metadata.loc[metadata['vote_average'] == 0, 'vote_average'].count()

The count above shows that 3,004 of the 45,466 movies have a vote_average of 0, i.e. they have no ratings or the rating was left empty (missing values were filled with 0 earlier).

In [None]:
# vote_average vs vote_count with a regression fit.
# jointplot is a figure-level function that builds its own figure, so a preceding
# plt.figure() call only leaves an empty figure in the output; the intended size
# is passed through `height` instead.
sns.jointplot(x='vote_average', y='vote_count', data=metadata, color='green', kind='reg',
              dropna=True, height=10)

1. Most movies have very few votes (a low vote_count), whatever their vote_average.
2. Also, the number of votes increases by a huge margin for a vote_average between roughly 4 and 8 and then drops sharply. This suggests that people are more likely to vote for movies that they enjoyed.

In [None]:
# Coerce 'popularity' to a numeric dtype; entries that cannot be parsed become NaN.
# Series.convert_objects() has been removed from pandas - pd.to_numeric with
# errors='coerce' is the supported equivalent of convert_numeric=True.
metadata['popularity'] = pd.to_numeric(metadata['popularity'], errors='coerce')
metadata.info()

In [None]:
# Treat missing popularity as 0. The result is assigned back because
# fillna(inplace=True) on a selected column is not guaranteed to update `metadata`.
metadata['popularity'] = metadata['popularity'].fillna(value=0)

# vote_average vs popularity. jointplot creates its own figure, so the size is
# passed via `height` rather than a separate plt.figure() call (which would only
# leave an empty figure in the output).
sns.jointplot(x='vote_average', y='popularity', data=metadata, color='green', kind='scatter',
              dropna=True, height=10)

Popularity seems to be a function of vote_average and vote_count. 

In [None]:
# Pairwise correlations between the numeric columns.
# The frame also holds text columns; corr() on current pandas no longer drops them
# silently, so the numeric columns are selected explicitly.
sns.heatmap(metadata.select_dtypes(include=[np.number]).corr(), annot=True, cmap='coolwarm')

Vote_average has no significant correlation with any other numeric indicator. It would therefore not make sense to build a predictive machine learning model on these features to develop a recommender system.

In [None]:
# vote_average vs revenue.
# jointplot builds its own figure, so the size goes through `height`; a separate
# plt.figure() call would only leave an empty figure in the output.
sns.jointplot(x='vote_average', y='revenue', data=metadata, color='green', kind='scatter',
              dropna=True, height=10)

In [None]:
# Per-column non-null counts for the rows that have no vote_count.
metadata.loc[metadata['vote_count'].isnull()].count()

# Simple Recommender

In [None]:
# m: vote count at the 95th percentile - the threshold a movie must reach to qualify.
# C: mean vote_average over the whole table.
m = metadata['vote_count'].quantile(0.95)
C = metadata['vote_average'].mean()
print(C, m, sep='   ')

In [None]:
# Keep only the movies with at least `m` votes, as an independent copy so that
# later column assignments do not touch `metadata`.
has_enough_votes = metadata['vote_count'] >= m
qual_movies = metadata.loc[has_enough_votes].copy()
qual_movies.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with the overall mean, weighted by its vote count.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is ``x['vote_count']``
    and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like
        Object indexable by ``'vote_count'`` and ``'vote_average'``.
    m : number
        Vote-count weight given to the prior ``C`` (defaults to the module-level ``m``).
    C : number
        Prior rating (defaults to the module-level ``C``).
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_share = votes / (votes + m)
    prior_share = m / (m + votes)
    return (own_share * rating) + (prior_share * C)

In [None]:
# weighted_rating only indexes 'vote_count' and 'vote_average' and does arithmetic on
# them, so it can be called once on the whole frame instead of once per row via
# apply(axis=1), which builds a Series for every row.
qual_movies['score'] = weighted_rating(qual_movies)

In [None]:
# Rank the qualifying movies from highest to lowest weighted score.
qual_movies = qual_movies.sort_values(by='score', ascending=False)

In [None]:
# Show the 100 best-scoring movies with their vote statistics.
qual_movies[['title','vote_count','vote_average','score']].head(100)

In [None]:
# Narrow view of the ranking: title plus the three scoring columns.
score_columns = ['title', 'vote_count', 'vote_average', 'score']
qual_movie_score = qual_movies[score_columns]

In [None]:
# Title -> row label of the score-sorted frame, in ranking order.
# drop_duplicates() compares the values (row labels), not the titles, so repeated
# titles stay in the index.
ranked_labels_by_title = pd.Series(qual_movies.index, index=qual_movie_score['title'])
indices2 = ranked_labels_by_title.drop_duplicates()

In [None]:
def weighted_score_model(title):
    """Return up to ten titles ranked directly below ``title`` in the score ranking.

    Relies on the module-level ``indices2`` (titles in ranking order) and
    ``qual_movies`` (the score-sorted frame).

    Parameters
    ----------
    title : str
        Title to look up among the qualifying movies.

    Returns
    -------
    pandas.Series
        Titles of the next (at most ten) movies in the ranking.

    Raises
    ------
    KeyError
        If ``title`` is not among the qualifying movies.
    """
    # Look the title up by position. When several movies share the title, the
    # best-ranked one is used (get_loc would return a slice or mask in that case,
    # which cannot be used in the arithmetic below).
    positions = np.flatnonzero(indices2.index == title)
    if positions.size == 0:
        raise KeyError(title)
    movie_index = int(positions[0])

    # A positional slice is clamped at the end of the ranking, so a movie close to
    # the bottom yields fewer than ten rows instead of an IndexError.
    return qual_movies['title'].iloc[movie_index + 1:movie_index + 11]
    

In [None]:
# Example: the movies ranked just below 'The Dark Knight' by weighted score.
weighted_score_model('The Dark Knight')

# Content Based Recommender

### Based on Overview text data

In [None]:
# Peek at the free-text plot overviews used by the content-based model.
metadata['overview'].head()

In [None]:
# Remove the row labelled 28700.
# NOTE(review): the notebook does not record why this row is excluded - confirm.
# errors='ignore' keeps the cell re-runnable: once the row is gone, a plain drop()
# would raise KeyError on the second execution.
metadata = metadata.drop([28700], errors='ignore')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer #Term Frequency-Inverse Document Frequency (TF-IDF) vectors 

# Missing overviews become empty strings so the vectorizer gets text for every row.
metadata['overview'] = metadata['overview'].fillna('')

# One TF-IDF row per movie, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
tfidf_matrix.shape

In [None]:
# TfidfVectorizer L2-normalises every row by default, so the plain dot product of
# two rows already equals their cosine similarity.
# linear_kernel() computes exactly that dot product, so it is used here instead of cosine_similarity().

from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Pairwise similarity of every overview with every other overview.
# With a single argument linear_kernel compares the matrix with itself.
# NOTE(review): the result is a dense square matrix with one row per movie - check
# memory use on the full dataset.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Title -> row label, in the same row order as tfidf_matrix / cosine_sim.
# drop_duplicates() compares the values (row labels), not the titles, so repeated
# titles stay in the index.
labels_by_title = pd.Series(metadata.index, index=metadata['title'])
indices = labels_by_title.drop_duplicates()

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def movie_overview_model(title, cosine_sim=cosine_sim):
    """Return the ten movies whose overview is most similar to that of ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Assumed to have its rows and columns in the
        same order as ``indices`` (both are built from the same frame).

    Returns
    -------
    pandas.Series
        Titles of the (at most ten) most similar movies, indexed by the row
        labels stored in ``indices``.

    Raises
    ------
    KeyError
        If ``title`` is not present.
    """
    # The matrix is positional, so the title is resolved to a *position*. The
    # values of `indices` are row labels, which stop matching positions once a row
    # has been dropped from the frame. For a repeated title the first match is used.
    matches = np.flatnonzero(indices.index == title)
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Similarity of every movie with the requested one
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Positions ordered from most to least similar; the stable sort keeps the
    # original order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the movie itself explicitly rather than assuming it sorts first
    # (it does not when its overview is empty and every score is 0).
    top = ranked[ranked != idx][:10]

    # Titles come from `indices`, not from the global `metadata`, which is
    # reassigned later in the notebook and then no longer lines up with the matrix.
    return pd.Series(indices.index.values[top], index=indices.values[top], name='title')

In [None]:
# Example: movies whose overview text is closest to that of 'The Dark Knight'.
movie_overview_model('The Dark Knight',cosine_sim)

## Based on Credits, Keywords

In [None]:
# Load the cast/crew and keyword tables.
# NOTE(review): absolute, machine-specific paths - adjust to your data location.
credits = pd.read_csv('/Users/viveksasikumar/Downloads/credits.csv')
keywords = pd.read_csv('/Users/viveksasikumar/Downloads/keywords.csv')

# Remove three rows by label before the 'id' column is cast to int.
# NOTE(review): presumably these rows carry malformed ids - confirm.
# errors='ignore' keeps the cell re-runnable once the rows are already gone.
metadata = metadata.drop([19730, 29503, 35587], errors='ignore')

In [None]:
# Give the join key the same integer dtype in all three tables.
for frame in (keywords, credits, metadata):
    frame['id'] = frame['id'].astype('int')

In [None]:
# Attach credits and then keywords to the metadata, matching on the movie id.
metadata = (
    metadata
    .merge(credits, on='id')
    .merge(keywords, on='id')
)

In [None]:
# Check the merged table: the first two rows should now carry the credits and keyword columns.
metadata.head(2)

In [None]:
# The list-like columns are stored as text; literal_eval turns each cell back into
# the Python object it represents.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
parsed_columns = {name: metadata[name].apply(literal_eval) for name in features}
for name, parsed in parsed_columns.items():
    metadata[name] = parsed

In [None]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries, each expected to carry ``'job'`` and ``'name'`` keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list, holds no
        director, or the matching entry has no name.
    """
    # Missing or malformed crew data (e.g. NaN) is treated as "no director", the
    # same way get_list treats non-list input.
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # .get() tolerates entries that lack a 'job' key instead of raising KeyError.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [None]:
# Returns the names of at most the first 3 elements of a list.
def get_list(x):
    """Collect the ``'name'`` of every entry in ``x`` and keep the first three.

    Anything that is not a list (missing or malformed data) yields an empty list.
    """
    if not isinstance(x, list):
        return []

    # Gather every name first, then truncate; slicing a shorter list returns it whole.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [None]:
# Reduce the parsed columns to compact features: one director name per movie and
# up to three names each for cast, keywords and genres.
features = ['cast', 'keywords', 'genres']

metadata['director'] = metadata['crew'].apply(get_director)
for column in features:
    metadata[column] = metadata[column].apply(get_list)

In [None]:
# Inspect the extracted features for the first three movies.
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

In [None]:
# Lower-case names and remove the spaces inside them
def clean_data(x):
    """Normalise a name, or each name in a list, to lower case without spaces.

    A list is cleaned element by element, a string is cleaned directly, and any
    other value (e.g. a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [str.lower(item.replace(" ", "")) for item in x]
    if isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    # Neither a list nor a string: nothing usable to clean
    return ''

In [None]:
# Normalise every text feature so that e.g. "Tom Hanks" becomes the single token "tomhanks".
features = ['cast', 'keywords', 'director', 'genres']

for column in features:
    cleaned = metadata[column].apply(clean_data)
    metadata[column] = cleaned
    


In [None]:
# Check the cleaned features.
metadata[['cast', 'keywords', 'director', 'genres']].head()

In [None]:
def create_soup(x):
    """Join a movie's keywords, cast, director and genres into one space-separated string.

    ``x`` must provide list-like ``'keywords'``, ``'cast'`` and ``'genres'``
    entries and a string ``'director'`` entry.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

In [None]:
# Build the combined text feature row by row, then print a sample of it.
soup_column = metadata.apply(create_soup, axis=1)
metadata['soup'] = soup_column
print(metadata['soup'].head(10))

In [None]:
# Widen pandas' display limits so the list-valued columns are not truncated.
for option_name in ('display.width', 'display.max_colwidth'):
    pd.set_option(option_name, 10000)

metadata[['genres','cast','director']].head(10)

In [None]:
# Import CountVectorizer and create the count matrix:
# one row per movie, one column per token found in the 'soup' text
# (English stop words are excluded).
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [None]:
# Pairwise cosine similarity between the token-count rows of all movies.
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared with itself.
cosine_sim2 = cosine_similarity(count_matrix)

In [None]:
# Title -> row label of the merged frame, in the same row order as count_matrix.
# drop_duplicates() compares the values (row labels), not the titles, so repeated
# titles stay in the index.
merged_labels_by_title = pd.Series(metadata.index, index=metadata['title'])
indices1 = merged_labels_by_title.drop_duplicates()

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def movie_keywordcredits_model(title, cosine_sim=cosine_sim2):
    """Return the ten movies most similar to ``title`` by keywords, cast, director and genres.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices1`` series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Assumed to have its rows and columns in the
        same order as ``indices1`` (both are built from the same frame).

    Returns
    -------
    pandas.Series
        Titles of the (at most ten) most similar movies, indexed by the row
        labels stored in ``indices1``.

    Raises
    ------
    KeyError
        If ``title`` is not present.
    """
    # Resolve the title to a matrix position; for a repeated title the first
    # match is used (a label lookup would return several rows in that case).
    matches = np.flatnonzero(indices1.index == title)
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Use the matrix that was passed in - the previous body always read the
    # global cosine_sim2, silently ignoring the argument.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Positions ordered from most to least similar; the stable sort keeps the
    # original order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the movie itself explicitly rather than assuming it sorts first.
    top = ranked[ranked != idx][:10]

    # Return the top 10 most similar movies
    return pd.Series(indices1.index.values[top], index=indices1.values[top], name='title')

In [None]:
# Example: movies sharing keywords, cast, director or genres with 'The Dark Knight'.
movie_keywordcredits_model('The Dark Knight', cosine_sim2)

In [None]:
# Second example with a different movie.
movie_keywordcredits_model('Fight Club', cosine_sim2)

# Following are the 3 movie recommendation models

In [None]:
# Ask for a title once and show the recommendations of all three models in turn.
movie = input("Enter movie: ")

model_runs = (
    ('\nWeighted Score Model', lambda: weighted_score_model(movie)),
    ('\nMovie Overview Model', lambda: movie_overview_model(movie,cosine_sim)),
    ('\nMovie Keywords & Credits Model', lambda: movie_keywordcredits_model(movie, cosine_sim2)),
)

for heading, run_model in model_runs:
    print(heading)
    print(run_model())

## Which model do you think makes most sense?