## Movie Recommendation System

[Priyanka Sharma](https://www.youtube.com/watch?v=ZcDaULKAnJ8)

## Import modules

In [1]:
import pandas as pd
import numpy as np

## Import dataset

In [2]:
# Load the TMDB credits table; the path is relative to the notebook's working directory.
df_credit = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
# Preview the first rows to see what the file contains.
df_credit.head()

## Get Basic Information

In [3]:
# (rows, columns) of the credits table.
df_credit.shape

In [4]:
# Column dtypes and non-null counts of the credits table.
df_credit.info()

## Load second dataset: Movies Dataset

In [5]:
# Load the TMDB movies table from the same input directory as the credits file.
df_movie = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')
# Preview the first rows to see what the file contains.
df_movie.head()

## Get the basic information

In [6]:
# (rows, columns) of the movies table.
df_movie.shape

In [7]:
# Column dtypes and non-null counts of the movies table.
df_movie.info()

There are null values in the `homepage` and `tagline` columns.

## Merge the two dataframes

In [8]:
# Relabel the credits columns positionally so the key column is called 'id'
# and can be joined against df_movie.
# NOTE(review): assumes the credits file has exactly these four columns in this order — confirm.
df_credit.columns = ['id', 'title', 'cast', 'crew']
# Join on 'id' using merge's default (inner) join. Later cells refer to
# 'title_x' / 'title_y', the suffixed names pandas assigns to a non-key column
# that exists in both frames.
df = df_movie.merge(df_credit, on='id')

In [9]:
# Preview the merged frame.
df.head()

In [10]:
# (rows, columns) of the merged frame.
df.shape

In [11]:
# Column dtypes and non-null counts of the merged frame.
df.info()

In [12]:
# C: mean of 'vote_average' over all merged movies; weighted_rating uses it
# as the value that scores of rarely-voted movies are pulled towards.
C = df['vote_average'].mean()
C

## Minimum votes to be listed

In [13]:
# m: 90th percentile of 'vote_count'; a movie needs at least this many votes
# to be kept in lists_movies.
m = df['vote_count'].quantile(0.9)
m

## Getting the list of movies to be listed

In [14]:
# Keep only the movies that reach the vote threshold m, as an independent copy
# so that columns added later do not touch df.
lists_movies = df.loc[df['vote_count'].ge(m)].copy()
lists_movies.shape

In [15]:
# Preview the movies that passed the vote-count threshold.
lists_movies.head()

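## Defining the weighted-rating function

The score follows IMDB's weighted rating, $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$, where $v$ is a movie's vote count, $R$ its average vote, $m$ the minimum number of votes required to be listed, and $C$ the mean vote over all movies.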

In [16]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for a movie.

    WR = v / (v + m) * R  +  m / (v + m) * C

    Parameters
    ----------
    x : mapping-like
        Anything indexable by 'vote_count' and 'vote_average' (a DataFrame row,
        or a whole DataFrame for an element-wise result).
    m : number
        Minimum number of votes required to be listed. Defaults to the
        notebook-level `m` at the time this function is defined.
    C : number
        Mean vote across all movies. Defaults to the notebook-level `C` at the
        time this function is defined.

    Returns
    -------
    The weighted rating, in the same shape as x['vote_count'].
    """
    v = x['vote_count']
    R = x['vote_average']
    # The movie's own average R is weighted by its share of votes, and the global
    # mean C by the remainder. The previous version dropped R, so the score did
    # not depend on how well the movie was actually rated.
    return (v / (v + m)) * R + (m / (v + m)) * C

Define a new feature `score` and calculate its value with `weighted_rating()`.

In [17]:
# weighted_rating only indexes two columns and does arithmetic on them, so it can
# take the whole frame at once instead of being applied row by row.
lists_movies['score'] = weighted_rating(lists_movies)

In [18]:
# Check that the new 'score' column is present.
lists_movies.head(3)

## Sorting the movies

In [19]:
# Sort movies by the score calculated above, best first, so that head(10)
# really returns the top of the chart (ascending order showed the lowest scores).
lists_movies = lists_movies.sort_values('score', ascending=False)

# Show the 10 highest-scoring movies
lists_movies[['title_x', 'vote_count', 'vote_average', 'score']].head(10)

## Most Popular Movies

In [20]:
import matplotlib.pyplot as plt

In [21]:
# Order all movies by the 'popularity' column, highest first, and chart the first six.
pop = df.sort_values(by='popularity', ascending=False)
top_six = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top_six['title_x'], top_six['popularity'], align='center', color='m')
# barh draws the first entry at the bottom; flip the axis so the largest bar is on top.
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")

In [22]:
# Order all movies by the 'budget' column, highest first, and chart the first six.
pop = df.sort_values(by='budget', ascending=False)
top_six = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top_six['title_x'], top_six['budget'], align='center', color='r')
# barh draws the first entry at the bottom; flip the axis so the largest bar is on top.
plt.gca().invert_yaxis()
plt.xlabel("Budget")
plt.title("High Budget Movies")

In [23]:
# Order all movies by the 'revenue' column, highest first, and chart the first six.
pop = df.sort_values(by='revenue', ascending=False)
top_six = pop.head(6)

plt.figure(figsize=(12, 4))
plt.barh(top_six['title_x'], top_six['revenue'], align='center', color='b')
# barh draws the first entry at the bottom; flip the axis so the largest bar is on top.
plt.gca().invert_yaxis()
plt.xlabel("Revenue")
plt.title("Based on Revenue Movies")

## Content-Based Recommendation System

In [24]:
# Remove the duplicate title column left over from the merge. Rebinding the name
# (instead of inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
lists_movies = lists_movies.drop(columns=['title_y'], errors='ignore')

In [25]:
# (rows, columns) after dropping 'title_y'.
lists_movies.shape

In [26]:
# Preview the frame after dropping 'title_y'.
lists_movies.head(2)

## Column overview

In [27]:
# Look at a few values of the 'overview' text column.
df['overview'].head()

Based on the plot description (`overview`), we can compute the similarity between movies.

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only ever sees text
df['overview'] = df['overview'].fillna('')

# TF-IDF vectorizer that ignores English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and turn each one into a TF-IDF row
tfidf_matrix = tfidf.fit_transform(df['overview'])

# (number of overviews, number of vocabulary terms)
tfidf_matrix.shape

In [29]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# linear_kernel returns plain dot products; these equal cosine similarities here
# because TfidfVectorizer L2-normalises every row by default and no `norm`
# override is passed when the vectorizer is created.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [30]:
# Construct a reverse map from movie title to its index label in df.
# Series.drop_duplicates() compares *values* (the index labels, which are already
# unique), so it never removed repeated titles. De-duplicate on the title index
# instead, keeping the first occurrence, so a title lookup always yields a single
# integer rather than a Series.
indices = pd.Series(df.index, index=df['title_x'])
indices = indices[~indices.index.duplicated(keep='first')]

In [31]:
# Function that takes a movie title as input and returns the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies whose overviews are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title as it appears in df['title_x'].
    cosine_sim : 2-D array
        Pairwise similarity matrix, addressed by the integers stored in `indices`.
        Defaults to the notebook-level matrix that exists when this function is
        defined.
    top_n : int
        Number of recommendations to return (default 5, the previous fixed size).

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of df['title_x'], most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first match instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])

    # Rank from most to least similar; the stable sort keeps ties in index order
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie by position rather than assuming it sorts first
    # (another movie with an identical overview can tie with it), then keep top_n.
    ranked = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar movies
    return df['title_x'].iloc[ranked]

In [32]:
# Example: movies with overviews most similar to "Avatar".
get_recommendations("Avatar")

In [38]:
# Example: movies with overviews most similar to 'Tears of the Sun'.
get_recommendations('Tears of the Sun')

In [36]:
# Example: movies with overviews most similar to 'The Matrix'.
get_recommendations('The Matrix')