## Content Based Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df2=pd.read_csv('tmdb_5000_movies.csv')
df1=pd.read_csv('tmdb_5000_credits 2.csv')
df1.columns = ['id','title','cast','crew']
df2=df2.merge(df1,on='id')


In [5]:
df2['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [None]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
#Replace NaN with an empty string

In [13]:
df2['overview'] = df2['overview'].fillna('')
df2['overview'].head(10)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
5    The seemingly invincible Spider-Man goes up ag...
6    When the kingdom's most wanted-and most charmi...
7    When Tony Stark tries to jumpstart a dormant p...
8    As Harry begins his sixth year at Hogwarts, he...
9    Fearing the actions of a god-like Super Hero l...
Name: overview, dtype: object

In [14]:
#Construct the required TF-IDF matrix by fitting and transforming the data

In [17]:
tfidf_matrix = tfidf.fit_transform(df2['overview'])
#Output the shape of tfidf_matrix
tfidf_matrix.shape
# We see that over 20,000 different words were used to describe the 4800 movies in our dataset

(4803, 20978)

In [89]:
from sklearn.metrics.pairwise import linear_kernel
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(cosine_sim[5])

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.02160533 0.         0.        ]
 [0.         0.         1.         ... 0.01488159 0.         0.        ]
 ...
 [0.         0.02160533 0.01488159 ... 1.         0.01609091 0.00701914]
 [0.         0.         0.         ... 0.01609091 1.         0.01171696]
 [0.         0.         0.         ... 0.00701914 0.01171696 1.        ]]
~~~~~~~~~~~~~~~~~~~~~~~~~~~
[0.03035254 0.         0.         ... 0.01828946 0.00561067 0.00568968]


In [96]:
# Get the pairwsie similarity scores of all movies with that movie
sim_scores = list(enumerate(cosine_sim[5]))
sim_scores[:11]

[(0, 0.030352543844312897),
 (1, 0.0),
 (2, 0.0),
 (3, 0.005144601815810792),
 (4, 0.0),
 (5, 0.9999999999999998),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.006208521095377078),
 (10, 0.016411752261489786)]

In [33]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df2.index,index=df2['title_y']).drop_duplicates()
indices.head(10)

title_y
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
Spider-Man 3                                5
Tangled                                     6
Avengers: Age of Ultron                     7
Harry Potter and the Half-Blood Prince      8
Batman v Superman: Dawn of Justice          9
dtype: int64

In [35]:
indices['Tangled']

6

In [81]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    print(sim_scores[1:11])
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    print(sim_scores[1:11])

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

    # Return the top 10 most similar movies
    return df2['title_y'].iloc[movie_indices]

In [83]:
get_recommendations('The Dark Knight Rises')

[(1, 0.0), (2, 0.0), (3, 0.9999999999999994), (4, 0.010433403719159351), (5, 0.005144601815810792), (6, 0.012600632435462458), (7, 0.02695427057891266), (8, 0.0206522168853895), (9, 0.13374009066555226), (10, 0.0)]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[(65, 0.3015117659166547), (299, 0.2985704525539681), (428, 0.2878505467001693), (1359, 0.26446092382799496), (3854, 0.1854500300656145), (119, 0.167996261998507), (2507, 0.1668289104335827), (9, 0.13374009066555226), (1181, 0.1321970213847681), (210, 0.13045537014449815)]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[65, 299, 428, 1359, 3854, 119, 2507, 9, 1181, 210]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title_y, dtype: object