In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [97]:
df_movies = pd.read_csv('archive/tmdb_5000_movies.csv')

In [96]:
df_movies.shape

(4803, 20)

In [11]:
df_movies['overview']

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bondâ€™s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4798    El Mariachi just wants to play his guitar and ...
4799    A newlywed couple's honeymoon is upended by th...
4800    "Signed, Sealed, Delivered" introduces a dedic...
4801    When ambitious New York attorney Sam is sent t...
4802    Ever since the second grade when he first saw ...
Name: overview, Length: 4803, dtype: object

Declare the stop words: words that dont count in a language

In [18]:
tfidf = TfidfVectorizer(stop_words='english')

Fill all empty columns with empty stings

In [24]:
df_movies['overview'] = df_movies['overview'].fillna('')

Build the vector space model matrix of all the words for the movies dataframe

In [28]:
tfidf_matrix = tfidf.fit_transform(df_movies['overview'])

Develop the linear kernel that will be used to calculate the cosine similarity

In [35]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

Extract the indices from the df

In [39]:
indices = pd.Series(df_movies.index, index=df_movies['original_title'])
indices

original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [41]:
indices['Newlyweds']

4799

Function to get the recommendations

In [93]:
def get_recommendations(title, cosine_sim = cosine_sim):
    index = indices[title]
    similarity_scores = enumerate(cosine_sim[index])
    similarity_scores = sorted(similarity_scores, key = lambda x:x[1], reverse=True)
    #similarity_scores = similarity_scores[1:11]
    similarity_scores_index = [i[0] for i in similarity_scores]
    print( df_movies["original_title"].iloc[similarity_scores_index] )

In [94]:
get_recommendations('The Dark Knight Rises')

3       The Dark Knight Rises
65            The Dark Knight
299            Batman Forever
428            Batman Returns
1359                   Batman
                ...          
4795                     Bang
4796                   Primer
4797                   Cavite
4798              El Mariachi
4799                Newlyweds
Name: original_title, Length: 4803, dtype: object


In [95]:
get_recommendations('Newlyweds')

4799                    Newlyweds
3969             Something Wicked
616                         Ted 2
2689           Our Family Wedding
1576                   Bride Wars
                  ...            
4797                       Cavite
4798                  El Mariachi
4800    Signed, Sealed, Delivered
4801             Shanghai Calling
4802            My Date with Drew
Name: original_title, Length: 4803, dtype: object
