[Guide](https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243)

In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

#### Gather Data

In [2]:
# download the movie dataset and keep only the columns used to build features
df = pd.read_csv(
    'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7'
).loc[:, ['Title', 'Genre', 'Director', 'Actors', 'Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


#### Data Cleaning

In [3]:
# Build one lowercase 'bag_of_words' string per movie from its Genre, Director,
# Actors and the key words RAKE extracts from its Plot.

def _plot_keywords(plot):
    """Return the RAKE key words of one plot summary as a space-separated string.

    A non-string plot (e.g. a missing value) yields an empty string.
    """
    if not isinstance(plot, str):
        return ''

    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() maps each key word to its score; only the words are kept
    return ' '.join(r.get_word_degrees().keys())


# Assign the key words column-wise. Writing to `row` inside df.iterrows() is not
# guaranteed to reach the DataFrame (the row may be a copy), which can silently
# leave 'Key_words' empty.
df['Key_words'] = df['Plot'].apply(_plot_keywords)

# dropping the Plot column
df = df.drop(columns=['Plot'])

# columns are addressed by name so the steps below do not depend on column order
name_cols = ['Genre', 'Director', 'Actors']
feature_cols = name_cols + ['Key_words']

# a missing value would otherwise turn the whole bag of words into NaN
df[feature_cols] = df[feature_cols].fillna('')

# convert to lowercase
for col in feature_cols:
    df[col] = df[col].str.lower()

# collapse each comma-separated entry (genre, director, actor) into 1 unique
# word by removing inner spaces, then separate the entries with single spaces
for col in name_cols:
    df[col] = df[col].str.replace(' ', '', regex=False)
    df[col] = df[col].str.replace(',', ' ', regex=False)

# combining feature columns into 1 'bag_of_words' column
df['bag_of_words'] = df['Genre'] + ' ' + df['Director'] + ' ' + df['Actors'] + ' ' + df['Key_words']

# organize dataframe into 1 feature column indexed by the movie 'Title'
df = df[['Title', 'bag_of_words']].set_index('Title')

In [4]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfre...
The Godfather,crime drama francisfordcoppola marlonbrando al...
The Godfather: Part II,crime drama francisfordcoppola alpacino robert...
The Dark Knight,action crime drama christophernolan christianb...
12 Angry Men,crime drama sidneylumet martinbalsam johnfiedl...


#### Modeling

In [5]:
# turn every movie's bag of words into a row of token counts
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# pairwise cosine similarity between all movies; when only one matrix is
# given, scikit-learn compares it against itself
cosine_sim = cosine_similarity(count_matrix)

In [7]:
# label both axes of the similarity matrix with the movie titles for inspection
pd.DataFrame(data=cosine_sim, index=df.index, columns=df.index)

Title,The Shawshank Redemption,The Godfather,The Godfather: Part II,The Dark Knight,12 Angry Men,Schindler's List,The Lord of the Rings: The Return of the King,Pulp Fiction,Fight Club,The Lord of the Rings: The Fellowship of the Ring,...,Deadpool,The Wild Bunch,Aladdin,Big Fish,Patton,The Lost Weekend,Short Term 12,His Girl Friday,The Straight Story,Slumdog Millionaire
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Shawshank Redemption,1.000000,0.146385,0.131559,0.080322,0.105409,0.044721,0.091287,0.233126,0.043033,0.044721,...,0.000000,0.000000,0.00000,0.054233,0.047673,0.054233,0.046625,0.050000,0.050000,0.052705
The Godfather,0.146385,1.000000,0.342368,0.117579,0.154303,0.043644,0.044544,0.136505,0.041996,0.043644,...,0.000000,0.046524,0.00000,0.105851,0.046524,0.052926,0.045502,0.048795,0.048795,0.051434
The Godfather: Part II,0.131559,0.342368,1.000000,0.105670,0.138675,0.039223,0.040032,0.122679,0.075485,0.039223,...,0.000000,0.000000,0.00000,0.142695,0.083624,0.095130,0.040893,0.043853,0.043853,0.046225
The Dark Knight,0.080322,0.117579,0.105670,1.000000,0.084667,0.035921,0.073324,0.074901,0.069130,0.107763,...,0.037450,0.076584,0.00000,0.043561,0.038292,0.043561,0.037450,0.040161,0.040161,0.042333
12 Angry Men,0.105409,0.154303,0.138675,0.084667,1.000000,0.047140,0.048113,0.098295,0.045361,0.047140,...,0.000000,0.000000,0.00000,0.057166,0.050252,0.057166,0.049147,0.052705,0.052705,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Lost Weekend,0.054233,0.052926,0.095130,0.043561,0.057166,0.048507,0.049507,0.101144,0.093352,0.048507,...,0.000000,0.000000,0.00000,0.117647,0.051709,1.000000,0.050572,0.054233,0.054233,0.057166
Short Term 12,0.046625,0.045502,0.040893,0.037450,0.049147,0.083406,0.085126,0.043478,0.120386,0.041703,...,0.000000,0.000000,0.00000,0.050572,0.088911,0.050572,1.000000,0.046625,0.046625,0.049147
His Girl Friday,0.050000,0.048795,0.043853,0.040161,0.052705,0.044721,0.045644,0.093250,0.043033,0.044721,...,0.046625,0.000000,0.09325,0.054233,0.047673,0.054233,0.046625,1.000000,0.050000,0.052705
The Straight Story,0.050000,0.048795,0.043853,0.040161,0.052705,0.089443,0.045644,0.046625,0.043033,0.089443,...,0.000000,0.000000,0.00000,0.054233,0.095346,0.054233,0.046625,0.050000,1.000000,0.052705


In [47]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list that will be used in the function to match the indexes
indices = pd.Series(df.index)

def recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``; the first match is used.
    cosine_sim : array-like
        Square similarity matrix whose row/column positions line up with the
        positions in ``indices``. Defaults to the matrix computed above.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # getting the index of the movie that matches the title
    matches = indices[indices == title]
    if matches.empty:
        raise IndexError(f"title {title!r} not found in the movie index")
    idx = matches.index[0]

    # Similarity scores in descending order. The queried movie is dropped by
    # position rather than assumed to sort first: another movie can tie with it
    # at similarity 1.0. The stable sort keeps tied movies in dataset order.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending=False, kind='mergesort')
    )

    # getting the indexes of the most similar movies
    top_indexes = score_series.iloc[:top_n].index

    # translating the indexes back into movie titles
    return [indices[i] for i in top_indexes]

In [55]:
# manual walk-through of the lookup and ranking steps for a single title
recommended_movies = []

# row position of 'Fargo' in the title Series
test_idx = indices.loc[indices == 'Fargo'].index[0]

# its similarity to every movie, highest first
test_score_series = pd.Series(cosine_sim[test_idx])
test_score_series = test_score_series.sort_values(ascending=False)
test_score_series

128    1.000000
132    0.222375
34     0.216777
226    0.214286
125    0.206197
         ...   
204    0.000000
52     0.000000
207    0.000000
16     0.000000
127    0.000000
Length: 250, dtype: float64

#### Testing

In [49]:
# sanity check: list the movies recommended for 'Fargo'
recommendations(title='Fargo')

['No Country for Old Men',
 'The Departed',
 'Rope',
 'The Big Lebowski',
 'Reservoir Dogs',
 'The Godfather',
 'The Godfather: Part II',
 'On the Waterfront',
 'Goodfellas',
 'Arsenic and Old Lace']