[Guide](https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243)

In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

#### Gather Data

In [2]:
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


#### Data Cleaning

In [3]:
# initializing the new column
df['Key_words'] = ''

for index, row in df.iterrows():
    plot = row['Plot']
    
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntucation characters as well
    r = Rake()
    
    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)
    
    # getting the dictionary with key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column for the corresponding movie
    row['Key_words'] = [k for k in key_words_dict_scores.keys()]
    
# dropping the Plot column
df.drop(columns=['Plot'], inplace=True)

In [4]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","[common, decency, two, imprisoned, men, bond, ..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...","[reluctant, son, clandestine, empire, aging, p..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...","[family, crime, syndicate, vito, corleone, exp..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","[joker, emerges, menace, known, wreaks, havoc,..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","[colleagues, reconsider, prevent, miscarriage,..."


In [5]:
# convert to lowercase
cols = ['Genre', 'Director', 'Actors', 'Key_words']
for c in cols:
    df[c] = df[c].str.lower()

# combine Director's first and last name into 1 unqiue word
# df['Director'] = 

In [6]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"crime, drama",frank darabont,"tim robbins, morgan freeman, bob gunton, willi...",
1,The Godfather,"crime, drama",francis ford coppola,"marlon brando, al pacino, james caan, richard ...",
2,The Godfather: Part II,"crime, drama",francis ford coppola,"al pacino, robert duvall, diane keaton, robert...",
3,The Dark Knight,"action, crime, drama",christopher nolan,"christian bale, heath ledger, aaron eckhart, m...",
4,12 Angry Men,"crime, drama",sidney lumet,"martin balsam, john fiedler, lee j. cobb, e.g....",


In [7]:
for names in df['Director']:
    name = [name.replace(' ', '') for name in names]

In [8]:
df['Director'].str.replace(' ', '')

0                  frankdarabont
1             francisfordcoppola
2             francisfordcoppola
3               christophernolan
4                    sidneylumet
                 ...            
245                  billywilder
246          destindanielcretton
247                  howardhawks
248                   davidlynch
249    dannyboyle,loveleentandan
Name: Director, Length: 250, dtype: object