In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter



### Content Based Filtering
In this section we find similar movies by comparing the features of each movie. To do this we will utilize thr Bag of Words technique and then convert each word to a vector using CountVectorization. We chose count vectorization because we only looking at the existence of a word in each movie's features.Our bag of words will include, genres, actors and director. Plot was excluded because we are not conducting sentiment analysis.

In [3]:
movies = pd.read_csv('../Data/movies_sml.csv')
movies.head()


Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,Adventure|Children|Fantasy,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston,When two kids find and play a magical board ga...,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men,Comedy|Romance,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,Comedy|Drama|Romance,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,Comedy,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer,George Banks must deal not only with the pregn...,https://m.media-amazon.com/images/M/MV5BOTEyNz...


### Movies Dataset Preprocessing
##### Removing spaces from actors and director names
The spaces are removed so that only full names are considered in the search. For e.g actor Tom Hanks becomes TomHanks. If we don't remove the space, then an occurrence of another actor with the first name tom will produce a false similarity.

In [4]:
# function to switch all words to lower and remove spaces
def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [5]:
credits = ['Actors', 'Director']

for credit in credits:
    movies[credit] = movies[credit].apply(clean_data)
    

In [7]:
# remove | from genres
movies['genres'] = movies['genres'].str.replace("|", " ")
movies.head()


Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,"tomhanks,timallen,donrickles,jimvarney",johnlasseter,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji,Adventure Children Fantasy,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston,When two kids find and play a magical board ga...,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men,Comedy Romance,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale,Comedy Drama Romance,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II,Comedy,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer,George Banks must deal not only with the pregn...,https://m.media-amazon.com/images/M/MV5BOTEyNz...


##### Combining Data 

In [8]:
def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])


In [10]:
movies['bag_of_words'] = movies.apply(create_soup, axis=1)
movies.head()

Unnamed: 0,movieId,Title,genres,Actors,Director,Plot,Poster,bag_of_words
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,"tomhanks,timallen,donrickles,jimvarney",johnlasseter,A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,tomhanks timallen donrickles jimvarney johnlas...
1,2,Jumanji,Adventure Children Fantasy,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston,When two kids find and play a magical board ga...,https://m.media-amazon.com/images/M/MV5BZTk2Zm...,robinwilliams jonathanhyde kirstendunst bradle...
2,3,Grumpier Old Men,Comedy Romance,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch,John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMjQxM2...,waltermatthau jacklemmon sophialoren ann-margr...
3,4,Waiting to Exhale,Comedy Drama Romance,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker,"Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BYzcyMD...,whitneyhouston angelabassett lorettadevine lel...
4,5,Father of the Bride Part II,Comedy,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer,George Banks must deal not only with the pregn...,https://m.media-amazon.com/images/M/MV5BOTEyNz...,stevemartin dianekeaton martinshort kimberlywi...


In [12]:
movies_bow = movies[['Title', 'movieId', 'bag_of_words']]
movies_bow.to_csv('../Data/movies_bow.csv', index=False)
movies_bow.head()

Unnamed: 0,Title,movieId,bag_of_words
0,Toy Story,1,tomhanks timallen donrickles jimvarney johnlas...
1,Jumanji,2,robinwilliams jonathanhyde kirstendunst bradle...
2,Grumpier Old Men,3,waltermatthau jacklemmon sophialoren ann-margr...
3,Waiting to Exhale,4,whitneyhouston angelabassett lorettadevine lel...
4,Father of the Bride Part II,5,stevemartin dianekeaton martinshort kimberlywi...


#### Building the recommender system 

In [13]:
# importing modules 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# initializing and generating the count matrix 
count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])


In [16]:
# generating teh cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [19]:
# creating a function to generate recommendation | mapping index to title 
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    return movies_bow['Title'].iloc[movie_indices]


In [20]:
# testing function 
get_recommendations('Rumble in the Bronx')

10841                   Kung Fu Yoga
4038                       Who Am I?
3401                     Crime Story
660                       Supercop 2
679      Around the World in 80 Days
680      Around the World in 80 Days
1127                       Mr. Magoo
1393                       Rush Hour
1565                      Black Mask
1970                   Shanghai Noon
2391                     Rush Hour 2
3983                     City Hunter
4852                     Thunderbolt
5184                     Rush Hour 3
9400                    Dragon Blade
10585                      Skiptrace
10678                Railroad Tigers
1209                    Mr. Nice Guy
2993                Shanghai Knights
4294                        Gorgeous
4536                New Police Story
6246               Shinjuku Incident
6658              Little Big Soldier
4850                    Swordsman II
0                          Toy Story
Name: Title, dtype: object