# _`MODELING`_

Import the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import joblib

Load the data

In [2]:
# Load the modelling table from CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv('_src/data/model_data.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,id,title,directors,genres_list,prod_companies,vote_average
0,862,Toy Story,John Lasseter,"Animation,Comedy,Family",Pixar Animation Studios,7.7
1,8844,Jumanji,Joe Johnston,"Adventure,Fantasy,Family","TriStar Pictures,Teitler Film,Interscope Commu...",6.9
2,15602,Grumpier Old Men,Howard Deutch,"Romance,Comedy","Warner Bros.,Lancaster Gate",6.5
3,31357,Waiting to Exhale,Forest Whitaker,"Comedy,Drama,Romance",Twentieth Century Fox Film Corporation,6.1
4,11862,Father of the Bride Part II,Charles Shyer,Comedy,"Sandollar Productions,Touchstone Pictures",5.7


Our recommendation model uses cosine similarity over text features, so we first combine the features ['directors', 'genres_list', 'prod_companies'] into a single new column, `grouped_data`: missing values (`NaN`) are filled with empty strings, commas are replaced with spaces, and numerical digits are removed.

In [5]:
# Build one space-separated text field per movie from the three descriptive
# columns. Missing values are replaced by '' first, because concatenating a
# string with NaN would turn the whole result into NaN.
data['grouped_data'] = (
    data['directors'].fillna('') + ' '
    + data['genres_list'].fillna('') + ' '
    + data['prod_companies'].fillna('')
)

# Commas separate the items inside each column; turn them into spaces so every
# item becomes its own whitespace-delimited token. `regex=False` makes the
# literal match explicit instead of relying on the pandas-version default.
data['grouped_data'] = data['grouped_data'].str.replace(',', ' ', regex=False)

# Numerical values {0..9}: strip every ASCII digit in a single vectorised pass
# (equivalent to replacing '0', '1', ..., '9' one after another).
data['grouped_data'] = data['grouped_data'].str.replace(r'[0-9]', '', regex=True)


Because we have limited processing power we are going to work with a reduced dataset of the first 10000 instances

In [8]:
# Number of leading rows kept for modelling. The pairwise similarity matrix
# built from this subset has one row and one column per movie, so its size
# grows with the square of this value.
N_MOVIES = 10_000

# Reset the index so that row labels are guaranteed to be 0..len-1, i.e. equal
# to row positions; the similarity matrix is addressed by position.
reduced_data = data.head(N_MOVIES).reset_index(drop=True)

In [9]:
# Display the reduced frame (including the new grouped_data column).
reduced_data

Unnamed: 0,id,title,directors,genres_list,prod_companies,vote_average,grouped_data
0,862,Toy Story,John Lasseter,"Animation,Comedy,Family",Pixar Animation Studios,7.7,John Lasseter Animation Comedy Family Pixar An...
1,8844,Jumanji,Joe Johnston,"Adventure,Fantasy,Family","TriStar Pictures,Teitler Film,Interscope Commu...",6.9,Joe Johnston Adventure Fantasy Family TriStar ...
2,15602,Grumpier Old Men,Howard Deutch,"Romance,Comedy","Warner Bros.,Lancaster Gate",6.5,Howard Deutch Romance Comedy Warner Bros. Lanc...
3,31357,Waiting to Exhale,Forest Whitaker,"Comedy,Drama,Romance",Twentieth Century Fox Film Corporation,6.1,Forest Whitaker Comedy Drama Romance Twentieth...
4,11862,Father of the Bride Part II,Charles Shyer,Comedy,"Sandollar Productions,Touchstone Pictures",5.7,Charles Shyer Comedy Sandollar Productions Tou...
...,...,...,...,...,...,...,...
9995,18240,The Proposal,Anne Fletcher,"Comedy,Romance,Drama","Touchstone Pictures,Mandeville Films,Kurtzman/...",6.7,Anne Fletcher Comedy Romance Drama Touchstone ...
9996,28110,Dillinger,John Milius,"Action,Crime,Drama",American International Pictures (AIP),6.7,John Milius Action Crime Drama American Intern...
9997,17610,Year One,Harold Ramis,"Comedy,Adventure","Columbia Pictures,Ocean Pictures,Apatow Produc...",4.6,Harold Ramis Comedy Adventure Columbia Picture...
9998,8933,O'Horten,Bent Hamer,"Drama,Comedy",Bulbul Films,6.1,Bent Hamer Drama Comedy Bulbul Films


We set the data in grouped_data as our combined_data

In [14]:
# Text corpus fed to the vectorizer: one grouped_data string per row of reduced_data.
combined_data = reduced_data['grouped_data']

Create the classification matrix and calculate the cosine similarity matrix

In [15]:
# Token-count encoder with scikit-learn's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from combined_data and encode it: one row per entry of
# combined_data, one column per vocabulary token.
classification_matrix = vectorizer.fit_transform(combined_data)

# Called with a single argument, the rows are compared against each other,
# giving a square matrix with one row and one column per movie.
cosine_sim = cosine_similarity(classification_matrix)

# Store the cosine similarity matrix as a compressed file with joblib
joblib.dump(cosine_sim, '_src/data/cosine_sim.pkl', compress=True)


['_src/data/cosine_sim.pkl']

We make the recommendation function

In [424]:
def get_recommendation(title: str, n: int = 5):
    """Return the ``n`` movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    are expected to line up positionally with the rows of ``reduced_data``.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``reduced_data['title']``. If several
        rows share the title, the first one is used.
    n : int, default 5
        Number of recommendations to return. Values below 1 give an empty list.

    Returns
    -------
    dict
        ``{"title": title, "recommendations": [...]}`` where each
        recommendation is a ``(title, vote_average, genres_list, directors)``
        tuple, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``title`` does not appear in ``reduced_data``.
    """
    # Work with row *positions* rather than index labels, so the lookup into
    # cosine_sim stays correct even if reduced_data has a non-default index.
    matches = np.flatnonzero((reduced_data['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"title {title!r} not found in reduced_data")
    position = int(matches[0])

    scores = np.asarray(cosine_sim[position])

    # Stable descending sort: ties keep their original row order.
    ranking = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly. Skipping "the first result" is not
    # enough: another movie with an identical feature vector can tie at the
    # top and be ranked ahead of the query itself.
    ranking = ranking[ranking != position][:max(n, 0)]

    picked = reduced_data.iloc[ranking]
    top_recommendations = list(
        zip(
            picked['title'],
            picked['vote_average'],
            picked['genres_list'],
            picked['directors'],
        )
    )

    return {"title": title, "recommendations": top_recommendations}

In [432]:
# Example query: recommendations for a title present in reduced_data.
get_recommendation('Cinderella')

{'title': 'Cinderella',
 'recommendations': [('Alice in Wonderland',
   7.0,
   'Animation,Adventure,Family,Fantasy',
   'Hamilton Luske,Wilfred Jackson,Clyde Geronimi'),
  ('Peter Pan',
   7.0,
   'Animation,Music,Family,Adventure,Fantasy',
   'Clyde Geronimi,Wilfred Jackson,Hamilton Luske'),
  ('Melody Time',
   6.3,
   'Music,Family,Animation',
   'Clyde Geronimi,Jack Kinney,Wilfred Jackson,Hamilton Luske'),
  ('Lady and the Tramp',
   6.9,
   'Family,Animation',
   'Hamilton Luske,Wilfred Jackson,Clyde Geronimi'),
  ('Saludos Amigos',
   5.8,
   'Animation,Family',
   'William Roberts,Hamilton Luske,Wilfred Jackson,Jack Kinney')]}