In [2]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Transformaciones preliminares

In [3]:
# Platform catalogs: one titles CSV per streaming service
df_a, df_d, df_h, df_n = (
    pd.read_csv(catalog_csv)
    for catalog_csv in (
        'amazon_prime_titles.csv',
        'disney_plus_titles.csv',
        'hulu_titles.csv',
        'netflix_titles.csv',
    )
)

# Pre-aggregated user ratings
df_rate, df_rate2 = (
    pd.read_csv('ratings/rating_global.csv'),  # mean rate by movieId and year
    pd.read_csv('ratings/recsys.csv'),  # mean rate by movieId
)

In [6]:
#Transform function: 
def transform(input_dataframe, idx_char):
    """Normalize one platform's raw titles catalog.

    The work is done on a copy, so the frame passed in is left untouched and
    the call can be repeated on the same input (previously a second run failed
    because the 'id' column had already been inserted into the caller's frame).

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        Raw catalog. The columns read here are show_id, type, title, director,
        cast, country, date_added, rating, duration, listed_in and description.
    idx_char : str
        Prefix prepended to every show_id to build the 'id' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'id' inserted at position 1, 'duration_int' and
        'duration_type' appended, missing ratings replaced by 'g',
        'date_added' parsed to datetime and the text columns lowercased.
    """
    df = input_dataframe.copy()

    # Platform-prefixed identifier, e.g. 'n' + 's1' -> 'ns1'
    df.insert(1, 'id', idx_char + df['show_id'])

    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # does not reliably write back to the frame under copy-on-write.
    df['rating'] = df['rating'].fillna('G')

    # Strip stray whitespace before parsing (original note: fixes a problem
    # seen in the Netflix file), then parse dates such as "September 25, 2021".
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), format='%B %d, %Y')

    # "90 min" -> ('90', 'min'); "2 Seasons" -> ('2', 'Seasons')
    df[['duration_int', 'duration_type']] = df['duration'].str.split(" ", expand=True)

    # Force text so the .str accessor below works (original note: fixes a
    # problem seen in the Hulu file).
    df['cast'] = df['cast'].astype('str')

    text_columns = (
        'type', 'title', 'director', 'cast', 'country',
        'rating', 'listed_in', 'description', 'duration_type',
    )
    for column in text_columns:
        df[column] = df[column].str.lower()

    return df

In [5]:
def get_resume_rating(platform):
    """Return the per-year ratings of one platform's movies.

    Reads the module-level transformed catalogs (amazon, disney, hulu,
    netflix) and the df_rate table.

    Parameters
    ----------
    platform : str
        One of 'amazon', 'disney', 'hulu', 'netflix'. Its first letter is
        used as the movieId prefix to select that platform's ratings.

    Returns
    -------
    pandas.DataFrame
        Indexed by movieId, with columns year, rating and type, restricted
        to rows whose type is 'movie'.
    """
    catalog_by_name = {'amazon':amazon, 'disney':disney, 'hulu': hulu, 'netflix':netflix}

    # Keep only the useful fields, then only this platform's ids
    ratings = df_rate[['movieId', 'year', 'rating']]
    id_prefix = platform[0]
    ratings = ratings[ratings['movieId'].str.startswith(id_prefix)]

    # Attach each title's type from the platform catalog
    catalog_types = catalog_by_name[platform][['id', 'type']].set_index('id')
    rated = ratings.set_index('movieId').join(catalog_types)

    is_movie = rated['type']=='movie'
    return rated[is_movie]

In [7]:
#Transformed datasets
# Transformed catalog per platform; the letter is the id prefix
amazon, disney, hulu, netflix = (
    transform(raw_catalog, id_prefix)
    for raw_catalog, id_prefix in ((df_a, 'a'), (df_d, 'd'), (df_h, 'h'), (df_n, 'n'))
)

# Movie ratings per platform, keyed by platform name
rates = {
    platform_name: get_resume_rating(platform_name)
    for platform_name in ('amazon', 'disney', 'hulu', 'netflix')
}
rate_a, rate_d, rate_h, rate_n = rates.values()

In [14]:
# Preview the first rows of the Netflix movie ratings
rates['netflix'].head(5)

Unnamed: 0,year,rating,type
ns1,1996,3.666667,movie
ns1,1997,3.714286,movie
ns1,1998,3.333333,movie
ns1,1999,3.590909,movie
ns1,2000,3.688889,movie


# Sistema de recomendación

In [15]:
#Dataset base 
def get_base_recsys():
    """Build the recommendation base table and its similarity matrix.

    Reads the module-level transformed catalogs (amazon, disney, hulu,
    netflix) and the df_rate2 table.

    Returns
    -------
    tuple
        (platforms, cos_sim). `platforms` has one row per distinct title, a
        fresh 0..n-1 index and the columns type, title, rating, listed_in,
        mean_scored and tags. `cos_sim` is the pairwise cosine similarity of
        the TF-IDF vectors of 'tags'; row/column i corresponds to row i of
        `platforms`.
    """
    # Union of all platforms, reduced to the fields needed for tagging
    catalog = pd.concat([amazon, disney, hulu, netflix])
    catalog = catalog[['id', 'type', 'title', 'rating', 'listed_in']]

    # Mean score per movieId, joined onto the catalog by id
    mean_scores = df_rate2[['movieId', 'rating']].rename(columns={'rating': 'mean_scored'})
    platforms = catalog.set_index('id').join(mean_scores.set_index('movieId'))

    # Rounded and stored as text so it can be used as a tag.
    # NOTE(review): ids without a match in df_rate2 presumably end up as the
    # string 'nan' here - confirm that is acceptable as a tag.
    platforms['mean_scored'] = platforms['mean_scored'].round(2).values.astype('str')

    platforms = platforms.reset_index()

    # Keep the first occurrence of every title. A boolean mask avoids passing
    # index labels to .iloc, and .copy() makes the 'tags' assignment below
    # write to an independent frame rather than to a slice.
    platforms = platforms[~platforms['title'].duplicated()].copy()

    platforms['tags'] = (
        platforms['type'] + ', ' + platforms['rating'] + ', '
        + platforms['listed_in'] + ', ' + platforms['mean_scored']
    )

    # Final base table with a clean positional index
    platforms = platforms[
        ['type', 'title', 'rating', 'listed_in', 'mean_scored', 'tags']
    ].reset_index(drop=True)

    # Keyword/frequency matrix of the tags.
    # NOTE(review): with the default TfidfVectorizer token pattern only tokens
    # of two or more word characters are kept, so a rating such as 'g' and the
    # integer part of a score such as '3.43' do not become features - confirm
    # this is intended before relying on rating/score similarity.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(platforms['tags'])

    # Pairwise similarity between all titles
    cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return platforms, cos_sim

In [16]:
# base[0] is the titles table and base[1] the cosine-similarity matrix
base=get_base_recsys() #Base for recommendation

In [37]:
def get_recommendation(title, platforms=base[0], cos_sim=base[1]):
    """Return the five titles most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up, exactly as stored in platforms['title'].
        Titles are assumed to be unique in `platforms`.
    platforms : pandas.DataFrame
        Table with a 'title' column whose row order matches `cos_sim`.
    cos_sim : 2-D array-like
        cos_sim[i][j] is the similarity between rows i and j of `platforms`.

    Returns
    -------
    pandas.Series
        Up to five entries of platforms['title'], most similar first. The
        queried title itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not present in platforms['title'].
    """
    # Map each title to its row position. Positions (not index labels) are
    # what both cos_sim and .iloc below are addressed by.
    positions = pd.Series(range(len(platforms)), index=platforms['title'])
    idx = positions[title]

    # Drop the queried row explicitly instead of assuming it sorts first:
    # another title with identical tags ties with it at the top score, and
    # the stable sort would then place that title ahead of the queried one,
    # so slicing [1:6] could return the queried title as its own
    # recommendation.
    candidates = [
        (position, score)
        for position, score in enumerate(cos_sim[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)  # highest score first

    rec_indexes = [position for position, _ in candidates[:5]]
    return platforms['title'].iloc[rec_indexes]

## Test of model

In [43]:
get_recommendation('feast') #Test with an arbitrary title

11013           toy story 3
7604                   ogre
10787           ratatouille
10731    mr. duck steps out
7483                go fish
Name: title, dtype: object

### Title base

In [44]:
# Row of the queried title, for comparison with the recommended rows below
base[0][base[0]['title']=='feast']

Unnamed: 0,type,title,rating,listed_in,mean_scored,tags
9671,movie,feast,g,"animation, comedy, drama",3.43,"movie, g, animation, comedy, drama, 3.43"


### Recommendations

In [45]:
base[0].iloc[11013] #1 - full row of the first recommended title

type                                              movie
title                                       toy story 3
rating                                                g
listed_in                      animation, comedy, drama
mean_scored                                        3.43
tags           movie, g, animation, comedy, drama, 3.43
Name: 11013, dtype: object

In [46]:
base[0].iloc[7604] #2 - full row of the second recommended title

type                                movie
title                                ogre
rating                                 7+
listed_in                       animation
mean_scored                          3.43
tags           movie, 7+, animation, 3.43
Name: 7604, dtype: object

In [47]:
base[0].iloc[10787] #3 - full row of the third recommended title

type                                               movie
title                                        ratatouille
rating                                                 g
listed_in                      animation, comedy, family
mean_scored                                         3.43
tags           movie, g, animation, comedy, family, 3.43
Name: 10787, dtype: object

In [48]:
base[0].iloc[10731] #4 - full row of the fourth recommended title

type                                                  movie
title                                    mr. duck steps out
rating                                                 tv-g
listed_in                         animation, comedy, family
mean_scored                                            3.43
tags           movie, tv-g, animation, comedy, family, 3.43
Name: 10731, dtype: object

In [49]:
base[0].iloc[7483] #5 - full row of the fifth recommended title

type                                     movie
title                                  go fish
rating                                     13+
listed_in                        comedy, drama
mean_scored                               3.43
tags           movie, 13+, comedy, drama, 3.43
Name: 7483, dtype: object

## Generating recommendations for all titles

In [28]:
# Precompute the recommendations of every title in the base table (title -> Series of titles)
all_recommendations=dict(
    (movie_title, get_recommendation(movie_title))
    for movie_title in base[0]['title']
)

### Serialize the dictionary to a .pkl file for deployment on the server and to reduce RAM consumption

In [30]:
# Write the title -> recommendations mapping to disk in binary mode.
# The values are the pandas Series returned by get_recommendation, so the
# process that loads this file needs pandas available to unpickle it.
with open('all_recommendations.pkl', 'wb') as f:
    pickle.dump(all_recommendations, f) 

In [31]:
# Redundant: the `with` block that opened `f` already closed it on exit,
# so this call has no further effect.
f.close()