In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

movies_df = pd.read_csv('./preprocessed_data/movies_preprocessed.csv', index_col=0)

In [2]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'overview_filtered', 'keywords',
       'keywords_list', 'preprocessed_overview'],
      dtype='object')

### **1. Using TF-IDF Vectorization**

In [3]:
import numpy as np

tfidf = TfidfVectorizer(dtype=np.float32)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_df['preprocessed_overview'].values.astype('U'))

In [4]:
tfidf_matrix

<45296x64086 sparse matrix of type '<class 'numpy.float32'>'
	with 1164020 stored elements in Compressed Sparse Row format>

In [5]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [6]:
def get_recommendations(title, n_recommendations, cosine_sim=cosine_sim):
    if title not in movies_df['title'].tolist():
        print("There is no movie of this title in our dataset")
        return 

    # Get the index of the movie that matches the title
    idx = movies_df.loc[movies_df['title'] == title].index.values[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n_recommendations+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return sim_scores

In [7]:
title = movies_df.sample()['title'].item()
title

'The Legend of Bloody Mary'

In [8]:
movies_df.loc[movies_df['title'] == title]['preprocessed_overview'].item()

'Ryan plague nightmare night sister Amy go miss year play game Bloody Mary Amy stumble website internet witch call Bloody Mary game summon evil spirit senior college Ryan reach mental breaking point year stress guilt sister disappearance girlfriend Rachel frustrate Ryan emotional distance self pity call help professor Ryan Father Father priest archaeologist decide help Ryan end tormenting grief detective skill wit figure happen Ryan sister uncover truth Legend Bloody Mary'

In [9]:
get_recommendations(title, 5)

[(20075, 0.39806882),
 (3051, 0.39451265),
 (33105, 0.3614585),
 (20973, 0.35765594),
 (27746, 0.34115762)]

Building the rating matrix

In [11]:
movies_df = movies_df[['id', 'title', 'preprocessed_overview']]
movies_df

Unnamed: 0,id,title,preprocessed_overview
0,862,Toy Story,lead Woody Andy toy live room Andy birthday br...
1,8844,Jumanji,sibling Judy Peter discover enchanted board ga...
2,15602,Grumpier Old Men,family wedding reignite ancient feud door neig...
3,31357,Waiting to Exhale,cheat mistreat step woman hold breath wait elu...
4,11862,Father of the Bride Part II,George Banks recover daughter wedding receive ...
...,...,...,...
45291,439050,Subdue,rise fall man woman
45292,111109,Century of Birthing,artist struggle finish work storyline cult pla...
45293,67758,Betrayal,hit go wrong professional assassin end suitcas...
45294,227506,Satan Triumphant,small town live brother minister hunchback pai...


In [49]:
ratings_df = pd.read_csv('./preprocessed_data/ratings_preprocessed.csv', index_col=0)
ratings_df = ratings_df.loc[ratings_df['movieId'].isin(movies_df['id'].tolist())]
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,110,2.0
1,1,147,9.0
2,1,858,10.0
4,1,1246,10.0
5,1,1968,8.0
...,...,...,...
26024269,270896,48780,10.0
26024271,270896,49530,8.0
26024278,270896,54001,8.0
26024280,270896,54503,8.0


In [69]:
movies_df.loc[movies_df['title'] == 'Toy Story']

Unnamed: 0,id,title,preprocessed_overview
0,862,Toy Story,lead Woody Andy toy live room Andy birthday br...


In [59]:
def get_predictions(user_id, movie_id):
    if movie_id not in movies_df['id'].tolist():
        print("There is no movie of this title in our dataset")
        return
    if user_id not in ratings_df['userId'].tolist():
        print("Therenot user like this in our dataset")
        return 
    
    title = movies_df.loc[movies_df['id'] == movie_id]['title'].item()
    print(title)
    sim_movies = get_recommendations(title, 20000)
    print(sim_movies)
    df_user_sim_aux = ratings_df.loc[ratings_df['userId']==user_id]
    df_user_sim_aux['sim'] = 0

    print(len(df_user_sim_aux))
    for sim in sim_movies:
        if int(sim[0]) in df_user_sim_aux['movieId'].values:
        # Check this product has been rated by the user and, then, we save this rating
            df_user_sim_aux['sim'].loc[df_user_sim_aux['movieId']==int(sim[0])] = sim[1]

    
    print('List of similar products:')
    print(df_user_sim_aux.loc[df_user_sim_aux['sim']>0])
    # Now, we can make the prediction
    pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()
    print('The predicted scoring is: %2.2f' %pred_rating)
    return df_user_sim_aux.loc[df_user_sim_aux['sim']>0]

In [1]:
df_user_sim_aux = get_predictions(78, 862)

NameError: name 'get_predictions' is not defined

In [71]:
df_user_sim_aux

Unnamed: 0,userId,movieId,rating,sim
7575,75,260,7.0,0.013411
7576,75,296,9.0,0.012404
7577,75,344,6.0,0.010304
7580,75,457,8.0,0.005446
7584,75,648,6.0,0.004418
7595,75,1610,8.0,0.027964
7597,75,2023,9.0,0.010564
7599,75,2105,10.0,0.010187
7605,75,2502,7.0,0.021359
7617,75,4995,9.0,0.016052


In [72]:
df_user_sim_aux.merge(movies_df[['title', 'id']], left_on='movieId', right_on='id')

Unnamed: 0,userId,movieId,rating,sim,title,id
0,75,260,7.0,0.013411,The 39 Steps,260
1,75,296,9.0,0.012404,Terminator 3: Rise of the Machines,296
2,75,344,6.0,0.010304,"Bang, Boom, Bang",344
3,75,457,8.0,0.005446,Sissi,457
4,75,648,6.0,0.004418,Beauty and the Beast,648
5,75,1610,8.0,0.027964,Fever Pitch,1610
6,75,2023,9.0,0.010564,Hidalgo,2023
7,75,2105,10.0,0.010187,American Pie,2105
8,75,2502,7.0,0.021359,The Bourne Supremacy,2502
9,75,4995,9.0,0.016052,Boogie Nights,4995


: 