In [165]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [166]:
movies_df = pd.read_csv('./preprocessed_data/movies_preprocessed.csv', index_col=0)
movies_df['overview_plus_keyword_preprocessed'].dropna(inplace=True)

In [167]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'id',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'overview_filtered', 'keywords_list',
       'overview_plus_keyword', 'overview_plus_keyword_preprocessed'],
      dtype='object')

### **1. TF-IDF Vectorization**

In [168]:
import numpy as np

tfidf = TfidfVectorizer(dtype=np.float32)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_df['overview_plus_keyword_preprocessed'].values.astype('U'))

In [169]:
tfidf_matrix

<10231x27704 sparse matrix of type '<class 'numpy.float32'>'
	with 309716 stored elements in Compressed Sparse Row format>

In [170]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [171]:
import pickle

with open("cosine_sim.pkl", "wb") as file:
  pickle.dump(cosine_sim, file)

In [172]:
def get_recommendations(title, n_recommendations, cosine_sim=cosine_sim):
    if title not in movies_df['title'].tolist():
        print("There is no movie of this title in our dataset")
        return 

    # Get the index of the movie that matches the title
    idx = movies_df.loc[movies_df['title'] == title].index.values[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:n_recommendations+1]

    # Return the top 10 most similar movies
    return sim_scores

In [173]:
def get_dash_recommendations(title):
    recomm = get_recommendations(title, 5)
    recomm_lst = []
    for idx, sim in recomm:
      movie_name = movies_df.loc[movies_df.index == idx]['title'].tolist()[0]
      sim = str(round(sim, 3))
      recomm_lst.append(f'{movie_name}: {sim}')
    return recomm_lst

## **2. Building the rating matrix**

In [174]:
movies_df = movies_df[['id', 'title', 'overview_plus_keyword_preprocessed']]
movies_df['idx'] = movies_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['idx'] = movies_df.index


In [175]:
ratings_df = pd.read_csv('./preprocessed_data/ratings_preprocessed.csv', index_col=0)
ratings_df = ratings_df.loc[ratings_df['movieId'].isin(movies_df['id'].tolist())]


In [176]:
def get_predictions(user_id, movie_id):
    if movie_id not in movies_df['id'].tolist():
        print("There is no movie of this title in our dataset")
        return
    if user_id not in ratings_df['userId'].tolist():
        print("There is no user like this in our dataset")
        return 
    
    title = movies_df.loc[movies_df['id'] == movie_id]['title'].item()
    sim_movies = get_recommendations(title, 20000)
    df_user_sim_aux = ratings_df.loc[ratings_df['userId']==user_id]
    df_user_sim_aux['sim'] = 0
    df_user_sim_aux = df_user_sim_aux.merge(movies_df[['title', 'idx', 'id']], left_on='movieId', right_on='id')

    for sim in sim_movies:
        if int(sim[0]) in df_user_sim_aux['idx'].values:
        # Check this product has been rated by the user and, then, we save this rating
            df_user_sim_aux['sim'].loc[df_user_sim_aux['idx'] == int(sim[0])] = sim[1]

    # Now, we can make the prediction
    pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()
    return df_user_sim_aux.loc[df_user_sim_aux['sim']>0], pred_rating

## **3. Evaluating model - data to the report**

### 3.1 Choosing movies and users

In [177]:
movies = ['Batman Forever', 'Star Wars', 'Spider-Man']
for movie in movies:
  print(movies_df.loc[movies_df['title'] == movie]['id'].item())

414
11
557


In [178]:
ratings_df['userId'].value_counts() # Using users that rated the greatest amount of movies

8659      1634
179792    1436
107720    1392
45811     1326
229879    1096
          ... 
239972       1
88879        1
88909        1
88914        1
100340       1
Name: userId, Length: 261562, dtype: int64

### 3.2 Calculate single prediction

In [179]:
df_user_sim_aux, pred_rating = get_predictions(107720, 414)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'].loc[df_user_sim_aux['idx'] == int(sim[0])] = sim[1]


In [180]:
pred_rating

4.849612870317836

In [181]:
df_user_sim_aux.sort_values(by=['sim'], ascending=False).head(10)

Unnamed: 0,userId,movieId,rating,sim,title,idx,id
230,107720,364,6.0,0.327897,Batman Returns,1313,364
172,107720,268,4.0,0.29352,Batman,582,268
248,107720,415,4.0,0.259142,Batman & Robin,1475,415
174,107720,272,8.0,0.223875,Batman Begins,10093,272
1015,107720,2661,4.0,0.106883,Batman,8579,2661
470,107720,820,4.0,0.08943,JFK,3247,820
807,107720,1924,2.0,0.08871,Superman,2506,1924
797,107720,1902,4.0,0.081993,Open Your Eyes,2460,1902
898,107720,2118,6.0,0.073846,L.A. Confidential,1528,2118
798,107720,1903,4.0,0.071586,Vanilla Sky,4820,1903


### 3.3 Calculate recommendations to all of the movies

In [182]:
for movie in movies:
  recomm = get_dash_recommendations(movie)
  for r in recomm:
    print(r)
  print('\n')

Batman Returns: 0.328
Batman: 0.294
Batman: Mask of the Phantasm: 0.262
Batman & Robin: 0.259
Batman Beyond: Return of the Joker: 0.245


The Empire Strikes Back: 0.431
Star Wars: Episode III - Revenge of the Sith: 0.302
Return of the Jedi: 0.288
Star Wars: Episode II - Attack of the Clones: 0.193
Hot Shots! Part Deux: 0.192


Arachnophobia: 0.342
Spider-Man 2: 0.302
X-Men: 0.183
The Giant Spider Invasion: 0.164
Hangman's Curse: 0.15




### 3.4 Data to general comparison

In [183]:
users_id = [480, 260, 15]
movies_id = [414, 11, 296] 

for user_id, movie_id in zip(users_id, movies_id):
    print(movies_df.loc[movies_df['id'] == movie_id]['title'].item())
    df_user_sim_aux, pred_rating = get_predictions(user_id, movie_id)
    print(pred_rating)

Batman Forever


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'].loc[df_user_sim_aux['idx'] == int(sim[0])] = sim[1]


6.31430137853958
Star Wars


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'].loc[df_user_sim_aux['idx'] == int(sim[0])] = sim[1]


9.436587072076145
Terminator 3: Rise of the Machines
7.304236582377969


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'].loc[df_user_sim_aux['idx'] == int(sim[0])] = sim[1]
