In [32]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


movies_df = pd.read_csv('./preprocessed_data/movies_preprocessed.csv', index_col=0)

## **0. Small preprocess**

In [33]:
movies_df['overview_plus_keyword_preprocessed'] = movies_df['overview_plus_keyword_preprocessed'].astype(str)
movies_df['overview_plus_keyword_preprocessed'].dropna(inplace=True)

In [34]:
def tokenize(txt):
  return txt.split(sep=" ")

In [35]:
movies_df = movies_df[['id', 'title', 'overview_plus_keyword_preprocessed']]
movies_df['tokens'] = movies_df['overview_plus_keyword_preprocessed'].apply(lambda x: tokenize(x))


## **1. Doc2Vec**

In [36]:
keyword_id_list = []
for i, row in movies_df.iterrows():
    keyword_id_list.append((row["id"], row["tokens"]))

In [37]:
documents = [TaggedDocument(doc, [i]) for i, doc in keyword_id_list]

In [8]:
model_overviews = Doc2Vec(documents, vector_size=500, window=75, min_count=20)



In [38]:
recomm = model_overviews.dv.most_similar(414, 10)
recomm

[(32078, 0.5796336531639099),
 (4477, 0.5758765935897827),
 (32652, 0.562588632106781),
 (12145, 0.5391075611114502),
 (1493, 0.5371254682540894),
 (5137, 0.5343013405799866),
 (44233, 0.5283899903297424),
 (4547, 0.5267226696014404),
 (1572, 0.5218833088874817),
 (12535, 0.518213152885437)]

## **2. Building the rating matrix**

In [39]:
ratings_df = pd.read_csv('./preprocessed_data/ratings_preprocessed.csv', index_col=0)
ratings_df = ratings_df.loc[ratings_df['movieId'].isin(movies_df['id'].tolist())]


In [40]:
def get_predictions(user_id, movie_id):
    if movie_id not in movies_df['id'].tolist():
        print("There is no movie of this title in our dataset")
        return
    if user_id not in ratings_df['userId'].tolist():
        print("There is no user like this in our dataset")
        return 
    
    sim_movies = model_overviews.dv.most_similar(movie_id, 10000)
    print(sim_movies)
    df_user_sim_aux = ratings_df.loc[ratings_df['userId']==user_id]
    df_user_sim_aux['sim'] = 0
    df_user_sim_aux = df_user_sim_aux.merge(movies_df[['title', 'id']], left_on='movieId', right_on='id')

    print(len(df_user_sim_aux))
    
    for sim in sim_movies:
        if int(sim[0]) in df_user_sim_aux['id'].values:
        # Check this product has been rated by the user and, then, we save this rating
            df_user_sim_aux['sim'].loc[df_user_sim_aux['id'] == int(sim[0])] = sim[1]

    
    print('List of similar products:')
    print(df_user_sim_aux.loc[df_user_sim_aux['sim']>0])
    # Now, we can make the prediction
    pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()
    print('The predicted scoring is: %2.2f' %pred_rating)
    return df_user_sim_aux.loc[df_user_sim_aux['sim']>0], pred_rating

## **3. Evaluating model - data to the report**

### 3.1 Choosing movies and users

In [41]:
movies = ['Batman Forever', 'Star Wars', 'Spider-Man']
for movie in movies:
  print(movies_df.loc[movies_df['title'] == movie]['id'].item())

414
11
557


In [42]:
ratings_df['userId'].value_counts()

8659      1634
179792    1436
107720    1392
45811     1326
229879    1096
          ... 
239972       1
88879        1
88909        1
88914        1
100340       1
Name: userId, Length: 261562, dtype: int64

### 3.2 Give general recomendation (similarities)

In [43]:
recomm = model_overviews.dv.most_similar(557)
for id, sim in recomm:
  movie_name = movies_df.loc[movies_df['id'] == id]['title'].tolist()[0]
  sim = str(round(sim, 3))
  print(f'{movie_name}: {sim}')

Spider-Man 2: 0.827
House of the Dead: 0.816
If Looks Could Kill: 0.815
Young Sherlock Holmes: 0.796
Agent Cody Banks: 0.79
Charly: 0.782
Hangman's Curse: 0.766
Teacher's Pet: 0.761
Firestarter: 0.76
Class of Nuke 'Em High 2: Subhumanoid Meltdown: 0.742


## 3.3 Calculate single prediction

In [44]:
df_user_sim_aux, pred_rating = get_predictions(8659, 557)

[(22434, 0.5913615822792053), (11059, 0.5887901186943054), (10923, 0.5763324499130249), (11904, 0.5698927640914917), (29146, 0.5689902305603027), (558, 0.5681638121604919), (10726, 0.5577535629272461), (42570, 0.5540615916252136), (24034, 0.5533162951469421), (11495, 0.5520579218864441)]
1635
List of similar products:
     userId  movieId  rating       sim         title   id
300    8659      558     6.0  0.568164  Spider-Man 2  558
The predicted scoring is: 6.00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'].loc[df_user_sim_aux['id'] == int(sim[0])] = sim[1]


In [45]:
df_user_sim_aux

Unnamed: 0,userId,movieId,rating,sim,title,id
300,8659,558,6.0,0.568164,Spider-Man 2,558


In [46]:
pred_rating

6.0

### 3.4 Data to general comparison

In [47]:
users_id = [480, 260, 15]
movies_id = [414, 11, 296] 

for user_id, movie_id in zip(users_id, movies_id):
    print(movies_df.loc[movies_df['id'] == movie_id]['title'].item())
    df_user_sim_aux, pred_rating = get_predictions(user_id, movie_id)
    print(pred_rating)

Batman Forever
[(4477, 0.5524433851242065), (32078, 0.5517944693565369), (32652, 0.5365716814994812), (5137, 0.5280776619911194), (12535, 0.523230254650116), (24558, 0.5217174291610718), (1642, 0.5178608298301697), (44233, 0.512367308139801), (1572, 0.5100498795509338), (1493, 0.5074796676635742)]
32
List of similar products:
Empty DataFrame
Columns: [userId, movieId, rating, sim, title, id]
Index: []
The predicted scoring is: nan
nan
Star Wars


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
  pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()


[(118910, 0.6668151617050171), (1891, 0.6665468215942383), (15121, 0.6660174131393433), (15379, 0.6629526615142822), (36669, 0.6617326140403748), (11949, 0.6612383723258972), (31010, 0.6575192809104919), (14653, 0.6573393940925598), (49689, 0.6566340923309326), (20457, 0.6565850377082825)]
5
List of similar products:
Empty DataFrame
Columns: [userId, movieId, rating, sim, title, id]
Index: []
The predicted scoring is: nan
nan
Terminator 3: Rise of the Machines


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
  pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()


[(6171, 0.6184961795806885), (2191, 0.5976189970970154), (24099, 0.5786418318748474), (10329, 0.5723732709884644), (77593, 0.5681955814361572), (11551, 0.5636097192764282), (10787, 0.5615474581718445), (9276, 0.5614514350891113), (10690, 0.55548495054245), (1362, 0.5519055128097534)]
46
List of similar products:
Empty DataFrame
Columns: [userId, movieId, rating, sim, title, id]
Index: []
The predicted scoring is: nan
nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sim_aux['sim'] = 0
  pred_rating = (df_user_sim_aux['rating']*df_user_sim_aux['sim']).sum()/df_user_sim_aux['sim'].sum()
