In [13]:
import pandas as pd
import numpy as np

df_movies = pd.read_csv('./data_2/lens-movies.csv')
df_rating = pd.read_csv('./data_2/lens-ratings.csv')

df = pd.merge(df_rating, df_movies, on='movieId')

df_grpby = df \
     .groupby(by = ['movieId'])['rating'] \
     .count() \
     .reset_index() \
     .rename(columns = {'rating': 'totalRatingCount'})

df = df.merge(df_grpby, left_on = 'movieId', right_on = 'movieId', how = 'left')

popularity_threshold = 50
df_popularity= df.query('totalRatingCount >= @popularity_threshold')

# On crée un pivot, mais dans l'autre sens cette fois-ci
df = df_popularity.pivot_table(index='userId',columns='movieId',values='rating')
df

movieId,1,2,3,6,7,10,11,16,17,19,...,91500,91529,96079,99114,106782,109374,109487,112852,116797,122904
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,4.0,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,3.5,,3.5,5.0,,3.0,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,2.5,,2.5,,4.0,2.0,...,4.5,,,,,,,,,
607,4.0,,,,,,3.0,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,4.0,,4.5,,2.0,...,,,,,,,,,,
609,3.0,,,,,4.0,,,,,...,,,,,,,,,,


In [14]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

sparse = csr_matrix(df.fillna(0).values)

K = 50
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=K)
knn.fit(sparse)

# Random user
rand_int = np.random.choice(df.shape[0])
# User query
user_query = df.iloc[rand_int, :]
user_query_no_na = user_query.fillna(0)

distances, indices = knn.kneighbors(user_query_no_na.values.reshape(1, -1))

print('Target user has ID : ' + str(rand_int + 1))

for i,x in enumerate(distances.flatten()[0:10]):
    print(f'Kneighbor {indices.flatten()[i] + 1} with distance {x}')



Target user has ID : 362
Kneighbor 362 with distance 0.0
Kneighbor 348 with distance 0.24075940506863347
Kneighbor 526 with distance 0.24124501896473283
Kneighbor 387 with distance 0.3243535399650076
Kneighbor 119 with distance 0.32592240284514273
Kneighbor 489 with distance 0.33394544110781166
Kneighbor 275 with distance 0.3551589717736966
Kneighbor 453 with distance 0.35943395942350675
Kneighbor 268 with distance 0.3783402747249822
Kneighbor 204 with distance 0.37937860204826135


In [15]:
def note_matrix(df, user_query, K):
    notes_matrice = df.iloc[indices.reshape(K, )]
    notes_matrice_np = notes_matrice.to_numpy()
    user_mask = np.isnan(user_query)
    user_mask_np = user_mask.to_numpy().reshape((450,))
    columns_index = notes_matrice.columns.to_numpy()
    columns_index = columns_index[user_mask_np]
    notes_matrice_np = notes_matrice_np[:, user_mask_np]

    return pd.DataFrame(notes_matrice_np, columns=columns_index)

# Let's build our main matrix

note_matrix = note_matrix(df, user_query, K)
note_matrix['mean'] = note_matrix.agg(['mean'], axis=1)

distance_serie = pd.Series(distances.reshape((K,)))
note_matrix['distances'] = distance_serie

note_matrix

Unnamed: 0,2,3,6,7,10,11,16,17,19,21,...,96079,99114,106782,109374,109487,112852,116797,122904,mean,distances
0,,,,,,,,,,,...,,,,,,,,,,0.0
1,,,,3.0,,,,2.0,,,...,,,,,,,,,3.0,0.240759
2,,,,2.0,,,,,,,...,,,,,,,,,2.666667,0.241245
3,,,5.0,,,,,4.0,,,...,,,,,,,,,4.125,0.324354
4,,,,,,,,,,,...,,,,,,,,,4.0,0.325922
5,,4.0,3.0,3.0,,,,,,,...,,,,,,,,,3.866667,0.333945
6,,,,,,,,,,,...,,,,,,,,,3.5,0.355159
7,,3.0,,,,,,,,,...,,,,,,,,,3.857143,0.359434
8,,3.0,3.0,3.0,,,,4.0,,,...,,,,,,,,,3.076923,0.37834
9,,,,4.0,,,,5.0,,,...,,,,,,,,,4.666667,0.379379


In [16]:
def predict_notes(
    item_notes,
    distances,
    mean_users_notes,
    mean_target_notes):

    item_notes = np.array(item_notes)

    # We need to apply a mask, in order to
    # do calculation only on available ratings :
    mask = ~np.isnan(item_notes)
    item_notes = item_notes[mask]
    mean_users_notes = mean_users_notes.to_numpy()[mask]
    distances = distances.to_numpy()[mask]

    # Take the inverse of distance, so we give more
    # weight to closer vectors.
    distances = 1 - distances

    weighted_notes = item_notes - mean_users_notes
    # mean_target_notes is our constant
    return mean_target_notes +\
        np.dot(distances, weighted_notes) / np.sum(np.abs(distances))

In [17]:
# Transpose for allowing itertuples, and remove user column :
note_matrix_transposed = note_matrix.transpose().iloc[:,1:]

# Isolate distance and mean rows :
distances = note_matrix_transposed.iloc[-1,:]
mean = note_matrix_transposed.iloc[-2,:]

# Remove distance/mean rows since we don't need them anymore :
note_matrix_cleaned = note_matrix_transposed.iloc[:-2,:]

user_mean = user_query.mean()

predictions = []
# Here we will iterate over each movie,
# and get a rating prediction, only if
# the iterated movie was noted by at
# least one neighbor :
for row in note_matrix_cleaned.itertuples(index=False):
    predictions.append(predict_notes(row, distances, mean, user_mean))

predictions = np.array(predictions)
predictions

  np.dot(distances, weighted_notes) / np.sum(np.abs(distances))


array([3.78191063, 3.73568463, 4.34730216, 3.7325709 , 3.99857397,
       4.45240736, 4.08235676, 3.97281937, 3.28390823, 3.98540789,
       4.28271734, 4.79462905, 4.14880813, 3.64395202, 4.44079817,
       3.16040587, 4.42992311, 4.22198365, 2.43348007, 4.13655736,
       4.44117613, 4.30801442, 3.88872727, 4.55661656, 3.33232638,
       3.08224987, 3.39491301, 4.29278347, 3.96157433, 4.22941552,
       3.6644766 , 3.51664063, 3.30932041, 3.60322292, 3.16459782,
       4.20968422, 4.11589404, 3.36949378, 3.78188525, 4.21155961,
       4.33878081, 2.89491551, 4.24831605, 3.89891767, 4.30501872,
       4.06184851, 3.85347271, 4.09557434, 4.89486263, 3.77795863,
       3.98800113, 4.12698164, 3.73988246, 3.46113239, 4.30746314,
       4.11617754, 3.52463405, 4.40298023, 3.8166758 , 4.23303245,
       4.06040918, 3.82248567, 4.07599191, 4.25131368, 3.84827218,
       3.78759063, 3.7537247 , 3.73593422, 2.94309604, 3.31459108,
       4.30532576, 3.44751773, 4.01578814, 4.54801798, 3.59590

In [18]:
# Add predicted values as new column
note_matrix_cleaned['predicted'] = predictions

# Top ten best predicted notes
top_ten_predicted = note_matrix_cleaned.sort_values('predicted', ascending=False).head(n=10)

# Print movies title
index = top_ten_predicted.index.to_numpy().astype('int64')
df_movies_indexed = df_movies.set_index('movieId')
results = df_movies_indexed.loc[index]

results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  note_matrix_cleaned['predicted'] = predictions


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2699,Arachnophobia (1990),Comedy|Horror
1639,Chasing Amy (1997),Comedy|Drama|Romance
109487,Interstellar (2014),Sci-Fi|IMAX
58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
1288,This Is Spinal Tap (1984),Comedy
35836,"40-Year-Old Virgin, The (2005)",Comedy|Romance
3996,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance
2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War
908,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller


In [19]:
# Let's take a look at initial notes left by user

user_query_no_na = user_query.dropna()
user_initial_notes = df_movies_indexed.loc[user_query_no_na.index.to_numpy()]
user_initial_notes['note'] = user_query_no_na
user_initial_notes

Unnamed: 0_level_0,title,genres,note
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,3.0
95,Broken Arrow (1996),Action|Adventure|Thriller,5.0
141,"Birdcage, The (1996)",Comedy,4.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.0
648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,3.0
733,"Rock, The (1996)",Action|Adventure|Thriller,4.0
736,Twister (1996),Action|Adventure|Romance|Thriller,4.0
780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,5.0
786,Eraser (1996),Action|Drama|Thriller,4.0
