# Get data

In [34]:
import pandas as pd
import numpy as np

# We work with small dataset from MovieLens (100 000 ratings), otherwise
# with the big one (27 000 000) computing cannot be done with panda.
df_rating = pd.read_csv('./data_small/ratings.csv')

# Crate user-item interaction matrix. Users are the main entities we
# want to calculate distances between.
df = df_rating.pivot_table(index='userId',columns='movieId',values='rating')

df.sample(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
495,,,,,,,,,,,...,,,,,,,,,,
71,5.0,,,,,,3.0,,,,...,,,,,,,,,,
272,,,,,,,,,,,...,,,,,,,,,,
557,,,,,,,,,,4.5,...,,,,,,,,,,
112,3.0,1.5,,,,4.5,,,,,...,,,,,,,,,,
485,,,,,,,,,,4.0,...,,,,,,,,,,
40,5.0,,,,,,,,,,...,,,,,,,,,,
327,,,,,,,,,,,...,,,,,,,,,,
117,,3.0,3.0,,3.0,3.0,4.0,,,3.0,...,,,,,,,,,,
325,,,,,,4.0,,,,,...,,,,,,,,,,


# Train model

In [43]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

sparse = csr_matrix(df.fillna(0).values)

# 50 kneighbors
K = 50
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=K)
knn.fit(sparse)

# Random user
rand_int = np.random.choice(df.shape[0])

# User query (vector)
user_query = df.iloc[rand_int, :]
user_query_no_na = user_query.fillna(0)

distances, indices = knn.kneighbors(user_query_no_na.values.reshape(1, -1))

print('Target user has ID : ' + str(rand_int + 1))

print('Its 10 first kneighbors are :')
for i,x in enumerate(distances.flatten()[0:11]):
    indice = int(indices.flatten()[i] + 1)
    if indice != rand_int + 1:
        print(f'Kneighbor {indice} with distance {x}')


Target user has ID : 310
Its 10 first kneighbors are :
Kneighbor 161 with distance 0.8452601762031671
Kneighbor 488 with distance 0.8460323766538755
Kneighbor 263 with distance 0.8467599738873776
Kneighbor 167 with distance 0.8479687389927802
Kneighbor 137 with distance 0.848662713755836
Kneighbor 597 with distance 0.8487630899608007
Kneighbor 572 with distance 0.8488948307157964
Kneighbor 376 with distance 0.8533931606524314
Kneighbor 487 with distance 0.8537982811390119
Kneighbor 590 with distance 0.8541011818313784


# Get matrix with means and distances

In [45]:
def note_matrix(df, user_query, K):
    notes_matrice = df.iloc[indices.reshape(K, )]
    notes_matrice_np = notes_matrice.to_numpy()
    user_mask = np.isnan(user_query)
    user_mask_np = user_mask.to_numpy().reshape((9724,))
    columns_index = notes_matrice.columns.to_numpy()
    columns_index = columns_index[user_mask_np]
    notes_matrice_np = notes_matrice_np[:, user_mask_np]

    return pd.DataFrame(notes_matrice_np, columns=columns_index)

# Let's build our main matrix

note_matrix = note_matrix(df, user_query, K)
note_matrix['mean'] = note_matrix.agg(['mean'], axis=1)

distance_serie = pd.Series(distances.reshape((K,)))
note_matrix['distances'] = distance_serie

note_matrix.sample(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193571,193573,193579,193581,193583,193585,193587,193609,mean,distances
19,3.0,4.0,,,,,,,,,...,,,,,,,,,3.532258,0.870842
47,3.5,4.0,,,,,,,,,...,,,,,,,,,3.536504,0.890775
33,,,,,,,,,,,...,,,,,,,,,2.611111,0.883993
40,4.5,2.0,4.0,,2.0,,,,,3.0,...,,,,,,,,,3.145366,0.887284
31,,,,,,,1.0,,,,...,,,,,,,,,2.860825,0.879982
34,,,,,,,,,,4.5,...,,,,,,,,,4.486842,0.884515
43,3.0,,,,,4.0,,,,3.5,...,,,,,,,,,3.51432,0.888648
17,4.0,3.0,3.0,,,5.0,,,,3.5,...,,,,,,,,,3.404762,0.870812
30,,,,,,4.0,3.0,,,,...,,,,,,,,,3.367816,0.879648
37,,3.0,,,,4.0,,,,3.0,...,,,,,,,,,3.534483,0.887087


# Predict and recommend

In [46]:
def predict_notes(
    item_notes,
    distances,
    mean_users_notes,
    mean_target_notes):

    item_notes = np.array(item_notes)

    # We need to apply a mask, in order to
    # do calculation only on available ratings :
    mask = ~np.isnan(item_notes)
    item_notes = item_notes[mask]
    mean_users_notes = mean_users_notes.to_numpy()[mask]
    distances = distances.to_numpy()[mask]

    # Take the inverse of distance, so we give more
    # weight to closer vectors.
    distances = 1 - distances

    weighted_notes = item_notes - mean_users_notes
    # mean_target_notes is our constant
    return mean_target_notes +\
        np.dot(distances, weighted_notes) / np.sum(np.abs(distances))

In [53]:
# Transpose for allowing itertuples, and remove user column :
note_matrix_transposed = note_matrix.transpose().iloc[:,1:]

# Isolate distance and mean rows :
distances = note_matrix_transposed.iloc[-1,:]
mean = note_matrix_transposed.iloc[-2,:]

# Remove distance/mean rows since we don't need them anymore :
note_matrix_cleaned = note_matrix_transposed.iloc[:-2,:]

user_mean = user_query.mean()

predictions = []
# Here we will iterate over each movie,
# and get a rating prediction, only if
# the iterated movie was noted by at
# least one neighbor :
for row in note_matrix_cleaned.itertuples(index=False):
    predictions.append(predict_notes(row, distances, mean, user_mean))

predictions = np.array(predictions)
predictions[:10]

  np.dot(distances, weighted_notes) / np.sum(np.abs(distances))


array([3.85820748, 3.23172062, 3.33409592,        nan, 2.5587792 ,
       4.03098181, 2.47798828, 3.15125   , 2.39572639, 3.52625628])

In [54]:
df_movies = pd.read_csv('./data_small/movies.csv')

# Add predicted values as new column
note_matrix_cleaned['predicted'] = predictions

# Top ten best predicted notes
top_ten_predicted = note_matrix_cleaned.sort_values('predicted', ascending=False).head(n=10)

# Print movies title
index = top_ten_predicted.index.to_numpy().astype('int64')
df_movies_indexed = df_movies.set_index('movieId')
results = df_movies_indexed.loc[index]

results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  note_matrix_cleaned['predicted'] = predictions


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
132333,Seve (2014),Documentary|Drama
5490,The Big Bus (1976),Action|Comedy
5915,Victory (a.k.a. Escape to Victory) (1981),Action|Drama|War
96004,Dragon Ball Z: The History of Trunks (Doragon ...,Action|Adventure|Animation
170355,Mulholland Dr. (1999),Drama|Mystery|Romance
7748,Pierrot le fou (1965),Crime|Drama
3379,On the Beach (1959),Drama
3677,Baraka (1992),Documentary
7091,Horse Feathers (1932),Comedy
80693,It's Kind of a Funny Story (2010),Comedy|Drama


In [52]:
# Let's take a look at initial notes left by user

user_query_no_na = user_query.dropna()
user_initial_notes = df_movies_indexed.loc[user_query_no_na.index.to_numpy()]
user_initial_notes['note'] = user_query_no_na
user_initial_notes.sort_values(by=['note'], ascending=False).head(20)

Unnamed: 0_level_0,title,genres,note
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1945,On the Waterfront (1954),Crime|Drama,5.0
1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,5.0
1304,Butch Cassidy and the Sundance Kid (1969),Action|Western,5.0
1233,"Boot, Das (Boat, The) (1981)",Action|Drama|War,5.0
6863,School of Rock (2003),Comedy|Musical,5.0
1223,"Grand Day Out with Wallace and Gromit, A (1989)",Adventure|Animation|Children|Comedy|Sci-Fi,5.0
5899,Zulu (1964),Action|Drama|War,5.0
745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,5.0
2944,"Dirty Dozen, The (1967)",Action|Drama|War,5.0
1204,Lawrence of Arabia (1962),Adventure|Drama|War,5.0
