In [6]:
import pandas as pd
import numpy as np

Reference: https://realpython.com/build-recommendation-engine-collaborative-filtering/

### Generating User-Level listening events

Variables:
- N: number of users
- M: number of content items

In [2]:
# Sizes of the synthetic dataset.
N = 10000  # number of users
M = 1000   # number of contents

In [18]:
import random

# Generate random events as [userId, contentId, rating] triples.
# randrange(N) / randrange(M) draw ids from 0..N-1 and 0..M-1, so there are exactly
# N users and M contents. random.randint includes its upper bound and would have
# produced N+1 and M+1 distinct ids. Ratings are integers from 0 to 5 inclusive.
random.seed(42)  # fixed seed so a fresh "Restart & Run All" reproduces the same events
num_events = 1000000
listening_events = [
    [random.randrange(N), random.randrange(M), random.randint(0, 5)]
    for _ in range(num_events)
]

In [27]:
# Tabulate the raw events: one row per [userId, contentId, rating] triple.
df_listenings = pd.DataFrame(
    data=listening_events,
    columns=['userId', 'contentId', 'rating'],
)

In [44]:
# Average repeated events so that each (userId, contentId) pair appears exactly once.
# as_index=False keeps the group keys as ordinary columns, so the result already has
# flat 'userId', 'contentId', 'rating' columns and no MultiIndex has to be flattened.
df_ratings = df_listenings.groupby(['userId', 'contentId'], as_index=False)['rating'].mean()

In [54]:
# df_ratings

In [53]:
# User x content rating table: one row per userId, one column per contentId.
# (user, content) pairs without any rating are filled with 0.0.
df_matrix = (
    df_ratings
    .pivot_table(index=['userId'], columns=['contentId'], values=['rating'])
    .fillna(0.0)
)
# Plain ndarray view of the table; rows and columns are positional from here on.
matrix = df_matrix.to_numpy()

In [9]:
# --- co-occurrence matrix
# matrix = np.zeros((N+1, N+1))
# for user, content in listening_events:
#     matrix[user, content] += 1

### Collaborative Filtering - Memory-based Approach

Steps:
1. Determine which users are similar to user U
2. Compute the rating R that user U would give to item I

Variables:
- C: number of closest users
- S_u: Similarity scores between user U and the C closest users => shape: [1, C]
- R_u: Ratings given by the closest users => shape: [C, M]
- R_w: Ratings given by the closest users, weighted by their similarity score => shape: [C, M]
- R_U: Estimated ratings for user U (similarity-weighted average of R_u) => shape: [M]

In [96]:
from collections import defaultdict
from numpy.linalg import norm
from numpy import dot

def get_closest_users(matrix: np.ndarray, userId: int, top_n: int = 100) -> np.ndarray:
    """Rank all other users by cosine similarity to the user at row ``userId``.

    Parameters
    ----------
    matrix : np.ndarray
        2-D array with one row per user; each row is treated as that user's
        rating vector.
    userId : int
        Row index of the reference user. That row is excluded from the result.
    top_n : int, default 100
        Number of leading (most similar) rows to keep.

    Returns
    -------
    np.ndarray
        Float array of shape ``(k, 2)`` sorted by descending similarity.
        Column 0 is the neighbour's row index, column 1 its cosine similarity.
        Ties keep ascending row order. The shape is ``(0, 2)`` when there is no
        other user to compare with.

    Raises
    ------
    IndexError
        If ``userId`` is not a valid row index of ``matrix``.

    Notes
    -----
    A row whose norm is zero (a user with only zero ratings) has an undefined
    cosine similarity; it is scored 0.0 instead of NaN so the ranking stays
    well-defined.
    """
    ratings = np.asarray(matrix, dtype=float)
    user = ratings[userId]  # raises IndexError for an out-of-range userId

    # Cosine similarity of every row against the reference row, in one pass.
    row_norms = np.linalg.norm(ratings, axis=1)
    denominators = row_norms * np.linalg.norm(user)
    dot_products = ratings @ user
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,  # leave 0.0 where a norm is zero instead of 0/0 -> NaN
    )

    # Every row index except the reference user (np.delete also handles negative userId).
    candidates = np.delete(np.arange(ratings.shape[0]), userId)
    scores = similarities[candidates]

    # Stable sort on the negated scores: descending similarity, ties by ascending index.
    # TODO: keep only positive similarity score
    order = np.argsort(-scores, kind='stable')[:top_n]

    return np.column_stack((candidates[order].astype(float), scores[order]))

In [99]:
# Neighbours of the user at row 0, most similar first (default neighbourhood size).
users_similarity = get_closest_users(matrix=matrix, userId=0)

In [100]:
# Preview the five most similar users; each row is (user row index, cosine similarity).
users_similarity[:5]

array([[8.91300000e+03, 2.15225317e-01],
       [5.84900000e+03, 1.91296331e-01],
       [6.13000000e+02, 1.89854209e-01],
       [4.31000000e+03, 1.85684416e-01],
       [8.51000000e+02, 1.85356062e-01]])

In [164]:
# Column 0 stores the neighbours' row indices as floats; cast them back to plain ints.
closest_users_indexes = users_similarity[:, 0].astype(int).tolist()
# Similarity scores as a row vector of shape (1, C).
S_u = users_similarity[:, 1][np.newaxis, :]
# Rating rows of the C neighbours.
R_u = matrix[closest_users_indexes]
# Scale each neighbour's ratings by that neighbour's similarity score.
R_w = R_u * S_u.T
# Similarity-weighted average over neighbours: one estimate per content column.
R_U = np.sum(R_w, axis=0) / np.sum(S_u)

In [176]:
# Content column positions ordered from highest to lowest estimated rating.
descending = R_U.argsort()[::-1]
# (item, estimated rating) pairs as plain Python numbers.
sorted_contents = list(zip(descending.tolist(), R_U[descending].tolist()))

### Collaborative Filtering - Model-based Approach