In [None]:
import pandas as pd
import numpy as np

Reference: https://realpython.com/build-recommendation-engine-collaborative-filtering/

### Generating User-Level listening events

Variables:
- N: number of users
- M: number of contents

In [None]:
# Dimensions of the synthetic dataset.
N = 10_000  # number of users
M = 1_000  # number of contents

In [None]:
import random

# Generate random listening events as [userId, contentId, rating] triples.
num_events = 1000000

# Dedicated, seeded generator: a fresh "Restart & Run All" reproduces the same
# events, and the global `random` state is left untouched.
event_rng = random.Random(42)

# randrange(N) draws from 0..N-1, i.e. exactly N users and M contents.
# (randint(0, N) is inclusive on both ends and would yield N + 1 distinct ids.)
# NOTE(review): ratings are drawn from 0..5 inclusive, and the pivot step later
# fills unrated cells with 0.0, so a genuine 0 rating looks like "not rated" —
# confirm whether ratings should start at 1.
listening_events = [
    [event_rng.randrange(N), event_rng.randrange(M), event_rng.randint(0, 5)]
    for _ in range(num_events)
]

In [None]:
# One row per listening event; column order mirrors the [user, content, rating] triples.
event_columns = ['userId', 'contentId', 'rating']
df_listenings = pd.DataFrame(data=listening_events, columns=event_columns)

In [None]:
# Collapse repeated events: one row per (user, content) pair holding the mean rating.
events_by_pair = df_listenings.groupby(['userId', 'contentId'])
df_ratings = events_by_pair.agg({'rating': ['mean']}).reset_index()

# Aggregating with a list of functions produces tuple column labels;
# keep only the first element of each so the columns are plain names again.
renamed_columns = [labels[0] for labels in df_ratings.columns]
df_ratings.columns = renamed_columns

In [None]:
# df_ratings

In [None]:
# Pivot the long (userId, contentId, rating) table into a user x content grid.
df_matrix = pd.pivot_table(df_ratings, index=['userId'], columns=['contentId'], values=['rating'])

# pivot_table only creates rows/columns for ids that actually occur, whereas the
# rest of the notebook indexes `matrix` positionally by id (e.g. matrix[userId]).
# Reindex both axes over the full contiguous id range so that row i is always
# user i and column k is always content k, even if some id has no event.
num_user_rows = int(df_ratings['userId'].max()) + 1
num_content_cols = int(df_ratings['contentId'].max()) + 1
all_user_ids = pd.RangeIndex(num_user_rows, name='userId')
all_content_ids = pd.MultiIndex.from_product(
    [['rating'], range(num_content_cols)],
    names=df_matrix.columns.names,
)
df_matrix = df_matrix.reindex(index=all_user_ids, columns=all_content_ids)

# A missing (user, content) pair means "no rating"; encode it as 0.0.
df_matrix = df_matrix.fillna(0.0)
matrix = df_matrix.to_numpy()

In [None]:
# --- co-occurrence matrix
# matrix = np.zeros((N+1, M+1))
# for user, content, _ in listening_events:
#     matrix[user, content] += 1

### Collaborative Filtering - Memory-based Approach

Steps:
1. Determine which users are similar to user U
2. Compute the rating R that user U would give to item I

Variables:
- C: number of closest users
- S_u: Similarity scores between user U and the C closest users => shape: [1, C]
- R_u: Ratings given by the C closest users => shape: [C, M]
- R_w: Ratings given by the C closest users, weighted by their similarity score => shape: [C, M]
- R_U: Estimated ratings for user U => shape: [1, M]

Caveats: TODO
- What happens when the matrix is sparse, i.e. most users haven't listened to all contents?
- What happens when a user hasn't listened to anything yet?
    - Cold Start Problem: We are unable to estimate what they like because they haven't liked anything yet. We use static popular content
- Normalize based on the user mean => a user may systematically rate everything lower than other users even though they liked it the
  same, because their view of what it means to be good is different

In [None]:
from collections import defaultdict
from numpy.linalg import norm
from numpy import dot

def get_closest_users(matrix: np.ndarray, userId: int, top_n: int = 100) -> np.ndarray:
    """Return the users most similar to ``userId`` by cosine similarity.

    Args:
        matrix: 2-D ratings array with one row per user.
        userId: Row index of the reference user.
        top_n: Maximum number of neighbours to return.

    Returns:
        Float array of shape ``(k, 2)`` with ``k <= top_n``. Each row is
        ``[user row index, similarity]``, ordered by decreasing similarity;
        ties keep ascending row order. The reference user is excluded.
        Rows with a zero norm (no ratings) get a similarity of 0.
    """
    ratings = np.asarray(matrix, dtype=float)
    reference = ratings[userId]

    # Cosine similarity of every row against the reference row in one pass.
    row_norms = norm(ratings, axis=1)
    denominators = row_norms * norm(reference)
    dot_products = dot(ratings, reference)
    # A user without any rating has a zero norm; define the similarity as 0
    # instead of letting 0/0 produce NaN, which would corrupt the ranking.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    # Everyone except the reference user is a candidate neighbour.
    candidates = np.delete(np.arange(ratings.shape[0]), userId)
    # Stable sort on negated scores: descending order, ties in row order.
    ranking = np.argsort(-similarities[candidates], kind="stable")[:top_n]
    closest = candidates[ranking]
    # TODO: keep only positive similarity score
    # column_stack keeps the (k, 2) shape even when there are no neighbours.
    return np.column_stack((closest, similarities[closest]))

In [None]:
# Reference user for the walkthrough: the first row of the ratings matrix.
userId = 0
users_similarity = get_closest_users(matrix, userId)

In [None]:
# Preview the five closest users; each row is [user row index, similarity score].
users_similarity[:5]

In [None]:
# The first column holds neighbour row indexes stored as floats; cast back to int for indexing.
closest_users_indexes = [int(row_index) for row_index in users_similarity[:, 0]]

# Similarity scores as a row vector.
S_u = users_similarity[:, 1].reshape((1, -1))

# Ratings of the neighbours, one row per neighbour.
R_u = matrix[closest_users_indexes]

# Scale each neighbour's ratings by that neighbour's similarity (column vector broadcast).
R_w = R_u * S_u.T

# Similarity-weighted average rating per content.
R_U = R_w.sum(axis=0) / S_u.sum()

In [None]:
# Content indexes ordered from highest to lowest estimated rating.
descending_order = R_U.argsort()[::-1]
# Pair each content index with its estimated rating: (item, estimated rating).
sorted_contents = list(zip(descending_order.tolist(), R_U[descending_order].tolist()))

In [None]:
# Preview the five contents with the highest estimated rating, as (item, estimated rating) pairs.
sorted_contents[:5]

In [None]:
# Drop every content the user already has a non-zero rating for.
already_listened = {
    content_index
    for content_index, user_rating in enumerate(matrix[userId])
    if user_rating
}
suggestions = [
    (item, rating)
    for item, rating in sorted_contents
    if item not in already_listened
]

### Collaborative Filtering - Model-based Approach

Steps:
1. Factorize the co-occurrence matrix to get U and C
2. Compute estimate content rating for user U
3. Rank content suggestions based on rating estimates

Variables:
- T: Number of themes (arbitrary number ie hyperparams)
- R: Co-occurrence matrix => shape: [N, M]
- U: User-Theme matrix => shape: [N, T]
- C: Content-Theme matrix => shape: [T, M]
- R = U x C
- U_U: User U theme likeliness vector => shape: [1, T]
- R_U: Contents Rating for user U => R_U = U_U x C => shape: [1, M] 

### Comparing Collaborative Filtering Approaches - Memory-based vs Model-based