In [1]:
import json
import os

import numpy as np
import pandas as pd
import tqdm

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}$ - вес соседа $v$ (сходство пользователей $u$ и $v$),
$h$ - функция нормализации



**Нормализация**: В качестве функции нормализации вычитаем среднее время прослушивания пользователя: $h(r_{ui}) = r_{ui} - \bar r_u$

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

In [2]:
# Locations of the Botify artefacts. Each default is the path this notebook was
# originally run with; set the matching environment variable to run the notebook
# on another machine without editing the code.
# BOTIFY_DATA_DIR is concatenated with bare file names further down, so it must
# stay a string that ends with a path separator: os.path.join(path, "") appends
# one only when it is missing.
BOTIFY_DATA_DIR = os.path.join(
    os.environ.get(
        "BOTIFY_DATA_DIR",
        "/Users/romanzilotov/Desktop/MADE/2 семестр/rec sys/repo/recsys-itmo-spring-2023-master/botify/data/",
    ),
    "",
)
INTERACTIONS_PATH = os.environ.get("BOTIFY_INTERACTIONS_PATH", "/Users/romanzilotov/Desktop/data.json")

# The log is read as JSON lines (one record per line); only the three columns
# selected here are kept.
data = pd.read_json(INTERACTIONS_PATH, lines=True)[["user", "time", "track"]].copy()

data.head()

Unnamed: 0,user,time,track
0,404,1.0,1084
1,404,1.0,1084
2,404,1.0,1084
3,404,1.0,1084
4,404,0.0,487


In [3]:
# Centre every listening time on the mean listening time of the same user,
# so that positive values mean "listened longer than this user's average".
per_user_time = data.groupby("user")["time"]
data["normalized_time"] = per_user_time.transform(lambda listened: listened.sub(listened.mean()))

data.head()

Unnamed: 0,user,time,track,normalized_time
0,404,1.0,1084,0.83
1,404,1.0,1084,0.83
2,404,1.0,1084,0.83
3,404,1.0,1084,0.83
4,404,0.0,487,-0.17


In [4]:
# User x track matrix of mean-centred listening times. Repeated (user, track)
# pairs are combined by pivot_table's default aggregation, and pairs that never
# occurred are filled with 0.
interactions = pd.pivot_table(data, values="normalized_time", index="user", columns="track").fillna(0)

# Share of non-zero cells, i.e. the density of the matrix
# (sparsity would be the complementary quantity, 1 - density).
density = (interactions != 0).values.sum() / interactions.size
print(f"Interactions matrix: shape={interactions.shape}, density={density}")

Interactions matrix: shape=(9443, 49397), sparsity=0.0005827303650766998


In [5]:
# Pairwise cosine similarity between the user rows of `interactions`.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal so that a user is never counted as their own neighbour.
np.fill_diagonal(similarity_matrix, 0)

# Per user: how many other users have a strictly positive similarity.
positive_neighbours = (similarity_matrix > 0).sum(axis=1)
print(f"Mean positive neighbours per user: {positive_neighbours.mean()}")

Mean positive neighbours per user: 116.22789367785661


In [6]:
# Per user: how many other users have a strictly negative similarity.
negative_neighbours = (similarity_matrix < 0).sum(axis=1)
print(f"Mean negative neighbours per user: {negative_neighbours.mean()}")

Mean negative neighbours per user: 62.6976596420629


In [7]:
# Score of track i for user u is the similarity-weighted sum of the mean-centred
# listening times of every user: sum_v similarity(u, v) * interactions(v, i).
# Only this weighted sum is computed here; it is not divided by the sum of the
# weights and the user's mean is not added back.
# Resulting shape: (users in `interactions`) x (tracks in `interactions`).
scores_matrix = similarity_matrix @ interactions.to_numpy()

scores = pd.DataFrame(
    data=scores_matrix,
    index=interactions.index,
    columns=interactions.columns,
)

scores.loc[:, [1, 2, 3, 4, 5]].head()

track,1,2,3,4,5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## Глянем на рекомендации

In [8]:
# Track catalogue, read as JSON lines and indexed by the "track" column.
tracks_path = BOTIFY_DATA_DIR + "tracks.json"
products = pd.read_json(tracks_path, lines=True)
products = products.set_index("track")
products.head()

Unnamed: 0_level_0,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jack Johnson,The Cove
1,Billy Preston,Nothing from Nothing
2,Paco De Lucia,Entre Dos Aguas
3,Josh Rouse,Under Cold Blue Stars
4,The Dead 60s,Riot Radio (Soundtrack Version)


In [9]:
# Length of the top-k list inspected in the following cells.
k = 10
# One user drawn at random from the rows of `scores`. No seed is set in this
# cell, so a re-run may pick a different user.
user = np.random.choice(scores.index)

In [10]:
# All logged rows of the sampled user.
data.loc[data["user"] == user]

Unnamed: 0,user,time,track,normalized_time
96410,3406,1.0,12063,0.831667
96415,3406,0.0,46758,-0.168333
96419,3406,0.0,29014,-0.168333
96423,3406,0.0,8950,-0.168333
96427,3406,0.01,45889,-0.158333
96431,3406,0.0,38265,-0.168333


In [11]:
# The k best-scored tracks of the sampled user, enriched with the catalogue
# columns. The inner join drops any track id that is missing from `products`.
top_scores = scores.loc[user].sort_values(ascending=False).head(k).to_frame("score")
user_scores = top_scores.merge(products, how="inner", left_index=True, right_index=True)

user_scores

Unnamed: 0_level_0,score,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12063,0.316189,All-4-One,I Can Love You Like That (LP Version)
7272,0.197218,Boyz II Men,End Of The Road
33281,0.197218,The Chipmunks,You Spin Me Round [Like A Record]
25225,0.197218,John Legend,Show Me
3084,0.194201,The Pussycat Dolls,Bite The Dust
1472,0.176724,Beyoncé,Halo
1490,0.176724,Lady GaGa / Colby O'Donis,Just Dance
442,0.176724,Taylor Swift,Love Story
435,0.176724,Travie McCoy,Billionaire [feat. Bruno Mars] (Explicit Albu...
44797,0.17308,Hell Razah,Project Jazz


In [12]:
# The sampled user's row of the interaction matrix joined with the catalogue.
# Note: the column is called "time" but holds the mean-centred values stored in
# `interactions`, not the raw listening times.
centred_times = interactions.loc[user].sort_values(ascending=False).to_frame("time")
user_interactions = centred_times.merge(products, how="inner", left_index=True, right_index=True)

# Show only the tracks whose stored value is non-zero.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12063,0.831667,All-4-One,I Can Love You Like That (LP Version)
45889,-0.158333,Jim Cuddy,All In Time
38265,-0.168333,Gaelic Storm,The Leaving Of Liverpool
29014,-0.168333,Wild Nothing,Bored Games
8950,-0.168333,Medina Azahara,Andalucia
46758,-0.168333,Lifter Puller,The Langelos


## Подготавливаем рекомендации для продакшена

In [14]:
def recommend(user_id, scores, k):
    """Return the labels of the ``k`` best-scored columns for one user.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame of scores; the row selected by ``user_id`` is ranked.
        k: Number of leading entries to keep after sorting by descending score.

    Returns:
        A list with the column labels of ``scores`` for the kept entries,
        ordered from the highest score to the lowest.

    Raises:
        KeyError: If ``user_id`` is not a row label of ``scores``.
    """
    ranked = scores.loc[user_id].sort_values(ascending=False)
    return ranked.iloc[:k].index.tolist()

## top 100

In [15]:
# Export the top-100 recommendations as JSON lines: one {"user", "tracks"}
# record per distinct user found in `data`.
# Note: the loop rebinds the notebook-level name `user`.
users = data["user"].unique()

with open(BOTIFY_DATA_DIR + "recommendations_ub_100.json", "w") as rf:
    for user in tqdm.tqdm(users):
        # int() converts the numpy integer so that json can serialise it.
        recommendation = dict(user=int(user), tracks=recommend(user, scores, 100))
        rf.write(f"{json.dumps(recommendation)}\n")

100%|██████████████████████████████████████| 9443/9443 [00:17<00:00, 535.07it/s]


## top 50

In [16]:
# Export the top-50 recommendations as JSON lines: one {"user", "tracks"}
# record per distinct user found in `data`.
users = data["user"].unique()

with open(BOTIFY_DATA_DIR + "recommendations_ub_50.json", "w") as rf:
    for user in tqdm.tqdm(users):
        # int() converts the numpy integer so that json can serialise it.
        recommendation = dict(user=int(user), tracks=recommend(user, scores, 50))
        rf.write(f"{json.dumps(recommendation)}\n")

100%|██████████████████████████████████████| 9443/9443 [00:17<00:00, 537.79it/s]


## top 200

In [17]:
# Export the top-200 recommendations as JSON lines: one {"user", "tracks"}
# record per distinct user found in `data`.
users = data["user"].unique()

with open(BOTIFY_DATA_DIR + "recommendations_ub_200.json", "w") as rf:
    for user in tqdm.tqdm(users):
        # int() converts the numpy integer so that json can serialise it.
        recommendation = dict(user=int(user), tracks=recommend(user, scores, 200))
        rf.write(f"{json.dumps(recommendation)}\n")

100%|██████████████████████████████████████| 9443/9443 [00:18<00:00, 504.62it/s]
