In [1]:
import tqdm
import json

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

## User-based Collaborative Filtering

#### Основная идея: 
Рекомендовать пользователю треки, которые понравились похожим на него пользователям

$$\hat r_{ui} = h^{-1} \left( \frac{\sum_{v \in N_i(u)} w_{uv} h(r_{vi})}{\sum_{v \in N_i(u)} w_{uv}} \right)$$

$N_i(u)$ - соседи пользователя $u$, которые оценили айтем $i$,
$w_{uv}, w_{ij}$ - веса соседей, 
$h$ - функция нормализации



**Нормализация**: В качестве функции нормализации используем центрирование — из времени прослушивания вычитаем среднее время прослушивания пользователя: $h(r_{ui}) = r_{ui} - \bar r_u$

**Веса**: Похожих пользователей будем искать по *cosine similarity*

**Отсутствующие данные**: заполним средним временем прослушивания по пользователю

**Соседи**: в качестве соседей будем рассматривать всех пользователей. Q: Как это упростит формулу?

In [49]:
# Root of the local botify checkout; later cells build file paths from it.
BOTIFY_DATA_DIR = '/home/tatiana/MADE_Ubuntu/MADE_2/RecSys/recsys-itmo-spring-2023/botify'

# Read the line-delimited JSON log and keep only the three columns used below.
data = (
    pd.read_json("/home/tatiana/MADE_Ubuntu/MADE_2/RecSys/data/data.json", lines=True)
    .loc[:, ["user", "time", "track"]]
    .copy()
)

data.head()

Unnamed: 0,user,time,track
0,5084,1.0,192
1,5084,1.0,192
2,5084,1.0,424
3,5084,0.02,2254
4,5084,1.0,171


In [32]:
# Center every listening time on the mean of its own user, so a positive value
# means "listened longer than this user does on average".
time_by_user = data.groupby("user")["time"]
data["normalized_time"] = time_by_user.transform(lambda listened: listened - listened.mean())
data.head()

Unnamed: 0,user,time,track,normalized_time
0,5084,1.0,192,0.483158
1,5084,1.0,192,0.483158
2,5084,1.0,424,0.483158
3,5084,0.02,2254,-0.496842
4,5084,1.0,171,0.483158


In [33]:
# Build the user x track matrix of mean-centered listening times.
# pivot_table averages repeated (user, track) rows; pairs that were never
# observed become 0, which after centering stands for the user's average time.
interactions = pd.pivot_table(
    data,
    values="normalized_time",
    index="user",
    columns="track",
    aggfunc="mean",
).fillna(0)

# Share of non-zero cells. This is the matrix *density*; sparsity is its
# complement, so report both instead of labelling the density as sparsity.
density = (interactions != 0).values.sum() / interactions.size

print(f"Interactions matrix: shape={interactions.shape}, density={density}, sparsity={1 - density}")

Interactions matrix: shape=(3023, 8403), sparsity=0.0010897451719765663


In [35]:
# Non-zero entries in the row of user 2.
interactions.loc[2].loc[lambda row: row != 0]

track
170    -0.2075
246     0.5625
424    -0.4275
4205    0.0625
7585   -0.1875
7617    0.5625
Name: 2, dtype: float64

In [36]:
# Pairwise cosine similarity between the user rows of the interactions matrix.
similarity_matrix = cosine_similarity(interactions)
# Zero the diagonal so that a user is never counted as their own neighbour.
np.fill_diagonal(similarity_matrix, 0)

positive_per_user = (similarity_matrix > 0).sum(axis=1)
print(f"Mean positive neighbours per user: {positive_per_user.mean()}")

Mean positive neighbours per user: 314.31822692689383


In [37]:
# Same statistic for neighbours whose similarity is negative.
negative_per_user = (similarity_matrix < 0).sum(axis=1)
print(f"Mean negative neighbours per user: {negative_per_user.mean()}")

Mean negative neighbours per user: 229.74594773403902


In [39]:
# Score every (user, track) pair as the similarity-weighted sum of the other
# users' centered listening times (the diagonal of similarity_matrix is zero).
# No division by the sum of weights is applied here.
# Result shape: observed users x observed tracks, same as `interactions`.
scores_matrix = similarity_matrix @ interactions.values

scores = pd.DataFrame(
    data=scores_matrix,
    index=interactions.index,
    columns=interactions.columns,
)

scores.loc[:, [1, 2, 5, 7]].head()

track,1,2,5,7
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.0,0.0,0.0,0.014096
3,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
14,0.055306,0.0,0.0,0.051164
15,0.0,0.0,0.0,0.0


In [40]:
# Fraction of (user, track) cells that received a non-zero score.
(scores.values != 0).sum() / interactions.size

0.11181694044732776

## Глянем на рекомендации

In [53]:
# Track metadata (one JSON object per line), indexed by track id.
tracks_path = BOTIFY_DATA_DIR + '/data/' + "tracks.json"
products = pd.read_json(tracks_path, lines=True)
products = products.set_index("track")
products.head()

Unnamed: 0_level_0,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Jack Johnson,The Cove
1,Billy Preston,Nothing from Nothing
2,Paco De Lucia,Entre Dos Aguas
3,Josh Rouse,Under Cold Blue Stars
4,The Dead 60s,Riot Radio (Soundtrack Version)


In [60]:
# How many top-scored tracks to show for the sampled user.
k = 10
# Sample one observed user (unseeded, so the pick changes between runs).
user = np.random.choice(scores.index)

In [61]:
# Raw listening log rows of the sampled user.
data.loc[data["user"] == user]

Unnamed: 0,user,time,track
16148,2087,1.0,18065
16150,2087,0.09,140
16151,2087,0.09,1305
16152,2087,0.0,179
16154,2087,0.55,162
16156,2087,0.0,175


In [62]:
# Top-k tracks for the sampled user by score, joined with the track metadata
# from `products` on the track id index.
top_scores = scores.loc[user].sort_values(ascending=False).iloc[:k].to_frame("score")
user_scores = top_scores.merge(products, left_index=True, right_index=True, how="inner")

user_scores

Unnamed: 0_level_0,score,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
162,2.039224,Train,Hey_ Soul Sister
309,0.273295,Erin McKeown,Fast As I Can
2939,0.273065,Alliance Ethnik,Sincerité Et Jalousie
23694,0.260884,Masta Killa,Ringing Bells
222,0.245696,Daft Punk,Face To Face
5718,0.245156,The Killers,Bones
3553,0.244017,Soltero,Step Through The Door
1159,0.240562,Soltero,Ghost At The Foot Of The Bed
2350,0.223655,The All-American Rejects,Don't Leave Me
185,0.221769,Coldplay,Clocks


In [63]:
# The sampled user's row of the interactions matrix, highest value first,
# joined with track metadata. The "time" column holds the centered values
# stored in `interactions`, not the raw listening time.
listened = interactions.loc[user].sort_values(ascending=False).to_frame("time")
user_interactions = listened.merge(products, left_index=True, right_index=True, how="inner")

# Show only the cells that are non-zero for this user.
user_interactions.loc[user_interactions["time"] != 0]

Unnamed: 0_level_0,time,artist,title
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18065,0.711667,The Smiths,There Is A Light That Never Goes Out
162,0.261667,Train,Hey_ Soul Sister
140,-0.198333,Justin Bieber,Somebody To Love
1305,-0.198333,Jason Derulo,Whatcha Say
179,-0.288333,Sam Cooke,Ain't Misbehavin
175,-0.288333,Kings Of Leon,Revelry


## Подготавливаем рекомендации для продакшена

In [64]:
def recommend(user_id, scores, k):
    """Return the k highest-scored track ids for one user, best first.

    Args:
        user_id: Row label of the user in ``scores``.
        scores: DataFrame with users as the index and tracks as the columns.
        k: Maximum number of tracks to return.

    Returns:
        A plain list with the column labels of the top-scored tracks.
    """
    ranked = scores.loc[user_id].sort_values(ascending=False)
    top = ranked.iloc[:k]
    return top.index.tolist()

In [65]:
users = data["user"].unique()

# BOTIFY_DATA_DIR has no trailing slash, so the file name needs an explicit
# separator; otherwise the file is created as ".../botifyrecommendations_ub.json"
# in the parent directory. Write it under data/, where tracks.json is read from.
recommendations_path = BOTIFY_DATA_DIR + "/data/" + "recommendations_ub.json"

# One JSON object per line: {"user": <id>, "tracks": [<top-100 track ids>]}.
with open(recommendations_path, "w") as rf:
    # Loop over `user_id` rather than `user`, so the `user` sampled for the
    # inspection cells is not overwritten by the last exported user.
    for user_id in tqdm.tqdm(users):
        recommendation = {
            "user": int(user_id),  # numpy integers are not JSON serializable
            "tracks": recommend(user_id, scores, 100)
        }
        rf.write(json.dumps(recommendation) + "\n")

100%|█████████████████████████████████████| 3023/3023 [00:01<00:00, 2862.98it/s]
