In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from collections import defaultdict
from tqdm import tqdm
from surprise.model_selection import cross_validate, train_test_split
from scipy.sparse import csr_matrix

In [2]:
train = pd.read_csv('recsys_music_vk/train.csv')
train

Unnamed: 0,userId,id,artistId
0,-9223365420690745920,4789011704403477758,4862000404086590321
1,-9223357012432074795,-3685301030413601284,-5911062739952260612
2,-9223357012432074795,-1040959798245656474,6738173673409941080
3,-9223331337535253792,-8750332497321603794,-2531033142411854446
4,-9223331337535253792,7353026449734842135,-285672225852509948
...,...,...,...
4920099,9223323429584020370,-6973328648673682165,-3904789406522786641
4920100,9223323429584020370,3712711740551503640,-3690361587617342969
4920101,9223341540783520542,8309155247938420417,-7126264979390954459
4920102,9223351321092564051,-3093196378698218885,5622172636319693268


In [4]:
train = train[['userId', 'id']]

In [5]:
train

Unnamed: 0,userId,id
0,-9223365420690745920,4789011704403477758
1,-9223357012432074795,-3685301030413601284
2,-9223357012432074795,-1040959798245656474
3,-9223331337535253792,-8750332497321603794
4,-9223331337535253792,7353026449734842135
...,...,...
4920099,9223323429584020370,-6973328648673682165
4920100,9223323429584020370,3712711740551503640
4920101,9223341540783520542,8309155247938420417
4920102,9223351321092564051,-3093196378698218885


In [6]:
train['weight'] = 1

In [12]:
users_train = set(train['userId'])
user2idx = {u: i for i, u in enumerate(users_train)}
idx2user = {i: u for i, u in enumerate(users_train)}

In [15]:
items_train = set(train['id'])
item2idx = {u: i for i, u in enumerate(items_train)}
idx2item = {i: u for i, u in enumerate(items_train)}

In [17]:
train['userId'] = train['userId'].map(user2idx)

In [20]:
train['id'] = train['id'].map(item2idx)

In [21]:
train

Unnamed: 0,userId,id,weight
0,996772,162886,1
1,28477,333755,1
2,28477,530506,1
3,1005410,448394,1
4,1005410,613949,1
...,...,...,...
4920099,169265,2268,1
4920100,169265,602521,1
4920101,54069,339306,1
4920102,953224,216939,1


In [23]:
(len(users_train), len(items_train))

(1166360, 629330)

In [26]:
train.astype(pd.SparseDtype("float", np.nan))

Unnamed: 0,userId,id,weight
0,996772.0,162886.0,1.0
1,28477.0,333755.0,1.0
2,28477.0,530506.0,1.0
3,1005410.0,448394.0,1.0
4,1005410.0,613949.0,1.0
...,...,...,...
4920099,169265.0,2268.0,1.0
4920100,169265.0,602521.0,1.0
4920101,54069.0,339306.0,1.0
4920102,953224.0,216939.0,1.0


In [30]:
mtx = csr_matrix(train[['userId', 'id']])

In [35]:
mtx = csr_matrix((train['weight'], (train['userId'], train['id'])), shape=(len(users_train), len(items_train)))

In [None]:
np.array().

In [60]:
train['userId'].value_counts()

userId
1161472    12413
1002670     7957
950641      4559
249489      4404
662254      4354
           ...  
491074         1
858902         1
490823         1
413406         1
927373         1
Name: count, Length: 1166360, dtype: int64

In [63]:
mtx[1161472].toarray().tolist()[0].count(1)

12413

In [37]:
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix

def plot_coo_matrix(m):
    if not isinstance(m, coo_matrix):
        m = coo_matrix(m)
    fig = plt.figure()
    ax = fig.add_subplot(111, facecolor='black')
    ax.plot(m.col, m.row, 's', color='white', ms=1)
    ax.set_xlim(0, m.shape[1])
    ax.set_ylim(0, m.shape[0])
    ax.set_aspect('equal')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.invert_yaxis()
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])
    return ax

In [10]:
train.columns = ['userId:token', 'id:token', 'artistId:token']

In [18]:
train['timestamp:float'] = range(len(train))

In [19]:
train[['userId:token', 'id:token', 'timestamp:float']].to_csv('recsys_music_vk/recbole/vk.csv', index=False)

In [12]:
gb = train[['id:token', 'artistId:token']].groupby('id:token').agg(pd.Series.mode)

In [13]:
gb.reset_index()['id:token'].value_counts()

id:token
-9223319875987156276    1
 3078715584218385455    1
 3078557274697066264    1
 3078567090825961540    1
 3078574230345570405    1
                       ..
-3080689036666377598    1
-3080685305053056212    1
-3080642327310137654    1
-3080641383902613352    1
 9223353768117050793    1
Name: count, Length: 629330, dtype: int64

In [1]:
from recbole.quick_start import run_recbole

2025-02-11 16:54:16.984940: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [29]:
run_recbole(model='Pop', dataset='vk', 
           config_file_list=['recsys_music_vk/recbole/vk/config.yaml']
           )

11 Feb 18:06    INFO  ['/home/xbar19/ML/MLEnv/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/xbar19/.local/share/jupyter/runtime/kernel-bff4fdc4-3b5d-433b-95ff-cb3e4fbb6560.json']
11 Feb 18:06    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recsys_music_vk/recbole/vk
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 3
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.19, 0.01]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
re

In [14]:
gb.to_csv('recsys_music_vk/recbole/vk.csv', index=False)

In [7]:
train['rating'] = 1

In [5]:
reader = Reader(rating_scale=(0, 1))

In [6]:
algo = SVD(n_factors=50)

In [8]:
ds = Dataset.load_from_df(train[['userId', 'id', 'rating']], reader)

In [14]:
trainset, testset = train_test_split(ds, test_size=0.2, random_state=42)

In [15]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f54851b9000>

In [19]:
user_listened_tracks = defaultdict(set)
for _, row in tqdm(train.iterrows()):
    user_listened_tracks[row['userId']].add(row['id'])

4920104it [01:15, 65087.59it/s]


In [21]:
all_tracks = train['id'].unique()
def get_top_n_recommendations(user_id, n=3):
    # Уберем треки, которые пользователь уже слушал
    unlistened_tracks = [track for track in all_tracks if track not in user_listened_tracks[user_id]]
    # Предскажем рейтинги для непрослушанных треков
    predictions = [algo.predict(user_id, track) for track in unlistened_tracks]
    # Отсортируем предсказания по рейтингу
    predictions.sort(key=lambda x: x.est, reverse=True)
    # Вернем топ-N треков
    return [pred.iid for pred in predictions[:n]]

In [22]:
pd.read_csv('recsys_music_vk/submission.csv')

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-2227263297444998835,-4695036104375440069,-8860367773155756475
1,-9223315435871723421,-2227263297444998835,-4695036104375440069,-8860367773155756475
2,-9223312953116183861,-2227263297444998835,-4695036104375440069,-8860367773155756475
3,-9223189477397287224,-2227263297444998835,-4695036104375440069,-8860367773155756475
4,-9223099495950652905,-2227263297444998835,-4695036104375440069,-8860367773155756475
...,...,...,...,...
388891,9223205520969697450,-2227263297444998835,-4695036104375440069,-8860367773155756475
388892,9223233690124762843,-2227263297444998835,-4695036104375440069,-8860367773155756475
388893,9223256937547721480,-2227263297444998835,-4695036104375440069,-8860367773155756475
388894,9223293703257417871,-2227263297444998835,-4695036104375440069,-8860367773155756475


In [48]:
submission_users = pd.read_csv('recsys_music_vk/submission.csv')['userId'].unique()

# Генерация рекомендаций для каждого пользователя
# recommendations = []
# for user in tqdm(submission_users):
#     recommended_tracks = get_top_n_recommendations(user, n=3)
#     recommendations.append([user] + recommended_tracks)

In [3]:
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder

In [25]:
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()
artist_encoder = LabelEncoder()

# Кодируем userId, id и artistId
train['userId'] = user_encoder.fit_transform(train['userId'])
train['id'] = track_encoder.fit_transform(train['id'])
train['artistId'] = artist_encoder.fit_transform(train['artistId'])

In [41]:
data = train.copy()

In [42]:
dataset = Dataset()
dataset.fit(users=data['userId'].unique(),
            items=data['id'].unique(),
            item_features=data['artistId'].unique())

In [43]:
interactions, _ = dataset.build_interactions(zip(data['userId'], data['id']))

In [44]:
item_features = dataset.build_item_features((x, [y]) for x, y in zip(data['id'], data['artistId']))

In [45]:
model = LightFM(no_components=50, loss='warp')  # no_components — аналог n_factors в SVD
model.fit(interactions, item_features=item_features, epochs=20, num_threads=4, verbose=True)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [02:26<00:00,  7.33s/it]


<lightfm.lightfm.LightFM at 0x7f532c708790>

In [46]:
def get_top_n_recommendations(user_id, n=3):
    # Получаем все треки
    all_tracks = data['id'].unique()
    # Предсказываем рейтинги для всех треков
    scores = model.predict(user_id, all_tracks, item_features=item_features)
    # Сортируем треки по убыванию рейтинга
    top_tracks = all_tracks[np.argsort(-scores)]
    # Возвращаем топ-N треков
    return top_tracks[:n]

In [4]:
import rectools
from rectools.dataset import Dataset
from rectools.dataset.features import DenseFeatures, SparseFeatures, IdMap
from rectools import Columns

In [62]:
train

Unnamed: 0,userId,id,artistId
0,-9223365420690745920,4789011704403477758,4862000404086590321
1,-9223357012432074795,-3685301030413601284,-5911062739952260612
2,-9223357012432074795,-1040959798245656474,6738173673409941080
3,-9223331337535253792,-8750332497321603794,-2531033142411854446
4,-9223331337535253792,7353026449734842135,-285672225852509948
...,...,...,...
4920099,9223323429584020370,-6973328648673682165,-3904789406522786641
4920100,9223323429584020370,3712711740551503640,-3690361587617342969
4920101,9223341540783520542,8309155247938420417,-7126264979390954459
4920102,9223351321092564051,-3093196378698218885,5622172636319693268


In [63]:
Columns.User

'user_id'

In [5]:
train.columns = [Columns.User, Columns.Item, 'artistId']
train

Unnamed: 0,user_id,item_id,artistId
0,-9223365420690745920,4789011704403477758,4862000404086590321
1,-9223357012432074795,-3685301030413601284,-5911062739952260612
2,-9223357012432074795,-1040959798245656474,6738173673409941080
3,-9223331337535253792,-8750332497321603794,-2531033142411854446
4,-9223331337535253792,7353026449734842135,-285672225852509948
...,...,...,...
4920099,9223323429584020370,-6973328648673682165,-3904789406522786641
4920100,9223323429584020370,3712711740551503640,-3690361587617342969
4920101,9223341540783520542,8309155247938420417,-7126264979390954459
4920102,9223351321092564051,-3093196378698218885,5622172636319693268


In [5]:
idmap = IdMap.from_values(train['item_id'].values)

In [75]:
train.loc[train['item_id'] == 5070279068330501452]

Unnamed: 0,user_id,item_id,artistId
283176,-8120145184429995217,5070279068330501452,2383145530433774070
2029953,-1636041827672401810,5070279068330501452,5843005928244594932
2201188,-989160456579025781,5070279068330501452,-4633425149168328588
2360170,-406090589880800750,5070279068330501452,6831642995075930029
2842265,1432046587625526532,5070279068330501452,5843005928244594932
3922845,5456855360814845806,5070279068330501452,1972603909932029167
4245012,6665956668819439866,5070279068330501452,5843005928244594932
4888875,9107201330287019163,5070279068330501452,5843005928244594932


In [6]:
train['item_id'] = train['item_id'].astype(str) + '_' + train['artistId'].astype(str)

In [6]:
train

Unnamed: 0,user_id,item_id,artistId
0,-9223365420690745920,4789011704403477758_4862000404086590321,4862000404086590321
1,-9223357012432074795,-3685301030413601284_-5911062739952260612,-5911062739952260612
2,-9223357012432074795,-1040959798245656474_6738173673409941080,6738173673409941080
3,-9223331337535253792,-8750332497321603794_-2531033142411854446,-2531033142411854446
4,-9223331337535253792,7353026449734842135_-285672225852509948,-285672225852509948
...,...,...,...
4920099,9223323429584020370,-6973328648673682165_-3904789406522786641,-3904789406522786641
4920100,9223323429584020370,3712711740551503640_-3690361587617342969,-3690361587617342969
4920101,9223341540783520542,8309155247938420417_-7126264979390954459,-7126264979390954459
4920102,9223351321092564051,-3093196378698218885_5622172636319693268,5622172636319693268


In [74]:
train[['item_id', 'artistId']].drop_duplicates()['item_id'].value_counts()

item_id
 5070279068330501452    5
-6467494677281402274    5
 4481730207054338272    4
-1355749385261276747    4
 8527365250157953982    4
                       ..
 8312374410148845933    1
-806689224870263072     1
 3938851722130084185    1
-7852283413896693105    1
 2123071927435539666    1
Name: count, Length: 629330, dtype: int64

In [71]:
item_features = DenseFeatures.from_dataframe(train[['item_id', 'artistId']].drop_duplicates(), id_col='item_id', id_map=idmap)

ValueError: Ids in dataframe must be unique

In [51]:
len(submission_users)

388896

In [54]:
from surprise.prediction_algorithms import Prediction

In [7]:
train[Columns.Datetime] = 1

In [8]:
train[Columns.Weight] = 1

In [9]:
ds = Dataset.construct(train.drop(columns='artistId'))

In [11]:
ds

Dataset(user_id_map=IdMap(external_ids=array([-9223365420690745920, -9223357012432074795, -9223331337535253792,
       ...,  9223341540783520542,  9223351321092564051,
        9223371011342822842])), item_id_map=IdMap(external_ids=array(['4789011704403477758_4862000404086590321',
       '-3685301030413601284_-5911062739952260612',
       '-1040959798245656474_6738173673409941080', ...,
       '-2931774007284753508_-4218997110618987591',
       '-6245722852470884312_-1384941453457413172',
       '2123071927435539666_-1384941453457413172'], dtype=object)), interactions=Interactions(df=         user_id  item_id  weight                      datetime
0              0        0     1.0 1970-01-01 00:00:00.000000001
1              1        1     1.0 1970-01-01 00:00:00.000000001
2              1        2     1.0 1970-01-01 00:00:00.000000001
3              2        3     1.0 1970-01-01 00:00:00.000000001
4              2        4     1.0 1970-01-01 00:00:00.000000001
...          ...      ... 

In [10]:
from rectools.models import LightFMWrapperModel
from lightfm import LightFM

model = LightFMWrapperModel(
        # внутри модели указываем параметр no_components
        # это размезность эмбеддингов, которые выучит модель
        model=LightFM(no_components = 10, item_alpha=1)
        )



In [11]:
model.fit(ds)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f2d52b75210>

In [12]:
sub = pd.read_csv('recsys_music_vk/submission.csv')
sub

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-2227263297444998835,-4695036104375440069,-8860367773155756475
1,-9223315435871723421,-2227263297444998835,-4695036104375440069,-8860367773155756475
2,-9223312953116183861,-2227263297444998835,-4695036104375440069,-8860367773155756475
3,-9223189477397287224,-2227263297444998835,-4695036104375440069,-8860367773155756475
4,-9223099495950652905,-2227263297444998835,-4695036104375440069,-8860367773155756475
...,...,...,...,...
388891,9223205520969697450,-2227263297444998835,-4695036104375440069,-8860367773155756475
388892,9223233690124762843,-2227263297444998835,-4695036104375440069,-8860367773155756475
388893,9223256937547721480,-2227263297444998835,-4695036104375440069,-8860367773155756475
388894,9223293703257417871,-2227263297444998835,-4695036104375440069,-8860367773155756475


In [13]:
recs = model.recommend(users=sub['userId'].unique(),
                       dataset=ds, 
                       k=3, filter_viewed=True)

In [14]:
recs.loc[recs['rank'] == 2, 'item_id'].value_counts()

item_id
2957260132109383577_-3200592824099817597     221790
7200408697552426542_-1820520629703737542     167027
-6483949695788118226_-4335273471619998413        12
2248023499283390095_-2092946365572069367          9
7173417807261608128_-8250830488160323337          7
-77588350185074450_-1238516356316409863           5
7784327918925939019_3271291963079767286           5
5207811820975355724_-8012373155940195240          4
8172706905504809477_-881523960919551605           4
-8959801802950779801_6758737809529525644          4
-8020190406615945128_-5585260879725572941         3
1521957696872268582_-5999388932388518213          3
-441802527107339623_-1166204734664754196          2
2822072713254465645_3720242214155296729           2
-7579097815205358934_-8952756520606106828         2
-8763062237036787703_-7494663662323819446         2
-379663161176396764_8867807711981221634           1
6968241575079683196_-6764623431090898875          1
2159321784421463369_-5951134964865843780          1
5020

In [20]:
recs.loc[recs['rank'] == 3, 'item_id'].value_counts()

item_id
7200408697552426542_-1820520629703737542     221797
2957260132109383577_-3200592824099817597     167031
2248023499283390095_-2092946365572069367          6
-6483949695788118226_-4335273471619998413         5
7173417807261608128_-8250830488160323337          5
2509952360005167174_6430413640089748666           4
-8959801802950779801_6758737809529525644          4
-4515975528852520966_-4488560309806482897         4
7653157716566687317_1397082796315862019           3
-4434608498776032978_2051084851707779665          3
-441802527107339623_-1166204734664754196          2
930151347829982872_569336915767488415             2
-2632380308348811469_-7165433957338165203         2
-815369767266630179_1413612511491545561           2
7784327918925939019_3271291963079767286           2
6968241575079683196_-6764623431090898875          2
1521957696872268582_-5999388932388518213          1
-6586756604999748916_-4481147419298292872         1
5294160809619344104_4561115434137379568           1
7207

In [19]:
def inference(user):
    preds = recs[recs['user_id'] == user]
    p = []
    for i in range(1, 4):
        p.append(int(preds.loc[preds['rank'] == i, 'item_id'].item().split('_')[0]))
    return p

In [20]:
sub[[f'recommended_id_{i}' for i in range(1, 4)]] = sub['userId'].apply(inference)

ValueError: Columns must be same length as key

In [23]:
sub[[f'recommended_id_{i}' for i in range(1, 4)]] = None

In [25]:
sub

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-2227263297444998835,-8860367773155756475,-4695036104375440069
1,-9223315435871723421,-2227263297444998835,-8860367773155756475,-4695036104375440069
2,-9223312953116183861,-2227263297444998835,-8860367773155756475,-4695036104375440069
3,-9223189477397287224,-2227263297444998835,-8860367773155756475,-4695036104375440069
4,-9223099495950652905,-2227263297444998835,-8860367773155756475,-4695036104375440069
...,...,...,...,...
388891,9223205520969697450,,,
388892,9223233690124762843,,,
388893,9223256937547721480,,,
388894,9223293703257417871,,,


In [19]:
preds

Unnamed: 0,user_id,item_id,score,rank
132072,-6788780282756898784,-6483949695788118226_-4335273471619998413,0.067667,1
132073,-6788780282756898784,7200408697552426542_-1820520629703737542,0.067642,2
132074,-6788780282756898784,2957260132109383577_-3200592824099817597,0.067641,3


In [24]:
sub

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-6483949695788118226,2957260132109383577,7200408697552426542
1,-9223315435871723421,-6483949695788118226,2957260132109383577,7200408697552426542
2,-9223312953116183861,-6483949695788118226,7200408697552426542,2957260132109383577
3,-9223189477397287224,-6483949695788118226,2957260132109383577,7200408697552426542
4,-9223099495950652905,-6483949695788118226,7200408697552426542,2957260132109383577
...,...,...,...,...
388891,9223205520969697450,-2227263297444998835,-4695036104375440069,-8860367773155756475
388892,9223233690124762843,-2227263297444998835,-4695036104375440069,-8860367773155756475
388893,9223256937547721480,-2227263297444998835,-4695036104375440069,-8860367773155756475
388894,9223293703257417871,-2227263297444998835,-4695036104375440069,-8860367773155756475


In [27]:
sub['recommended_id_1'] = recs.loc[recs['rank'] == 1, 'item_id'].values
sub['recommended_id_2'] = recs.loc[recs['rank'] == 2, 'item_id'].values
sub['recommended_id_3'] = recs.loc[recs['rank'] == 3, 'item_id'].values

In [28]:
sub

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542
1,-9223315435871723421,-6483949695788118226_-4335273471619998413,7200408697552426542_-1820520629703737542,2957260132109383577_-3200592824099817597
2,-9223312953116183861,-6483949695788118226_-4335273471619998413,7200408697552426542_-1820520629703737542,2957260132109383577_-3200592824099817597
3,-9223189477397287224,-6483949695788118226_-4335273471619998413,7200408697552426542_-1820520629703737542,2957260132109383577_-3200592824099817597
4,-9223099495950652905,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542
...,...,...,...,...
388891,9223205520969697450,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542
388892,9223233690124762843,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542
388893,9223256937547721480,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542
388894,9223293703257417871,-6483949695788118226_-4335273471619998413,2957260132109383577_-3200592824099817597,7200408697552426542_-1820520629703737542


In [31]:
sub['recommended_id_1'] = sub['recommended_id_1'].str.split('_').apply(lambda x: x[0])

0         -6483949695788118226
1         -6483949695788118226
2         -6483949695788118226
3         -6483949695788118226
4         -6483949695788118226
                  ...         
388891    -6483949695788118226
388892    -6483949695788118226
388893    -6483949695788118226
388894    -6483949695788118226
388895    -6483949695788118226
Name: recommended_id_1, Length: 388896, dtype: object

In [23]:
recs.loc[recs['rank'] == 1, ]

Unnamed: 0,user_id,item_id,score,rank
0,-9223331337535253792,-6483949695788118226_-4335273471619998413,0.047694,1
3,-9223312953116183861,-6483949695788118226_-4335273471619998413,0.187984,1
6,-9223099495950652905,-6483949695788118226_-4335273471619998413,0.086028,1
9,-9222957206048518287,-6483949695788118226_-4335273471619998413,0.047699,1
12,-9222870482588617996,-6483949695788118226_-4335273471619998413,0.025611,1
...,...,...,...,...
1166673,9222233924659284930,-6483949695788118226_-4335273471619998413,0.000614,1
1166676,9222344996066051590,-6483949695788118226_-4335273471619998413,0.000614,1
1166679,9222512814920300906,-6483949695788118226_-4335273471619998413,0.000614,1
1166682,9222609400741559907,-6483949695788118226_-4335273471619998413,0.000614,1


In [17]:
for user in tqdm(sub['userId'].unique()):
    preds = recs[recs['user_id'] == user]
    for i in range(1, 4):
        sub.loc[sub['userId'] == user, f'recommended_id_{i}'] = int(preds.loc[preds['rank'] == i, 'item_id'].item().split('_')[0])

 13%|██████████████                                                                                            | 51416/388896 [04:02<26:31, 212.05it/s]

KeyboardInterrupt



In [30]:
pd.read_csv('recsys_music_vk/submission.csv')

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-2227263297444998835,-4695036104375440069,-8860367773155756475
1,-9223315435871723421,-2227263297444998835,-4695036104375440069,-8860367773155756475
2,-9223312953116183861,-2227263297444998835,-4695036104375440069,-8860367773155756475
3,-9223189477397287224,-2227263297444998835,-4695036104375440069,-8860367773155756475
4,-9223099495950652905,-2227263297444998835,-4695036104375440069,-8860367773155756475
...,...,...,...,...
388891,9223205520969697450,-2227263297444998835,-4695036104375440069,-8860367773155756475
388892,9223233690124762843,-2227263297444998835,-4695036104375440069,-8860367773155756475
388893,9223256937547721480,-2227263297444998835,-4695036104375440069,-8860367773155756475
388894,9223293703257417871,-2227263297444998835,-4695036104375440069,-8860367773155756475


In [31]:
sub

Unnamed: 0,userId,recommended_id_1,recommended_id_2,recommended_id_3
0,-9223331337535253792,-2227263297444998835,-8860367773155756475,-4695036104375440069
1,-9223315435871723421,-2227263297444998835,-8860367773155756475,-4695036104375440069
2,-9223312953116183861,-2227263297444998835,-8860367773155756475,-4695036104375440069
3,-9223189477397287224,-2227263297444998835,-8860367773155756475,-4695036104375440069
4,-9223099495950652905,-2227263297444998835,-8860367773155756475,-4695036104375440069
...,...,...,...,...
388891,9223205520969697450,-2227263297444998835,-8860367773155756475,-4695036104375440069
388892,9223233690124762843,-2227263297444998835,-8860367773155756475,-4695036104375440069
388893,9223256937547721480,-2227263297444998835,-8860367773155756475,-4695036104375440069
388894,9223293703257417871,-2227263297444998835,-8860367773155756475,-4695036104375440069


In [29]:
sub.to_csv('recsys_music_vk/submissions/lfm.csv', index=False)

In [16]:
recs

Unnamed: 0,user_id,item_id,score,rank
0,-9223331337535253792,-2227263297444998835_3978571263572442387,4.335444,1
1,-9223331337535253792,-8860367773155756475_3181968405932737874,4.321002,2
2,-9223331337535253792,-4695036104375440069_-1317879221316914278,4.266007,3
3,-9223312953116183861,-2227263297444998835_3978571263572442387,4.474041,1
4,-9223312953116183861,-8860367773155756475_3181968405932737874,4.461549,2
...,...,...,...,...
1166683,9222609400741559907,-8860367773155756475_3181968405932737874,4.296211,2
1166684,9222609400741559907,-4695036104375440069_-1317879221316914278,4.241176,3
1166685,9223256937547721480,-2227263297444998835_3978571263572442387,4.310917,1
1166686,9223256937547721480,-8860367773155756475_3181968405932737874,4.296211,2


In [56]:
model.predict()

5

In [49]:
# submission_users = user_encoder.transform(submission_users)  # Кодируем userId

# Генерация рекомендаций для каждого пользователя
recommendations = []
for user in tqdm(submission_users):
    recommended_tracks = get_top_n_recommendations(user, n=3)
    # Декодируем id треков обратно в исходный формат
    recommended_tracks = track_encoder.inverse_transform(recommended_tracks)
    recommendations.append([user_encoder.inverse_transform([user])[0]] + list(recommended_tracks))

  0%|                                                                                                                       | 0/388896 [00:00<?, ?it/s]


TypeError: object of type 'numpy.int64' has no len()

In [26]:
train

Unnamed: 0,userId,id,artistId,rating
0,0,477925,108357,1
1,1,189097,25598,1
2,1,279382,122729,1
3,2,16063,51457,1
4,2,565644,68717,1
...,...,...,...,...
4920099,1166356,77050,41053,1
4920100,1166356,441231,42657,1
4920101,1166357,598218,16233,1
4920102,1166358,209331,114256,1


In [None]:
cross_validate