In [1]:
import pandas as pd
import numpy as np
from copy import copy

from recommender.data_processing import map_column, get_context
from recommender.training import Dataset
from recommender.models import Recommender

from torch.utils.data import DataLoader
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/train.csv')
df.sort_values(by='ts_listen', inplace=True)

In [3]:
sample_size = 100
sample_id = np.sort(df.user_id.unique())[:sample_size].tolist()
sample_data = copy(df.query(f"user_id=={sample_id}"))
sample_data, sample_mapping, sample_inverse_mapping = map_column(sample_data, col_name="media_id")

gb = sample_data.groupby(by="user_id")

In [4]:
train_list, test_list = [], []

# Sampling based on user and split 80:20 by sorted 'ts_listen'

for group in list(gb.groups):
    df_group = gb.get_group(group)
    train_group, test_group = train_test_split(df_group, test_size=0.2, random_state=0, shuffle=False)
    train_list.append(train_group)
    test_list.append(test_group)

In [5]:
train = pd.concat(train_list)
test = pd.concat(test_list)

In [6]:
data_train = copy(train.query("is_listened==1"))
grp_by_train = data_train.groupby(by="user_id")
groups = list(grp_by_train.groups)

In [7]:
history_size = 120

train_data = Dataset(
    groups=groups,
    grp_by=grp_by_train,
    split="train",
    history_size=history_size,
)

In [12]:
batch_size = 32

train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    num_workers=0,
    shuffle=False,
)

In [13]:
log_dir = "recommender_logs"
model_dir = "recommender_models"
# model_dir = "recommender_models_cpu"

model = Recommender(
    vocab_size=len(sample_mapping) + 2,
    lr=1e-4,
    dropout=0.3,
)

logger = TensorBoardLogger(
    save_dir=log_dir,
)

checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath=model_dir,
    filename="",
)

In [16]:
epochs = 100

trainer = pl.Trainer(
    max_epochs=epochs,
    logger=logger,
    # accelerator='gpu',
    # devices=1,
    callbacks=[checkpoint_callback],
)

trainer.fit(model, train_loader, train_loader)

In [15]:
result_val = trainer.test(dataloaders=train_loader)

output_json = {
    "val_loss": result_val[0]["test_loss"],
    "best_model_path": checkpoint_callback.best_model_path,
}

print(output_json)


  rank_zero_warn(
Restoring states from the checkpoint path at /Users/moon/HSLU/RecommenderSystem/recommender_models/epoch=85-step=344.ckpt
Loaded model weights from checkpoint at /Users/moon/HSLU/RecommenderSystem/recommender_models/epoch=85-step=344.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy        0.0024678826328310864
        test_loss            9.973223686218262
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
{'val_loss': 9.973223686218262, 'best_model_path': '/Users/moon/HSLU/RecommenderSystem/recommender_models/epoch=85-step=344.ckpt'}


In [20]:
sample_data.query('media_id==206493')

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,media_id_mapped
1643346,10,1478522553,206493,40434,0,20070424,0,0,155,0,0,70,1503,25,1,8
1643347,10,1478532067,206493,40434,0,20070424,0,0,155,0,0,70,1503,25,0,8


In [110]:
sample_data.groupby('user_id').count()

Unnamed: 0_level_0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,artist_id,user_age,is_listened,media_id_mapped
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317,6317
1,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240,6240
2,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211,6211
3,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581,5581
4,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426,5426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009,3009
96,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008,3008
97,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997,2997
98,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989,2989


In [75]:
list_medias = data_train.query('user_id==0')['media_id'].values.tolist()[-100:]

In [80]:
len(list_medias[:80]), len(list_medias[-20:]), len(list_medias)

(80, 20, 100)

In [84]:
list_medias[:80][-1], list_medias[79]

(7024835, 7024835)

In [85]:
import torch

PAD = 0
MASK = 1

# list_medias = []

ids = [PAD] * (120 - len(list_medias[:80]) - 1) + [sample_mapping[a] for a in list_medias[:80]] + [MASK]

# ids = [PAD] * (120 - 1 - 1) + [sample_mapping[206493]] + [MASK]

src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

In [86]:
with torch.no_grad():
    prediction = model(src)

In [87]:
masked_pred = prediction[0, -1].numpy()

sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]

sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]

# return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie]

In [104]:
sorted_predicted_ids

[53496,
 48658,
 35909,
 52684,
 39334,
 52678,
 45669,
 49871,
 49220,
 52671,
 48085,
 55795,
 10252,
 11774,
 52680,
 52922,
 45497,
 50664,
 45646,
 47984,
 55791,
 55201,
 52670,
 16945,
 34303,
 49610,
 51716,
 31712,
 48366,
 38990,
 48416,
 48381,
 25103,
 20802,
 45581,
 37889,
 9784,
 3979,
 48269,
 1803,
 42573,
 40383,
 39094,
 37272,
 49600,
 4642,
 40562,
 49238,
 52673,
 48412,
 27741,
 50593,
 52672,
 10192,
 31934,
 54604,
 54246,
 10017,
 45296,
 41025,
 44285,
 34565,
 51343,
 53953,
 54792,
 21096,
 42383,
 17721,
 41850,
 1163,
 17601,
 12979,
 47734,
 25436,
 46065,
 29550,
 9484,
 28553,
 52084,
 44099,
 49084,
 34145,
 53483,
 44546,
 54049,
 51298,
 54999,
 31684,
 53391,
 38510,
 39233,
 9400,
 37698,
 51685,
 6753,
 56192,
 3769,
 53414,
 39741,
 48657,
 46507,
 3976,
 2256,
 46943,
 54818,
 43215,
 45698,
 47953,
 40990,
 8442,
 52681,
 18963,
 33299,
 20912,
 42293,
 53890,
 36330,
 47624,
 33478,
 44457,
 20063,
 30997,
 48257,
 13081,
 36888,
 50004,
 433

In [109]:
recommended = [sample_inverse_mapping[a] for a in sorted_predicted_ids[:1000] if a in sample_inverse_mapping.keys()]
# sample_inverse_mapping[sorted_predicted_ids[0]]

if list_medias[80] in recommended:
    print('yes')

In [108]:
list_medias[80]

15075139

In [2]:
data, mapping, inverse_mapping = map_column(data, col_name="media_id")

NameError: name 'map_column' is not defined

In [1]:
mapping[206493]

NameError: name 'mapping' is not defined

In [5]:
# data.query(f"media_id=={inverse_mapping[36474]}")

# data.query(f"user_id==[88,11]").media_id_mapped.unique()


In [6]:
# sorted_predicted_ids[:30]