In [118]:
import pandas as pd
import numpy as np
import pickle
import cornac
from cornac.eval_methods import RatioSplit

In [3]:
items = pd.read_parquet('../data/processed/items.parquet.gzip')
users = pd.read_parquet('../data/processed/users.parquet.gzip')
interactions = pd.read_parquet('../data/processed/interactions.parquet.gzip')

In [5]:
feedback = list(interactions.itertuples(index=False, name=None))

In [9]:
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.1,
    val_size=0.15,
    exclude_unknowns=True,
    verbose=True,
    seed=123,
)

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 67454
Number of items = 20191
Number of ratings = 3363663
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0




---
Test data:
Number of users = 53543
Number of items = 20157
Number of ratings = 449010
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 59044
Number of items = 20187
Number of ratings = 673468
---
Total users = 67454
Total items = 20191


In [10]:
# Instantiate evaluation metrics
ndcg_50 = cornac.metrics.NDCG(k=50)
rec_50 = cornac.metrics.Recall(k=50)
prec_50 = cornac.metrics.Precision(k=50)
map_metr = cornac.metrics.MAP()
metrics = [ndcg_50, rec_50, prec_50, map_metr]

most_pop = cornac.models.MostPop()
mf =  cornac.models.MF(k=30, max_iter=200, learning_rate=0.01, lambda_reg=0.02, seed=123, verbose=True),
pmf = cornac.models.PMF(k=30, max_iter=200, learning_rate=0.001, lambda_reg=0.001, seed=123, verbose=True),
bpr = cornac.models.BPR(k=30, max_iter=400, learning_rate=0.001, lambda_reg=0.001, seed=123, verbose=True)
wbpr = cornac.models.WBPR(k=30, max_iter=400, learning_rate=0.001, lambda_reg=0.001, verbose=True)

models = [most_pop, mf, pmf, bpr, wbpr]

In [13]:
# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=models,
    metrics=metrics,
    user_based=True, 
    show_validation=True, 
).run()


[MostPop] Training started!

[MostPop] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


[BPR] Training started!


  0%|          | 0/400 [00:00<?, ?it/s]

Optimization finished!

[BPR] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


[WBPR] Training started!


  0%|          | 0/400 [00:00<?, ?it/s]

Optimization finished!

[WBPR] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


VALIDATION:
...
        |    MAP | NDCG@50 | Precision@50 | Recall@50 | Time (s)
------- + ------ + ------- + ------------ + --------- + --------
MostPop | 0.0035 |  0.0065 |       0.0023 |    0.0143 | 233.7985
BPR     | 0.0141 |  0.0295 |       0.0106 |    0.0603 | 609.6479
WBPR    | 0.0165 |  0.0353 |       0.0115 |    0.0754 | 634.7556

TEST:
...
        |    MAP | NDCG@50 | Precision@50 | Recall@50 | Train (s) | Test (s)
------- + ------ + ------- + ------------ + --------- + --------- + --------
MostPop | 0.0032 |  0.0060 |       0.0017 |    0.0145 |    0.3739 | 202.0260
BPR     | 0.0123 |  0.0263 |       0.0078 |    0.0599 |  695.7398 | 563.3354
WBPR    | 0.0146 |  0.0318 |       0.0085 |    0.0754 |  443.1084 | 586.7267



In [14]:
# Instantiate the recommender models to be compared
gmf = cornac.models.GMF(
    num_factors=8,
    num_epochs=10,
    learner="adam",
    batch_size=256,
    lr=0.001,
    num_neg=50,
    seed=123,
    verbose=True,
)
mlp = cornac.models.MLP(
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs=10,
    batch_size=256,
    lr=0.001,
    num_neg=50,
    seed=123,
    verbose=True
)
neumf1 = cornac.models.NeuMF(
    num_factors=8,
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs=20,
    batch_size=256,
    lr=0.001,
    num_neg=50,
    seed=123,
    verbose=True,
)
neumf2 = cornac.models.NeuMF(
    name="NeuMF_pretrained",
    learner="adam",
    num_epochs=20,
    batch_size=256,
    lr=0.001,
    num_neg=50,
    seed=123,
    num_factors=gmf.num_factors,
    layers=mlp.layers,
    act_fn=mlp.act_fn,
    verbose=True
).pretrain(gmf, mlp)



In [15]:
# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[gmf, mlp, neumf1, neumf2],
    metrics=metrics,
    user_based=True, 
    show_validation=True, 
).run()


[GMF] Training started!


  0%|          | 0/10 [00:00<?, ?it/s]


[GMF] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


[MLP] Training started!


  0%|          | 0/10 [00:00<?, ?it/s]


[MLP] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


[NeuMF] Training started!


  0%|          | 0/20 [00:00<?, ?it/s]


[NeuMF] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


[NeuMF_pretrained] Training started!


  0%|          | 0/20 [00:00<?, ?it/s]


[NeuMF_pretrained] Evaluation started!


Ranking:   0%|          | 0/53543 [00:00<?, ?it/s]

Ranking:   0%|          | 0/59044 [00:00<?, ?it/s]


VALIDATION:
...
                 |    MAP | NDCG@50 | Precision@50 | Recall@50 | Time (s)
---------------- + ------ + ------- + ------------ + --------- + --------
GMF              | 0.0179 |  0.0387 |       0.0126 |    0.0845 | 360.1399
MLP              | 0.0223 |  0.0488 |       0.0149 |    0.1081 | 906.1952
NeuMF            | 0.0200 |  0.0441 |       0.0129 |    0.1018 | 984.2480
NeuMF_pretrained | 0.0211 |  0.0471 |       0.0141 |    0.1069 | 802.1427

TEST:
...
                 |    MAP | NDCG@50 | Precision@50 | Recall@50 |  Train (s) | Test (s)
---------------- + ------ + ------- + ------------ + --------- + ---------- + --------
GMF              | 0.0161 |  0.0352 |       0.0093 |    0.0849 | 28575.1548 | 333.9424
MLP              | 0.0195 |  0.0434 |       0.0109 |    0.1064 | 30695.6753 | 841.1369
NeuMF            | 0.0174 |  0.0388 |       0.0094 |    0.0983 | 60843.6664 | 909.4717
NeuMF_pretrained | 0.0191 |  0.0425 |       0.0103 |    0.1060 | 66395.0623 | 737.0431



In [None]:
with open('train_set.pkl', 'wb') as f:
    pickle.dump(mlp.train_set, f)

In [120]:
with open('train_set.pkl', 'rb') as f:
    train_set_load = pickle.load(f)

# mlp2.train_set = train_set_load

# Recommendations Example

In [104]:
# show the best recommendation to the user with id = uid
uid = 8
rec_id = mlp.rank(uid)[0][np.argmax(mlp.rank(uid)[1])]
for i in ratio_split.test_set.iid_map.items():
    if i[1] == rec_id:
        iid = i[0]
items.query('unique_id == @iid')

Unnamed: 0,id,owner_id,date,text,likes,reposts,attachments,likers,unique_id
929,894,-196395451,2020-11-26 15:49:10,"этот свитер лучшее, что я встречала🤧\n\nиз плю...",192,28,['https://sun6-23.userapi.com/impg/xxXZ6PuuUZq...,"[47135954, 70033750, 82021529, 113275938, 1199...",894-196395451
