In [72]:
import cornac
import pandas as pd

from cornac.data.reader import read_text

In [64]:
# Show the installed cornac version so the environment of this run is on record.
cornac.__version__

'1.14.2'

In [65]:
# Input files, given as paths relative to the current working directory.
# Item text file; it is parsed further down with `read_text(..., sep="::")`.
TEXT_MODALITY_FILE_PATH = "../../data/text-modality.txt"
# Comma-separated ratings file; it is loaded with `cornac.data.Reader`,
# skipping its first line.
TRAINING_FILE_PATH = "../../data/train_ratings_seen.csv"

In [66]:
# Load the item texts together with their ids; each line of the file is split on "::".
# NOTE(review): presumably the id is the first field and the remainder is the
# text - confirm against the file and the cornac `read_text` documentation.
texts, ids = read_text(TEXT_MODALITY_FILE_PATH, sep="::")

In [67]:
# Sanity check: number of item ids returned by `read_text`.
len(ids)

39520

In [68]:
# Sanity check: number of texts returned by `read_text`; it should equal the
# number of ids, since the text modality pairs them up one-to-one.
len(texts)

39520

In [69]:
# Tokenizer for the item texts: splits on single spaces and is configured with
# the "english" stop-word setting.
text_tokenizer = cornac.data.text.BaseTokenizer(sep=" ", stop_words="english")

# Text modality that attaches the loaded corpus to the item ids.
# NOTE(review): per the cornac docs, `max_vocab` caps the vocabulary size and a
# float `max_doc_freq` drops terms occurring in more than that share of the
# documents - confirm against the installed version.
item_text_modality = cornac.data.TextModality(
    ids=ids,
    corpus=texts,
    tokenizer=text_tokenizer,
    max_doc_freq=0.5,
    max_vocab=800,
)

In [70]:
# Load the training ratings as user-item-rating ("UIR") records.
# `skip_lines=1` skips the first line of the file (presumably a CSV header row).
reader = cornac.data.Reader()
ratings = reader.read(
    TRAINING_FILE_PATH,
    fmt="UIR",
    sep=",",
    skip_lines=1,
)

In [71]:
# Number of rating records loaded from the training file.
len(ratings)

119922

In [73]:
# Minimum number of ratings a user needs in order to be kept in the data set.
MIN_RATINGS_PER_USER = 3

# Tabulate the (user, item, rating) records so they can be aggregated per user.
train_df = pd.DataFrame(ratings, columns=["user_id", "item_id", "ratings"])

# One row per user with the number of ratings that user has. `size()` counts
# the rows of each group directly, so neither a copy of the frame nor a helper
# column of ones is needed.
counted = train_df.groupby("user_id").size().reset_index(name="count")

# Ids of the users that fall below the threshold and will be filtered out.
to_remove_users = counted.loc[
    counted["count"] < MIN_RATINGS_PER_USER, "user_id"
].unique()

In [75]:
# Drop every rating whose user (first field of a "UIR" record) is flagged in
# `to_remove_users`. The ids are copied into a set first: a set membership test
# is a hash lookup, whereas `in` on a NumPy array compares the value against
# every element of the array for each rating.
users_to_drop = set(to_remove_users)
ratings = [rating for rating in ratings if rating[0] not in users_to_drop]

# Number of ratings left after the filter.
len(ratings)

119921

In [79]:
# --- Experiment settings ----------------------------------------------------
VERBOSE = True
SEED = 42
TEST_SIZE = 0.1         # share of the ratings held out for testing
RATING_THRESHOLD = 1.0  # passed to the split as `rating_threshold`
N_EPOCHS = 5            # ConvMF training epochs
TOP_K = 20              # cut-off shared by all @k ranking metrics
SAVE_DIR = "./run"      # directory handed to the experiment as `save_dir`

# --- Metrics ------------------------------------------------------------------
# One rating metric (RMSE) followed by the ranking metrics.
metrics = [
    cornac.metrics.RMSE(),
    cornac.metrics.FMeasure(k=TOP_K),
    cornac.metrics.AUC(),
    cornac.metrics.MRR(),
    cornac.metrics.NCRR(k=TOP_K),
    cornac.metrics.NDCG(k=TOP_K),
    cornac.metrics.Recall(k=TOP_K),
]

# --- Train/test split -----------------------------------------------------------
# Stratified split grouped by user, with the item text modality attached.
# The variable keeps the name `ratio_split` although it holds a StratifiedSplit.
# NOTE(review): the logged minimum rating is 1.0, so with this threshold every
# observed rating presumably counts as positive feedback for the ranking
# metrics - confirm that this is intended.
ratio_split = cornac.eval_methods.StratifiedSplit(
    ratings,
    fmt="UIR",
    group_by="user",
    test_size=TEST_SIZE,
    rating_threshold=RATING_THRESHOLD,
    exclude_unknowns=False,
    item_text=item_text_modality,
    seed=SEED,
    verbose=VERBOSE,
)

# --- Model --------------------------------------------------------------------
# ConvMF with library defaults apart from the epoch count, seed and verbosity.
convmf = cornac.models.ConvMF(
    n_epochs=N_EPOCHS,
    seed=SEED,
    verbose=VERBOSE,
)

# --- Run ------------------------------------------------------------------------
# Train and evaluate the model; `save_dir` is where the fitted model is written.
experiment = cornac.Experiment(
    eval_method=ratio_split,
    models=[convmf],
    metrics=metrics,
    user_based=True,
    save_dir=SAVE_DIR,
    verbose=VERBOSE,
)
experiment.run()

rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 5065
Number of items = 36663
Number of ratings = 105949
Max rating = 5.0
Min rating = 1.0
Global mean = 3.9
---
Test data:
Number of users = 5065
Number of items = 8961
Number of ratings = 13972
Number of unknown users = 0
Number of unknown items = 2857
---
Total users = 5065
Total items = 39520

[ConvMF] Training started!
Epoch: 1/5


Optimizing CNN: 100%|██████████| 5/5 [24:14<00:00, 290.94s/it]


Loss: 64875018192.94695 Elapsed: 4376.9780s Converge: 6487501819294695709767316294497949216505447992797635247341568.000000 
Epoch: 2/5


Optimizing CNN: 100%|██████████| 5/5 [23:24<00:00, 280.92s/it]


Loss: 201457673.69819 Elapsed: 5131.4179s Converge: 0.996895 
Epoch: 3/5


Optimizing CNN: 100%|██████████| 5/5 [24:05<00:00, 289.11s/it]


Loss: 99645746.40399 Elapsed: 4831.9298s Converge: 0.505376 
Epoch: 4/5


Optimizing CNN: 100%|██████████| 5/5 [3:52:30<00:00, 2790.06s/it]  


Loss: 94037139.40897 Elapsed: 17464.7101s Converge: 0.056285 
Epoch: 5/5


Optimizing CNN: 100%|██████████| 5/5 [25:01<00:00, 300.26s/it]


Loss: 91574834.35241 Elapsed: 5282.7690s Converge: 0.026184 

[ConvMF] Evaluation started!


Rating: 100%|██████████| 13972/13972 [00:01<00:00, 9625.22it/s] 
Ranking: 100%|██████████| 5065/5065 [01:00<00:00, 83.66it/s]

ConvMF model is saved to ./run/ConvMF/2022-06-16_10-44-31-255319.pkl

TEST:
...
       |   RMSE |    AUC |  F1@20 |    MRR | NCRR@20 | NDCG@20 | Recall@20 |  Train (s) | Test (s)
------ + ------ + ------ + ------ + ------ + ------- + ------- + --------- + ---------- + --------
ConvMF | 0.7340 | 0.5615 | 0.0030 | 0.0086 |  0.0036 |  0.0062 |    0.0131 | 42494.7957 |  64.7510




