In [1]:
import os
import optuna
import gc
import pandas as pd
import pickle

from src.core.loaders import CsvDataLoader, PickleSaver
from src.core.pipelines import MatrixFactorizationTrainingPipeline
from src.core.model import PytorchMatrixFactorizationModel, MFAdvanced
from src.core.utils import load_params
from src.core.evaluation import calculate_metrics

# Move to the parent directory so the relative paths used below
# ("config.yaml", "experiments/...") resolve from there.
# NOTE(review): this is not idempotent -- re-running this cell without
# restarting the kernel moves the working directory up one more level.
os.chdir("../")

config_path = "config.yaml"

config = load_params(config_path)


def make_data_loader():
    """Build a CsvDataLoader from the paths declared in the `basic` config section."""
    return CsvDataLoader(
        interactions_path=config.basic.interactions_path,
        articles_path=config.basic.articles_path,
    )


# Read the interactions a single time and derive both cardinalities from the
# same frame, instead of constructing two loaders and reading the data twice.
interactions_for_counts = make_data_loader().get_interactions()
num_users = len(interactions_for_counts.personId.unique())
num_items = len(interactions_for_counts.contentId.unique())
# The frame was only needed for the two counts above.
del interactions_for_counts

training_pipeline = MatrixFactorizationTrainingPipeline(
    # The pipeline gets its own, fresh loader instance.
    data_loader=make_data_loader(),
    saver=PickleSaver(path="./"),
    model=PytorchMatrixFactorizationModel(
        model=MFAdvanced(
            num_users=num_users,
            num_items=num_items,
            emb_dim=config.matrix_factorization.emb_dim,
            init=config.matrix_factorization.init,
            bias=config.matrix_factorization.bias,
            sigmoid=config.matrix_factorization.sigmoid,
        ),
        batch_size=config.matrix_factorization.batch_size,
        lr=config.matrix_factorization.lr,
        num_epochs=config.matrix_factorization.num_epochs,
        num_workers=config.matrix_factorization.num_workers,
        device=config.matrix_factorization.device,
    ),
    top_n=config.basic.top_n,
    split_date=config.basic.split_date,
    event_type_strength=config.event_type_strength,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the interactions through the loader attached to the pipeline.
pipeline_loader = training_pipeline.data_loader
interactions_df = pipeline_loader.get_interactions()

# Run the pipeline's id label-encoding step on the frame, handing it the
# pipeline's own saver (presumably to persist the fitted encoders -- confirm).
training_pipeline.label_encode_ids(
    saver=training_pipeline.saver,
    interactions_df=interactions_df,
)

# Preprocess with the configured event-type strengths; the call yields the
# train and test interaction frames, in that order.
train_test_frames = training_pipeline.training_preprocess(
    interactions_df, training_pipeline.event_type_strength
)
interactions_train, interactions_test = train_test_frames

# Labels are derived from the test frame only.
interactions_labels = training_pipeline.create_labels(test=interactions_test)

In [None]:
# One summary dict per finished trial; re-pickled to disk after every trial.
results = []

# Directory that receives the per-trial model pickles and the results file.
EXPERIMENT_DIR = "experiments/pytorch_mf"


def mf_hyperparam_opt(trial):
    """Optuna objective: train one matrix-factorization model and score it.

    Samples `lr` (log scale), `num_epochs` and `emb_dim` from the trial, keeps
    `sigmoid`, `bias`, `init` and `batch_size` fixed, trains a fresh model on
    `interactions_train` / `interactions_test`, generates recommendations for
    every user in `interactions_labels` and computes the metrics.

    Side effects:
        * replaces `training_pipeline.model` with the newly trained model;
        * overwrites the "preds" column of the global `interactions_labels`;
        * appends a summary dict to the global `results` list;
        * writes the model and the full `results` list as pickles under
          `EXPERIMENT_DIR` (created if missing).

    Args:
        trial: the Optuna trial used to sample hyper-parameters; its `number`
            is used in the model file name and in the summary dict.

    Returns:
        The "precision_at_10" entry of the computed metrics, which the study
        maximizes.

    NOTE(review): the objective is scored on labels built from
    `interactions_test`, the same frame passed to `train()` -- the chosen
    hyper-parameters are therefore tuned on the test split. Consider a
    separate validation split.
    """
    params = {
        "sigmoid": True,
        "bias": True,
        "init": True,
        "lr": trial.suggest_float("lr", 0.0001, 0.2, log=True),
        "num_epochs": trial.suggest_int("num_epochs", 5, 30),
        "emb_dim": trial.suggest_int("emb_dim", 5, 40),
        "batch_size": 124,
    }

    training_pipeline.model = PytorchMatrixFactorizationModel(
        model=MFAdvanced(
            num_users=num_users,
            num_items=num_items,
            emb_dim=params["emb_dim"],
            init=params["init"],
            bias=params["bias"],
            sigmoid=params["sigmoid"],
        ),
        batch_size=params["batch_size"],
        lr=params["lr"],
        num_epochs=params["num_epochs"],
        num_workers=4,
        device="cpu",
    )

    training_pipeline.model.train(interactions_train, interactions_test)

    # The candidate item pool is the same for every user, so compute it once
    # instead of once per user inside the comprehension.
    candidate_item_ids = interactions_train.contentId.unique()
    interactions_labels["preds"] = [
        training_pipeline.model.recommend_item(
            user_id=person_id,
            item_ids=candidate_item_ids,
            top_n=training_pipeline.top_n,
        )
        for person_id in interactions_labels["personId"].tolist()
    ]

    metrics = calculate_metrics(
        prediction_col="preds", interactions_labels=interactions_labels
    )

    results.append(
        {
            "Trial": trial.number,
            **metrics,
            "last_epoch_train_loss": training_pipeline.model.epoch_train_losses[-1],
            "last_epoch_val_loss": training_pipeline.model.epoch_val_losses[-1],
            **params,
        }
    )

    # Make sure the output directory exists so a finished trial is not lost
    # to a FileNotFoundError at save time.
    os.makedirs(EXPERIMENT_DIR, exist_ok=True)

    # Context managers guarantee the files are flushed and closed even if
    # pickling raises.
    with open(f"{EXPERIMENT_DIR}/mf_model{trial.number}.pkl", "wb") as model_file:
        pickle.dump(training_pipeline.model, model_file)
    with open(f"{EXPERIMENT_DIR}/results.pkl", "wb") as results_file:
        pickle.dump(results, results_file)

    gc.collect()
    print(results)

    return metrics["precision_at_10"]


# Create a study that maximizes the objective's return value, then run
# 20 trials of the matrix-factorization objective.
# NOTE(review): the study name "OptunaXgb" looks like a leftover from another
# experiment -- confirm before relying on it.
study = optuna.create_study(direction="maximize", study_name="OptunaXgb")
study.optimize(func=mf_hyperparam_opt, n_trials=20)

In [3]:
# Reload the trial summaries persisted by the hyper-parameter search so this
# cell works without re-running the search.
# pickle can execute arbitrary code on load: only open files written by this
# notebook.
with open("experiments/pytorch_mf/results.pkl", "rb") as results_file:
    results = pickle.load(results_file)

# Rank the trials, best precision@10 first.
pd.DataFrame(results).sort_values("precision_at_10", ascending=False)

Unnamed: 0,Trial,precision_at_10,recall_at_10,last_epoch_train_loss,last_epoch_val_loss,sigmoid,bias,init,lr,num_epochs,emb_dim,batch_size
14,14,0.002289,0.009984,0.088883,0.29197,True,True,True,0.011806,13,32,124
13,13,0.001937,0.007226,0.091899,0.276071,True,True,True,0.012085,14,31,124
15,15,0.001761,0.008524,0.110583,0.285703,True,True,True,0.013499,11,30,124
16,16,0.001585,0.005641,0.132691,0.280824,True,True,True,0.014482,5,30,124
17,17,0.001056,0.004886,0.082346,0.26178,True,True,True,0.01254,22,25,124
19,19,0.001056,0.004079,0.024092,0.299904,True,True,True,0.005888,22,29,124
1,1,0.001056,0.004665,0.037117,0.65184,True,True,True,0.001216,16,35,124
3,3,0.000704,0.003668,0.062506,0.459135,True,True,True,0.002007,17,10,124
11,11,0.000704,0.00314,0.088402,0.364573,True,True,True,0.003414,14,5,124
9,9,0.000704,0.001496,0.018952,0.356789,True,True,True,0.00517,18,19,124
