In [1]:
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from numpy import random as npr

from modules import utils

In [2]:
# Seed handed to NumPy's global random generator (see the npr.seed call below).
SEED = 2021
# Fraction of the ratings file to keep; used as `frac` in DataFrame.sample, so 1 keeps every row.
REAL_DATASET_SAMPLE_SIZE = 1
# Number of chunks the ratings frame is split into before each chunk is evaluated.
N_CHUNKS = 5
# Seed the global NumPy RNG so code relying on it starts from a known state.
npr.seed(SEED)

In [3]:
# Load the ratings file and draw the working sample.
# `random_state=SEED` makes the draw independent of the global NumPy RNG state, so
# re-running this cell (or running it after other random calls) yields the same rows
# in the same order. Note that `frac=1` still shuffles the rows.
# The former `rating_df["rating"] = rating_df["rating"]` self-assignment was a no-op
# and has been dropped.
rating_df = pd.read_csv("data/rating.csv")
rating_df = rating_df.sample(frac=REAL_DATASET_SAMPLE_SIZE, random_state=SEED)
rating_df

DeepCTR-PyTorch version 0.2.7 detected. Your version is 0.2.5.
Use `pip install -U deepctr-torch` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.7


Unnamed: 0,user_id,item_id,rating,timestamp
16334,128,49,1.0,977434870
440002,2689,3238,0.8,973344429
296395,1757,3243,0.4,974790728
170347,1084,699,0.6,974941318
281936,1690,1132,0.6,974775705
...,...,...,...,...
133741,864,2400,0.6,975281400
476288,2925,218,0.8,1006658564
923705,5576,3451,0.2,959289879
224597,1357,575,0.8,974769790


In [4]:
def evaluate(df):
    """Run the three evaluators from `utils` on `df` and collect their results.

    Each evaluator receives its own `df.copy()`, so none of them sees changes
    made by another. SVD and KNN are called with `rating_scale=(0, 1)`;
    AutoRec takes only the frame.

    Returns:
        dict with keys "svd", "knn" and "autorec" (in that order), each mapped
        to whatever the corresponding `utils.evaluate_*` call returned
        (presumably an error metric such as RMSE -- confirm in `utils`).
    """
    svd_result = utils.evaluate_svd(df.copy(), rating_scale=(0, 1))
    knn_result = utils.evaluate_knn(df.copy(), rating_scale=(0, 1))
    autorec_result = utils.evaluate_autorec(df.copy())
    return {"svd": svd_result, "knn": knn_result, "autorec": autorec_result}

In [None]:
%%time
npr.seed(SEED)
parts = np.array_split(rating_df, N_CHUNKS)
error_logs = []


for i, part in enumerate(parts):
    start = datetime.datetime.utcnow()    
    part = utils.map_idx_to_matrix_indices(part)
    error = evaluate(part)
    error_logs.append({"part": i, "error": error})
    duration = datetime.datetime.utcnow() - start
    print(f"Part {i} processed within {duration}")
    


RMSE: 0.1963
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.1981
Load data finished. Number of users: 6034 Number of items: 3483
IAutoRec.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


2021-08-26 11:22:47.819217: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2021-08-26 11:22:47.830770: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2494140000 Hz
2021-08-26 11:22:47.832801: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f75ac000b60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-08-26 11:22:47.832824: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-08-26 11:22:47.837011: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-08-26 11:22:47.837033: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2021-08-26 11:22:47.837054: I tenso

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Network built
Epoch: 0000; RMSE:0.3614998750848397; MAE:0.3026073585765083
Epoch: 0003; RMSE:0.24343916324525028; MAE:0.18918725413818357
Epoch: 0006; RMSE:0.2072519699562346; MAE:0.16249212706561916
Epoch: 0009; RMSE:0.20163269765771447; MAE:0.1598107518264304
Epoch: 0012; RMSE:0.1991688485752722; MAE:0.1582988678143831
Epoch: 0015; RMSE:0.19854040338378426; MAE:0.15703645876679892
Epoch: 0018; RMSE:0.19873700834689106; MAE:0.1570683672879768
Epoch: 0021; RMSE:0.19781636363418173; MAE:0.15590163126470533
Epoch: 0024; RMSE:0.19649345583401048; MAE:0.1552357591598913
Epoch: 0027; RMSE:0.19681667310431483; MAE:0.15564839287187296
Epoch: 0030; RMSE:0.19589018773671993; MAE:0.15441066273824727
Epoch: 0033; RMSE:0.19637675942092145; MAE:0.15503956001187086
Epoch: 0036; RMSE:0.19479341589913177; MAE:0.15379838801268925
Epoch: 0039; RMSE:0.19436737989986413; MAE:0.1536160847782264
Epoch: 0042; RMSE:0.19344808749984604; MAE:0.15296829165214126
Epoch: 0045; RMSE:0.1934531243412405; MAE:0.152837

In [None]:
# Number of rows in each chunk, in chunk order.
list(map(len, parts))

In [None]:
# Show the raw log: one {"part": ..., "error": ...} record per evaluated chunk.
error_logs

In [None]:
# Flatten the nested log into one record per (chunk, model) pair so it can be
# loaded straight into a DataFrame.
errors = [
    {"part": log_entry["part"], "error": model_error, "model": model_name}
    for log_entry in error_logs
    for model_name, model_error in log_entry["error"].items()
]
    

In [None]:
# Long-format table built from the flattened records: one row per entry of `errors`,
# with the record keys ("part", "error", "model") as columns.
error_df = pd.DataFrame(errors)
error_df

In [None]:
# Derive the file name from N_CHUNKS so it cannot disagree with the configured chunk
# count (for N_CHUNKS = 5 this is the same file as before), and create the cache
# directory up front because to_csv does not create missing folders.
results_path = Path(f"./cache/find_best_chunks_results_{N_CHUNKS}_parts.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
error_df.to_csv(results_path, index=False)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Grouped bar chart of the evaluation error: one group per chunk, one bar per model.
# The figure size is set through seaborn's rc override before the figure is created.
sns.set(rc={'figure.figsize':(11.7,8.27)})
fig, ax = plt.subplots()
sns.barplot(data=error_df, x="part", y="error", hue="model", ax=ax)
# Title and labels let the figure stand alone; the trailing semicolon hides the repr.
ax.set(title="Evaluation error per chunk and model", xlabel="Chunk index", ylabel="Error");