In [1]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
SEED = 2021
BOOKS_DATASET_PATH = "books_dataset_cleaned.csv"

## Load and transform restaurants data

In [3]:
real_dataset = pd.read_csv(BOOKS_DATASET_PATH)
real_dataset = real_dataset.drop(["Location", "rating_sum", "Book_Title", "Count_All_Rate"], axis=1)
real_dataset.head()

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,Book_Author,Year_Of_Publication,Publisher
80942,6505,56.0,usa,671627759,6,6.0,Janet Dailey,1986.0,Pocket
369220,236700,31.224355,brazil,345431553,5,5.0,Robert Sheckley,1999.0,Del Rey Books
138069,110160,30.0,usa,743206061,9,7.173913,Mary Higgins Clark,2003.0,Simon &amp; Schuster
359883,212328,34.0,usa,852634692,6,6.0,E. Leadbeat,1983.0,Hyperion Books
348158,187517,28.0,usa,1579547141,9,9.0,Jorge Cruise,2003.0,Rodale Books


In [4]:
real_dataset["ISBN"] = real_dataset["ISBN"].astype('category').cat.codes

In [5]:
real_dataset["book_rating"] = real_dataset["book_rating"] / 10

In [6]:
real_dataset.dtypes

user_id                  int64
Age                    float64
Country                 object
ISBN                     int16
book_rating            float64
rating_Avg             float64
Book_Author             object
Year_Of_Publication    float64
Publisher               object
dtype: object

In [7]:
real_dataset

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,Book_Author,Year_Of_Publication,Publisher
80942,6505,56.000000,usa,8796,0.6,6.000000,Janet Dailey,1986.0,Pocket
369220,236700,31.224355,brazil,2819,0.5,5.000000,Robert Sheckley,1999.0,Del Rey Books
138069,110160,30.000000,usa,9971,0.9,7.173913,Mary Higgins Clark,2003.0,Simon &amp; Schuster
359883,212328,34.000000,usa,11658,0.6,6.000000,E. Leadbeat,1983.0,Hyperion Books
348158,187517,28.000000,usa,13282,0.9,9.000000,Jorge Cruise,2003.0,Rodale Books
...,...,...,...,...,...,...,...,...,...
255962,66942,31.000000,malaysia,6438,0.5,7.000000,Alexs D. Pate,1997.0,DreamWorks
293114,98391,52.000000,usa,10524,0.8,8.000000,Robyn Carr,2004.0,Mira
3430,33318,33.000000,usa,4360,0.8,7.000000,Jane Hamilton,1999.0,Anchor Books/Doubleday
34331,144038,49.000000,usa,420,0.8,7.716981,Barbara Kingsolver,1994.0,Perennial


## Training DeepFM components


In [8]:
def generate_filename(*, base, ext="csv"):
    now = datetime.datetime.utcnow()
    return "exp2_" + base + "_" + now.strftime("%d%m%y_%H%M%S") + "." + ext

In [9]:
attributes_names = {
    "dense": [
        "Age",
        "rating_Avg",
        "Year_Of_Publication"
    ],
    "sparse": [
        "Book_Author",
        "Country",
        "Publisher",
    ]
}

In [10]:
import dataclasses
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from modules.models import DeepFmModel
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


@dataclasses.dataclass
class DeepFmInputDataset:
    data: object
    dnn_feats: object
    linear_feats: object
    feat_names: object


class DeepFMDataLoader:
    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features
        
    def load(self, dataset):
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]
        
        for feat in self._sparse_feats:
            encoder = LabelEncoder()
            nn_input[feat] = encoder.fit_transform(nn_input[feat])
            
        mms = MinMaxScaler(feature_range=(0,1))
        nn_input[self._dense_feats] = mms.fit_transform(nn_input[self._dense_feats])
        
        # problems may be here
        sparse_feature_columns = [
            SparseFeat(feat, vocabulary_size=nn_input[feat].nunique(), embedding_dim=4) 
            for i, feat in enumerate(self._sparse_feats)
        ]

        dense_feature_columns = [DenseFeat(feat, 1,) for feat in self._dense_feats]
        
        dnn_feat_cols = sparse_feature_columns + dense_feature_columns
        linear_feat_cols = sparse_feature_columns + dense_feature_columns
        
        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        input_dataset = DeepFmInputDataset(
            data=nn_input,
            dnn_feats=dnn_feat_cols,
            linear_feats=linear_feat_cols,
            feat_names=feat_names
        )
        return input_dataset

In [11]:
def to_rating_matrix(dataset, predicted_response):
    result = pd.DataFrame()
    result["rating"] = predicted_response.reshape((len(predicted_response),))
    result["user_id"] = dataset["user_id"]
    result["item_id"] = dataset["item_id"]
    matrix = result.pivot(index="user_id", columns="item_id", values="rating")
    return matrix


In [12]:
def merge_feats(feats_a, feats_b):
    assert len(feats_a) == len(feats_b)
    merged = []
    for feat_a, feat_b in zip(feats_a, feats_b):
        if isinstance(feat_a, DenseFeat):
            continue
        if feat_a.vocabulary_size >= feat_b.vocabulary_size:
            merged.append(feat_a)
        else:
            merged.append(feat_b)
    return merged

In [13]:
def train_deepfm(feats, feat_names, x, y):
    deepfm = DeepFmModel(feats, feats, feat_names)
    train_set, test_set = train_test_split(x, test_size=0.2)
    deepfm.train(train_set, target_values=y[:len(train_set)])
    return deepfm
    

In [14]:
def pretrain_deepfm_model(*, data_loader, train_set, test_set):
    nn_train_input = data_loader.load(train_set)
    nn_test_input = data_loader.load(test_set)
    y = train_set["book_rating"].values
    
    merged_feats = merge_feats(nn_train_input.dnn_feats, nn_test_input.dnn_feats)
    deepfm = train_deepfm(merged_feats, nn_train_input.feat_names, x=nn_train_input.data, y=y)
    return deepfm



In [15]:
def split_dataset(dataset):
    n = len(dataset)
    mid = int(n / 2)
    return dataset[:mid], dataset[mid:]
    

In [21]:
from sdv.tabular import CTGAN, GaussianCopula, CopulaGAN
import datetime
from multiprocessing import Pool

def fit_syn_generator(df):
    model = CTGAN(verbose=True, epochs=10)
    df = df.astype("int64", errors="ignore") # Convert all numbers to int64
    model.fit(df.copy())
    return model


def fit_worker(args):
    dataset, name = args
    generator = fit_syn_generator(dataset)
    generator.save(generate_filename(base=name, ext="bin"))
    return generator


def fit_parallel(real_dataset):
    users = real_dataset[["user_id", "Age", "Country"]].drop_duplicates().drop("user_id", axis=1)
    items = real_dataset[
        ["ISBN", "Book_Author", "Publisher", "rating_Avg", "Year_Of_Publication"]
    ].drop_duplicates().drop("ISBN", axis=1)
    args = [
        (users, "users_generator"),
        (items, "items_generator")
    ]
    with Pool() as p:
        generators = p.map(fit_worker, args)
    return generators


def fit_synthetic_generators(real_dataset):
    users = real_dataset[["user_id", "Age", "Country"]].drop_duplicates().drop("user_id", axis=1)
    items = real_dataset[
        ["ISBN", "Book_Author", "Publisher", "rating_Avg", "Year_Of_Publication"]
    ].drop_duplicates().drop("ISBN", axis=1)
    
    users_generator = fit_syn_generator(users)
    users_generator.save(generate_filename(base="users_generator", ext="bin"))
    items_generator = fit_syn_generator(items)
    users_generator.save(generate_filename(base="items_generator", ext="bin"))
    return users_generator, items_generator


def generate_synthetic_data(users_generator, items_generator, n_users=100, n_items=100):
    syn_users = users_generator.sample(n_users)
    syn_items = items_generator.sample(n_items)
    syn_users["user_id"] = range(len(syn_users))
    syn_items["item_id"] = range(len(syn_items))
    syn_users["_merge_key"] = syn_items["_merge_key"] = 1
    
    syn_dataset = pd.merge(syn_items, syn_users, on="_merge_key")
    syn_dataset = syn_dataset.drop(["_merge_key"], axis=1)
    return syn_dataset


def measure_durations(fn, dataset, n, step=5):
    measures = []
    for i in range(step, n, step):
        start = datetime.datetime.utcnow()
        fn(dataset[:i])
        duration = datetime.datetime.utcnow() - start
        measures.append({"n": i, "time": duration.seconds})
        print(f"Duration {duration.seconds}")
    return pd.DataFrame(measures)


In [22]:
%%time
np.random.seed(SEED)
syn_sample = real_dataset.sample(frac=0.05)
users_generator, items_generator = fit_parallel(syn_sample)

  random_state=random_state).fit(X).labels_


Epoch 1, Loss G:  3.5466,Loss D: -0.0026
Epoch 1, Loss G:  6.3361,Loss D: -0.0011
CPU times: user 66.2 ms, sys: 75.3 ms, total: 141 ms
Wall time: 3.5 s


In [23]:
np.random.seed(SEED)
syn_dataset = generate_synthetic_data(users_generator, items_generator)
syn_dataset.head()

Unnamed: 0,Book_Author,Publisher,rating_Avg,Year_Of_Publication,item_id,Age,Country,user_id
0,Eric Francis,Casterman,4,2000,0,76,philippines,0
1,Eric Francis,Casterman,4,2000,0,42,spain,1
2,Eric Francis,Casterman,4,2000,0,32,japan,2
3,Eric Francis,Casterman,4,2000,0,37,united kingdom,3
4,Eric Francis,Casterman,4,2000,0,25,philippines,4


In [24]:
def calculate_rating_matrices(data_loader, real_dataset, syn_dataset):
    p1, p2 = split_dataset(real_dataset)
    deepfm_1 = pretrain_deepfm_model(data_loader=data_loader, train_set=p1, test_set=syn_dataset)
    deepfm_2 = pretrain_deepfm_model(data_loader=data_loader, train_set=p2, test_set=syn_dataset)
    nn_syn_dataset = data_loader.load(syn_dataset)
    
    y1 = deepfm_1.predict(nn_syn_dataset.data)
    y2 = deepfm_2.predict(nn_syn_dataset.data)
    
    matrix_1 = to_rating_matrix(syn_dataset, y1)
    matrix_2 = to_rating_matrix(syn_dataset, y2)
    return matrix_1, matrix_2



In [25]:
data_loader = DeepFMDataLoader(sparse_features=attributes_names["sparse"], dense_features=attributes_names["dense"])

In [26]:
%%time
np.random.seed(SEED)
nn_sample = real_dataset.sample(frac=0.3)
matrix_1, matrix_2 = calculate_rating_matrices(data_loader, nn_sample, syn_dataset)

cpu
Train on 1842 samples, validate on 461 samples, 8 steps per epoch
Epoch 1/10
0s - loss:  0.4412 - mse:  0.4241 - val_mse:  0.2681
Epoch 2/10
0s - loss:  0.1608 - mse:  0.1535 - val_mse:  0.0603
Epoch 3/10
0s - loss:  0.0389 - mse:  0.0387 - val_mse:  0.0458
Epoch 4/10
0s - loss:  0.0535 - mse:  0.0522 - val_mse:  0.0400
Epoch 5/10
0s - loss:  0.0342 - mse:  0.0334 - val_mse:  0.0345
Epoch 6/10
0s - loss:  0.0315 - mse:  0.0320 - val_mse:  0.0385
Epoch 7/10
0s - loss:  0.0304 - mse:  0.0300 - val_mse:  0.0341
Epoch 8/10
0s - loss:  0.0271 - mse:  0.0264 - val_mse:  0.0331
Epoch 9/10
0s - loss:  0.0256 - mse:  0.0253 - val_mse:  0.0334
Epoch 10/10
0s - loss:  0.0232 - mse:  0.0230 - val_mse:  0.0344
cpu
Train on 1842 samples, validate on 461 samples, 8 steps per epoch
Epoch 1/10
0s - loss:  0.3896 - mse:  0.3695 - val_mse:  0.1730
Epoch 2/10
0s - loss:  0.0902 - mse:  0.0846 - val_mse:  0.0395
Epoch 3/10
0s - loss:  0.0509 - mse:  0.0506 - val_mse:  0.0650
Epoch 4/10
0s - loss:  0.04

In [27]:
matrix_1

item_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.758257,0.733744,0.725057,0.695541,0.750845,0.809182,0.724481,0.775497,0.769044,0.673083,...,0.761500,0.667631,0.747353,0.764645,0.734191,0.754713,0.757430,0.764274,0.796554,0.720825
1,0.756114,0.732610,0.724292,0.695347,0.748988,0.805469,0.723354,0.772809,0.766858,0.673747,...,0.759087,0.668261,0.745434,0.762149,0.733042,0.752140,0.755305,0.761989,0.793645,0.719691
2,0.725727,0.704152,0.696138,0.668497,0.719290,0.772267,0.694957,0.741370,0.736606,0.648504,...,0.728515,0.643148,0.715694,0.731397,0.704492,0.721103,0.725290,0.731558,0.762117,0.691297
3,0.740540,0.718087,0.709906,0.681659,0.733780,0.788382,0.708858,0.756662,0.751384,0.660927,...,0.743436,0.655542,0.730222,0.746409,0.718466,0.736223,0.739954,0.746404,0.777461,0.705198
4,0.758257,0.733744,0.725057,0.695541,0.750845,0.809182,0.724481,0.775497,0.769044,0.673083,...,0.761500,0.667631,0.747353,0.764645,0.734191,0.754713,0.757430,0.764274,0.796554,0.720825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.749341,0.726319,0.718022,0.689444,0.742418,0.798019,0.717088,0.765776,0.760143,0.668240,...,0.752312,0.662802,0.738857,0.755324,0.726720,0.745231,0.748671,0.755240,0.786641,0.713429
96,0.757744,0.733611,0.725077,0.695714,0.750442,0.808132,0.724328,0.774833,0.768522,0.673548,...,0.760858,0.668031,0.746931,0.764020,0.734073,0.754060,0.756882,0.763725,0.795789,0.720670
97,0.740539,0.718086,0.709904,0.681658,0.733779,0.788381,0.708857,0.756660,0.751383,0.660926,...,0.743435,0.655541,0.730221,0.746408,0.718464,0.736222,0.739953,0.746402,0.777460,0.705197
98,0.740539,0.718086,0.709905,0.681658,0.733779,0.788381,0.708857,0.756660,0.751383,0.660927,...,0.743435,0.655541,0.730221,0.746408,0.718465,0.736222,0.739953,0.746403,0.777460,0.705197


## Create response function

In [28]:
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import models, evaluator, trainers, utils
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)



<module 'modules.trainers' from '/Users/vldpro/Workspace/university/recsys/modules/trainers.py'>

In [29]:
class ResponseFunction:
    def __init__(self, deepfm_matrix_1, deepfm_matrix_2):
        assert deepfm_matrix_1.shape == deepfm_matrix_2.shape
        self._deepfm_matrix_1 = deepfm_matrix_1
        self._deepfm_matrix_2 = deepfm_matrix_2
        
    def __call__(self, a1: float, a2: float):
        a3 = max(0.0, 1 - a1 - a2)
        return (
            a1 * self._deepfm_matrix_1
            + a2 * self._deepfm_matrix_2
            + a3 * npr.normal(0, 1, size=self._deepfm_matrix_1.shape)
        )
    

resp_function = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction, args=[matrix_1, matrix_2]
)

## Evaluation

In [31]:
%%time
_evaluators = [
    evaluator.TrainTestExecutorConfig(
        factory=trainers.AutoRecTrainTestExecutor,
        args={"config": {"epoch": 50}},
        model_name="autorec"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.SvdTrainTestExecutor,
        args={},
        model_name="svd"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.KnnTrainTestExecutor,
        args={},
        model_name="knn"
    )
]

np.random.seed(SEED)
_evaluator = evaluator.Evaluator(resp_function, n_proc=4)
_res = _evaluator.evaluate(
    _evaluators, 
    a_sample_rate=3,
    test_size=0.1,
    sample_sizes=[0.1]
)

In [32]:
_res

In [None]:
_res.to_csv(generate_filename(base="evalution_result"))

## Visualization

In [None]:
error_surface = utils.group_points_by_minimum_error(_res)
error_surface

In [None]:
from modules import utils
importlib.reload(utils)

import plotly.express as px

for ss in [0.1]:
    fig = px.scatter_3d(
        error_surface[error_surface["sample_size"] == ss], 
        x='a1', 
        y='a2', 
        z='rmse',
        size="rmse",
        size_max=18, 
        opacity=1,
        color="model_name",
        color_continuous_scale=px.colors.sequential.thermal[::-1]
    )

    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
    )

    fig.show("notebook")