In [30]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
SEED = 2021
BOOKS_DATASET_PATH = "books_dataset_cleaned.csv"

## Load and transform restaurants data

In [3]:
real_dataset = pd.read_csv(BOOKS_DATASET_PATH)
real_dataset = real_dataset.drop(["Location", "rating_sum", "Book_Title", "Count_All_Rate"], axis=1)
real_dataset = real_dataset.sample(frac=0.05)
real_dataset.head()

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,Book_Author,Year_Of_Publication,Publisher
108209,101876,38.0,usa,0020199600,8,8.416667,F. Scott Fitzgerald,1988.0,Scribner Paper Fiction
136194,254224,37.958174,usa,067172262X,8,8.708333,William Shakespeare,1992.0,Washington Square Press
236874,51883,31.0,usa,0373272480,10,10.0,Mary Mcbride,2002.0,Silhouette
67363,5206,18.0,spain,8408048082,8,7.5,Paulo Coelho,2003.0,Planeta Pub Corp
123452,257493,25.0,usa,0380820854,10,7.5,Julia Quinn,2003.0,Avon


In [4]:
real_dataset["ISBN"] = real_dataset["ISBN"].astype('category').cat.codes

In [5]:
real_dataset["book_rating"] = real_dataset["book_rating"] / 10

In [6]:
real_dataset.dtypes

user_id                  int64
Age                    float64
Country                 object
ISBN                     int16
book_rating            float64
rating_Avg             float64
Book_Author             object
Year_Of_Publication    float64
Publisher               object
dtype: object

In [7]:
real_dataset

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,Book_Author,Year_Of_Publication,Publisher
108209,101876,38.000000,usa,4,0.8,8.416667,F. Scott Fitzgerald,1988.0,Scribner Paper Fiction
136194,254224,37.958174,usa,246,0.8,8.708333,William Shakespeare,1992.0,Washington Square Press
236874,51883,31.000000,usa,76,1.0,10.000000,Mary Mcbride,2002.0,Silhouette
67363,5206,18.000000,spain,371,0.8,7.500000,Paulo Coelho,2003.0,Planeta Pub Corp
123452,257493,25.000000,usa,105,1.0,7.500000,Julia Quinn,2003.0,Avon
...,...,...,...,...,...,...,...,...,...
340131,229243,29.000000,usa,24,0.8,8.800000,Francesca Lia Block,1991.0,HarperTrophy
289151,98391,52.000000,usa,41,0.5,5.000000,Rett MacPherson,2004.0,St. Martin's Minotaur
26477,60185,46.000000,usa,89,1.0,7.206897,Bernhard Schlink,1999.0,Vintage Books USA
322304,139579,49.000000,usa,287,0.8,8.000000,Hasan Shah,1993.0,New Directions Publishing Corporation


## Training DeepFM components


In [8]:
def generate_filename(*, base, ext="csv"):
    now = datetime.datetime.utcnow()
    return "exp2_" + base + "_" + now.strftime("%d%m%y_%H%M%S") + "." + ext

In [9]:
attributes_names = {
    "dense": [
        "Age",
        "rating_Avg",
        "Year_Of_Publication"
    ],
    "sparse": [
        "Book_Author",
        "Country",
        "Publisher",
    ]
}

In [10]:
import dataclasses
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from modules.models import DeepFmModel
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


@dataclasses.dataclass
class DeepFmInputDataset:
    data: object
    dnn_feats: object
    linear_feats: object
    feat_names: object


class DeepFMDataLoader:
    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features
        
    def load(self, dataset):
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]
        
        for feat in self._sparse_feats:
            encoder = LabelEncoder()
            nn_input[feat] = encoder.fit_transform(nn_input[feat])
            
        mms = MinMaxScaler(feature_range=(0,1))
        nn_input[self._dense_feats] = mms.fit_transform(nn_input[self._dense_feats])
        
        # problems may be here
        sparse_feature_columns = [
            SparseFeat(feat, vocabulary_size=nn_input[feat].nunique(), embedding_dim=4) 
            for i, feat in enumerate(self._sparse_feats)
        ]

        dense_feature_columns = [DenseFeat(feat, 1,) for feat in self._dense_feats]
        
        dnn_feat_cols = sparse_feature_columns + dense_feature_columns
        linear_feat_cols = sparse_feature_columns + dense_feature_columns
        
        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        input_dataset = DeepFmInputDataset(
            data=nn_input,
            dnn_feats=dnn_feat_cols,
            linear_feats=linear_feat_cols,
            feat_names=feat_names
        )
        return input_dataset

In [11]:
def to_rating_matrix(dataset, predicted_response):
    result = pd.DataFrame()
    result["rating"] = predicted_response.reshape((len(predicted_response),))
    result["user_id"] = dataset["user_id"]
    result["item_id"] = dataset["item_id"]
    matrix = result.pivot(index="user_id", columns="item_id", values="rating")
    return matrix


In [12]:
def merge_feats(feats_a, feats_b):
    assert len(feats_a) == len(feats_b)
    merged = []
    for feat_a, feat_b in zip(feats_a, feats_b):
        if isinstance(feat_a, DenseFeat):
            continue
        if feat_a.vocabulary_size >= feat_b.vocabulary_size:
            merged.append(feat_a)
        else:
            merged.append(feat_b)
    return merged

In [13]:
def train_deepfm(feats, feat_names, x, y):
    deepfm = DeepFmModel(feats, feats, feat_names)
    train_set, test_set = train_test_split(x, test_size=0.2)
    deepfm.train(train_set, target_values=y[:len(train_set)])
    return deepfm
    

In [14]:
def pretrain_deepfm_model(*, data_loader, train_set, test_set):
    nn_train_input = data_loader.load(train_set)
    nn_test_input = data_loader.load(test_set)
    y = train_set["book_rating"].values
    
    merged_feats = merge_feats(nn_train_input.dnn_feats, nn_test_input.dnn_feats)
    deepfm = train_deepfm(merged_feats, nn_train_input.feat_names, x=nn_train_input.data, y=y)
    return deepfm



In [15]:
def split_dataset(dataset):
    n = len(dataset)
    mid = int(n / 2)
    return dataset[:mid], dataset[mid:]
    

In [16]:
from sdv.tabular import CTGAN, GaussianCopula, CopulaGAN
import datetime
from multiprocessing import Pool

def fit_syn_generator(df):
    model = CTGAN(verbose=True, epochs=10)
    df = df.astype("int64", errors="ignore") # Convert all numbers to int64
    model.fit(df.copy())
    return model


def fit_worker(args):
    dataset, name = args
    generator = fit_syn_generator(dataset)
    generator.save(generate_filename(base=name, ext="bin"))
    return generator


def fit_parallel(real_dataset):
    users = real_dataset[["user_id", "Age", "Country"]].drop_duplicates().drop("user_id", axis=1)
    items = real_dataset[
        ["ISBN", "Book_Author", "Publisher", "rating_Avg", "Year_Of_Publication"]
    ].drop_duplicates().drop("ISBN", axis=1)
    print(f"Users shape {users.shape}")
    print(f"Items shape {items.shape}")
    args = [
        (users, "users_generator"),
        (items, "items_generator")
    ]
    with Pool() as p:
        generators = p.map(fit_worker, args)
    return generators


def fit_synthetic_generators(real_dataset):
    users = real_dataset[["user_id", "Age", "Country"]].drop_duplicates().drop("user_id", axis=1)
    items = real_dataset[
        ["ISBN", "Book_Author", "Publisher", "rating_Avg", "Year_Of_Publication"]
    ].drop_duplicates().drop("ISBN", axis=1)
    
    users_generator = fit_syn_generator(users)
    users_generator.save(generate_filename(base="users_generator", ext="bin"))
    items_generator = fit_syn_generator(items)
    users_generator.save(generate_filename(base="items_generator", ext="bin"))
    return users_generator, items_generator


def generate_synthetic_data(users_generator, items_generator, n_users=100, n_items=100):
    syn_users = users_generator.sample(n_users)
    syn_items = items_generator.sample(n_items)
    syn_users["user_id"] = range(len(syn_users))
    syn_items["item_id"] = range(len(syn_items))
    syn_users["_merge_key"] = syn_items["_merge_key"] = 1
    
    syn_dataset = pd.merge(syn_items, syn_users, on="_merge_key")
    syn_dataset = syn_dataset.drop(["_merge_key"], axis=1)
    return syn_dataset


def measure_durations(fn, dataset, n, step=5):
    measures = []
    for i in range(step, n, step):
        start = datetime.datetime.utcnow()
        fn(dataset[:i])
        duration = datetime.datetime.utcnow() - start
        measures.append({"n": i, "time": duration.seconds})
        print(f"Duration {duration.seconds}")
    return pd.DataFrame(measures)


DeepCTR-PyTorch version 0.2.6 detected. Your version is 0.2.5.
Use `pip install -U deepctr-torch` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.2.6


In [17]:
%%time
np.random.seed(SEED)
users_generator, items_generator = fit_parallel(real_dataset)

Users shape (355, 2)
Items shape (379, 4)


  random_state=random_state).fit(X).labels_


Epoch 1, Loss G:  2.9787,Loss D:  0.0014
Epoch 2, Loss G:  3.0279,Loss D:  0.0044
Epoch 3, Loss G:  2.9289,Loss D: -0.0079
Epoch 4, Loss G:  2.9445,Loss D: -0.0136
Epoch 1, Loss G:  5.6613,Loss D:  0.0024
Epoch 5, Loss G:  2.9085,Loss D:  0.0032
Epoch 6, Loss G:  2.9142,Loss D: -0.0016
Epoch 7, Loss G:  2.9044,Loss D: -0.0246
Epoch 8, Loss G:  2.8575,Loss D: -0.0035
Epoch 2, Loss G:  5.6249,Loss D:  0.0024
Epoch 9, Loss G:  2.8517,Loss D: -0.0122
Epoch 10, Loss G:  2.8277,Loss D: -0.0223
Epoch 3, Loss G:  5.6461,Loss D: -0.0048
Epoch 4, Loss G:  5.6377,Loss D: -0.0203
Epoch 5, Loss G:  5.6066,Loss D: -0.0106
Epoch 6, Loss G:  5.6192,Loss D: -0.0051
Epoch 7, Loss G:  5.6368,Loss D: -0.0132
Epoch 8, Loss G:  5.6220,Loss D: -0.0294
Epoch 9, Loss G:  5.5931,Loss D: -0.0271
Epoch 10, Loss G:  5.6109,Loss D: -0.0290
CPU times: user 83.8 ms, sys: 75.3 ms, total: 159 ms
Wall time: 5.7 s


In [18]:
np.random.seed(SEED)
syn_dataset = generate_synthetic_data(users_generator, items_generator)
syn_dataset.head()

Unnamed: 0,Book_Author,Publisher,rating_Avg,Year_Of_Publication,item_id,Age,Country,user_id
0,Ann Rinaldi,Little Brown &amp; Company,5,2005,0,38,usa,0
1,Ann Rinaldi,Little Brown &amp; Company,5,2005,0,34,belgium,1
2,Ann Rinaldi,Little Brown &amp; Company,5,2005,0,44,argentina,2
3,Ann Rinaldi,Little Brown &amp; Company,5,2005,0,40,united kingdom,3
4,Ann Rinaldi,Little Brown &amp; Company,5,2005,0,41,philippines,4


In [19]:
def calculate_rating_matrices(data_loader, real_dataset, syn_dataset):
    p1, p2 = split_dataset(real_dataset)
    deepfm_1 = pretrain_deepfm_model(data_loader=data_loader, train_set=p1, test_set=syn_dataset)
    deepfm_2 = pretrain_deepfm_model(data_loader=data_loader, train_set=p2, test_set=syn_dataset)
    nn_syn_dataset = data_loader.load(syn_dataset)
    
    y1 = deepfm_1.predict(nn_syn_dataset.data)
    y2 = deepfm_2.predict(nn_syn_dataset.data)
    
    matrix_1 = to_rating_matrix(syn_dataset, y1)
    matrix_2 = to_rating_matrix(syn_dataset, y2)
    return matrix_1, matrix_2



In [20]:
data_loader = DeepFMDataLoader(sparse_features=attributes_names["sparse"], dense_features=attributes_names["dense"])

In [21]:
%%time
np.random.seed(SEED)
matrix_1, matrix_2 = calculate_rating_matrices(data_loader, real_dataset, syn_dataset)

cpu
Train on 122 samples, validate on 31 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.6198 - mse:  0.6198 - val_mse:  0.5638
Epoch 2/10
0s - loss:  0.5433 - mse:  0.5433 - val_mse:  0.5020
Epoch 3/10
0s - loss:  0.4814 - mse:  0.4814 - val_mse:  0.4557
Epoch 4/10
0s - loss:  0.4347 - mse:  0.4347 - val_mse:  0.4154
Epoch 5/10
0s - loss:  0.3942 - mse:  0.3942 - val_mse:  0.3755
Epoch 6/10
0s - loss:  0.3543 - mse:  0.3543 - val_mse:  0.3361
Epoch 7/10
0s - loss:  0.3152 - mse:  0.3152 - val_mse:  0.2974
Epoch 8/10
0s - loss:  0.2770 - mse:  0.2770 - val_mse:  0.2598
Epoch 9/10
0s - loss:  0.2402 - mse:  0.2402 - val_mse:  0.2237
Epoch 10/10
0s - loss:  0.2051 - mse:  0.2051 - val_mse:  0.1894
cpu
Train on 122 samples, validate on 31 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.5662 - mse:  0.5662 - val_mse:  0.6441
Epoch 2/10
0s - loss:  0.4983 - mse:  0.4983 - val_mse:  0.5802
Epoch 3/10
0s - loss:  0.4409 - mse:  0.4409 - val_mse:  0.5295
Epoch 4/10
0s - loss:  0.3955 -

In [22]:
matrix_1

item_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.378651,0.368736,0.377377,0.367857,0.368551,0.368349,0.378979,0.378373,0.378024,0.358348,...,0.368541,0.379108,0.368619,0.377718,0.376881,0.368327,0.368393,0.378881,0.378124,0.378821
1,0.388960,0.378957,0.387741,0.377837,0.378760,0.378600,0.389444,0.388602,0.387997,0.368395,...,0.378788,0.389623,0.378875,0.387769,0.386825,0.378439,0.378477,0.389337,0.388411,0.389290
2,0.388645,0.378901,0.387604,0.377967,0.378744,0.378442,0.389152,0.388596,0.388269,0.368384,...,0.378631,0.389435,0.378724,0.387722,0.386964,0.378259,0.378524,0.389185,0.388458,0.389144
3,0.378643,0.368729,0.377369,0.367850,0.368543,0.368341,0.378971,0.378365,0.378016,0.358341,...,0.368534,0.379100,0.368612,0.377710,0.376874,0.368320,0.368386,0.378873,0.378116,0.378814
4,0.388696,0.378483,0.387018,0.377579,0.378288,0.378154,0.388775,0.388205,0.387798,0.368111,...,0.378331,0.388821,0.378417,0.387777,0.386850,0.378329,0.378176,0.388638,0.387840,0.388561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.378686,0.368771,0.377411,0.367891,0.368585,0.368383,0.379013,0.378407,0.378056,0.358382,...,0.368576,0.379142,0.368654,0.377752,0.376915,0.368362,0.368427,0.378916,0.378158,0.378856
96,0.378651,0.368736,0.377377,0.367857,0.368551,0.368349,0.378979,0.378373,0.378024,0.358348,...,0.368541,0.379108,0.368619,0.377718,0.376881,0.368327,0.368393,0.378881,0.378124,0.378821
97,0.388604,0.378771,0.387605,0.377688,0.378546,0.378408,0.389193,0.388240,0.387640,0.368298,...,0.378615,0.389335,0.378686,0.387360,0.386430,0.378178,0.378253,0.389013,0.388080,0.388978
98,0.388720,0.378907,0.387651,0.377939,0.378752,0.378481,0.389243,0.388625,0.388241,0.368366,...,0.378668,0.389500,0.378757,0.387736,0.386938,0.378287,0.378519,0.389245,0.388478,0.389194


## Create response function

In [23]:
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import models, evaluator, trainers, utils
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)



<module 'modules.trainers' from '/Users/vldpro/Workspace/university/recsys/modules/trainers.py'>

In [24]:
class ResponseFunction:
    def __init__(self, deepfm_matrix_1, deepfm_matrix_2):
        assert deepfm_matrix_1.shape == deepfm_matrix_2.shape
        self._deepfm_matrix_1 = deepfm_matrix_1
        self._deepfm_matrix_2 = deepfm_matrix_2
        
    def __call__(self, a1: float, a2: float):
        a3 = max(0.0, 1 - a1 - a2)
        return (
            a1 * self._deepfm_matrix_1
            + a2 * self._deepfm_matrix_2
            + a3 * npr.normal(0, 1, size=self._deepfm_matrix_1.shape)
        )
    

resp_function = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction, args=[matrix_1, matrix_2]
)

## Evaluation

In [25]:
%%time
_evaluators = [
    evaluator.TrainTestExecutorConfig(
        factory=trainers.AutoRecTrainTestExecutor,
        args={"config": {"epoch": 50}},
        model_name="autorec"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.SvdTrainTestExecutor,
        args={},
        model_name="svd"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.KnnTrainTestExecutor,
        args={},
        model_name="knn"
    )
]

np.random.seed(SEED)
_evaluator = evaluator.Evaluator(resp_function, n_proc=4)
_res = _evaluator.evaluate(
    _evaluators, 
    a_sample_rate=3,
    test_size=0.1,
    sample_sizes=[0.1]
)

Subprocess started.Subprocess started.Subprocess started.


Load data finished. Number of users:Load data finished. Number of users:Load data finished. Number of users:   100100100   Number of items:Number of items:Number of items:   100100100


IAutoRec.IAutoRec.

In [26]:
_res

In [27]:
_res.to_csv(generate_filename(base="evalution_result"))

## Visualization

In [28]:
error_surface = utils.group_points_by_minimum_error(_res)
error_surface

In [29]:
from modules import utils
importlib.reload(utils)

import plotly.express as px

for ss in [0.1]:
    fig = px.scatter_3d(
        error_surface[error_surface["sample_size"] == ss], 
        x='a1', 
        y='a2', 
        z='rmse',
        size="rmse",
        size_max=18, 
        opacity=1,
        color="model_name",
        color_continuous_scale=px.colors.sequential.thermal[::-1]
    )

    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
    )

    fig.show("notebook")