In [38]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)



In [39]:
SEED = 2021
BOOKS_DATASET_PATH = "books_dataset_cleaned.csv"

## Load and transform restaurants data

In [44]:
real_dataset = pd.read_csv(BOOKS_DATASET_PATH)
real_dataset = real_dataset.drop(["Location"], axis=1)
real_dataset = real_dataset.sample(int(len(real_dataset) * 0.001))
real_dataset.head()

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,rating_sum,Count_All_Rate,Book_Title,Book_Author,Year_Of_Publication,Publisher
334121,160643,22.0,france,2266065998,5,5.0,5,4,L'Empire dÃ?Â©barque,Marion Zimmer Bradley,1999.0,Pocket
115510,174601,37.0,usa,505525178,3,7.3,73,22,Improper English,Katie Macalister,2003.0,Love Spell
191683,113259,30.0,usa,446611913,10,8.0,240,67,Up Country,Nelson DeMille,2003.0,Warner Vision
151192,171904,28.0,spain,739307320,8,7.666667,23,5,The Devil Wears Prada,LAUREN WEISBERGER,2003.0,Random House Audio
173793,30711,32.0,australia,64472051,7,8.166667,49,13,The Hounds of the Morrigan,Pat O'Shea,1999.0,HarperTrophy


In [45]:
real_dataset["ISBN"] = real_dataset["ISBN"].astype('category').cat.codes

In [46]:
real_dataset["book_rating"] = real_dataset["book_rating"] / 10

In [47]:
real_dataset.dtypes

user_id                  int64
Age                    float64
Country                 object
ISBN                     int16
book_rating            float64
rating_Avg             float64
rating_sum               int64
Count_All_Rate           int64
Book_Title              object
Book_Author             object
Year_Of_Publication    float64
Publisher               object
dtype: object

In [48]:
real_dataset

Unnamed: 0,user_id,Age,Country,ISBN,book_rating,rating_Avg,rating_sum,Count_All_Rate,Book_Title,Book_Author,Year_Of_Publication,Publisher
334121,160643,22.0,france,356,0.5,5.000000,5,4,L'Empire dÃ?Â©barque,Marion Zimmer Bradley,1999.0,Pocket
115510,174601,37.0,usa,198,0.3,7.300000,73,22,Improper English,Katie Macalister,2003.0,Love Spell
191683,113259,30.0,usa,173,1.0,8.000000,240,67,Up Country,Nelson DeMille,2003.0,Warner Vision
151192,171904,28.0,spain,270,0.8,7.666667,23,5,The Devil Wears Prada,LAUREN WEISBERGER,2003.0,Random House Audio
173793,30711,32.0,australia,21,0.7,8.166667,49,13,The Hounds of the Morrigan,Pat O'Shea,1999.0,HarperTrophy
...,...,...,...,...,...,...,...,...,...,...,...,...
249110,60244,47.0,usa,126,0.9,9.000000,9,5,The Shelter of Each Other: Rebuilding Our Fami...,Mary Bray Pipher,1996.0,Putnam Publishing Group
77793,83363,34.0,usa,28,1.0,7.909091,174,47,The Pearl,John Steinbeck,2000.0,Penguin USA (Paper)
36868,186784,17.0,usa,146,0.9,9.033981,1861,334,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003.0,Scholastic
259705,98391,52.0,usa,199,0.9,7.500000,60,25,Love Bites,Lynsay Sands,2004.0,Love Spell


## Training DeepFM components


In [49]:
def generate_filename(*, base, ext="csv"):
    now = datetime.datetime.utcnow()
    return "exp2_" + base + "_" + now.strftime("%d%m%y_%H%M%S") + "." + ext

In [50]:
attributes_names = {
    "dense": [
        "Age",
        "rating_Avg",
        "rating_sum",
        "Count_All_Rate",
        "Year_Of_Publication"
    ],
    "sparse": [
        "Book_Title",
        "Book_Author",
        "Country",
        "Publisher",
    ]
}

In [51]:
import dataclasses
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from modules.models import DeepFmModel
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


@dataclasses.dataclass
class DeepFmInputDataset:
    data: object
    dnn_feats: object
    linear_feats: object
    feat_names: object


class DeepFMDataLoader:
    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features
        
    def load(self, dataset):
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]
        
        for feat in self._sparse_feats:
            encoder = LabelEncoder()
            nn_input[feat] = encoder.fit_transform(nn_input[feat])
            
        mms = MinMaxScaler(feature_range=(0,1))
        nn_input[self._dense_feats] = mms.fit_transform(nn_input[self._dense_feats])
        
        # problems may be here
        sparse_feature_columns = [
            SparseFeat(feat, vocabulary_size=nn_input[feat].nunique(), embedding_dim=4) 
            for i, feat in enumerate(self._sparse_feats)
        ]

        dense_feature_columns = [DenseFeat(feat, 1,) for feat in self._dense_feats]
        
        dnn_feat_cols = sparse_feature_columns + dense_feature_columns
        linear_feat_cols = sparse_feature_columns + dense_feature_columns
        
        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        input_dataset = DeepFmInputDataset(
            data=nn_input,
            dnn_feats=dnn_feat_cols,
            linear_feats=linear_feat_cols,
            feat_names=feat_names
        )
        return input_dataset

In [52]:
def to_rating_matrix(dataset, predicted_response):
    result = pd.DataFrame()
    result["rating"] = predicted_response.reshape((len(predicted_response),))
    result["user_id"] = dataset["user_id"]
    result["item_id"] = dataset["item_id"]
    matrix = result.pivot(index="user_id", columns="item_id", values="rating")
    return matrix


In [53]:
def merge_feats(feats_a, feats_b):
    assert len(feats_a) == len(feats_b)
    merged = []
    for feat_a, feat_b in zip(feats_a, feats_b):
        if isinstance(feat_a, DenseFeat):
            continue
        if feat_a.vocabulary_size >= feat_b.vocabulary_size:
            merged.append(feat_a)
        else:
            merged.append(feat_b)
    return merged

In [54]:
def train_deepfm(feats, feat_names, x, y):
    deepfm = DeepFmModel(feats, feats, feat_names)
    train_set, test_set = train_test_split(x, test_size=0.2)
    deepfm.train(train_set, target_values=y[:len(train_set)])
    return deepfm
    

In [55]:
def pretrain_deepfm_model(*, data_loader, train_set, test_set):
    nn_train_input = data_loader.load(train_set)
    nn_test_input = data_loader.load(test_set)
    y = train_set["book_rating"].values
    
    merged_feats = merge_feats(nn_train_input.dnn_feats, nn_test_input.dnn_feats)
    deepfm = train_deepfm(merged_feats, nn_train_input.feat_names, x=nn_train_input.data, y=y)
    return deepfm



In [56]:
def split_dataset(dataset):
    n = len(dataset)
    mid = int(n / 2)
    return dataset[:mid], dataset[mid:]
    

In [57]:
from sdv.tabular import CTGAN
import datetime


def fit_syn_generator(df):
    model = CTGAN()
    df = df.astype("int64", errors="ignore") # Convert all numbers to int64
    model.fit(df.copy())
    return model


def fit_synthetic_generators(real_dataset):
    users = real_dataset[["user_id", "Age", "Country"]].drop_duplicates().drop("user_id", axis=1)
    items = real_dataset[
        ["ISBN", "Book_Title", "Book_Author", "Publisher", "rating_Avg", "rating_sum", "Count_All_Rate", "Year_Of_Publication"]
    ].drop_duplicates().drop("ISBN", axis=1)
    
    users_generator = fit_syn_generator(users)
    users_generator.save(generate_filename(base="users_generator", ext="bin"))
    items_generator = fit_syn_generator(items)
    users_generator.save(generate_filename(base="items_generator", ext="bin"))
    return users_generator, items_generator


def generate_synthetic_data(users_generator, items_generator, n_users=100, n_items=100):
    syn_users = users_generator.sample(n_users)
    syn_items = items_generator.sample(n_items)
    syn_users["user_id"] = range(len(syn_users))
    syn_items["item_id"] = range(len(syn_items))
    syn_users["_merge_key"] = syn_items["_merge_key"] = 1
    
    syn_dataset = pd.merge(syn_items, syn_users, on="_merge_key")
    syn_dataset = syn_dataset.drop(["_merge_key"], axis=1)
    return syn_dataset


def measure_durations(fn, dataset, n, step=5):
    measures = []
    for i in range(step, n, step):
        start = datetime.datetime.utcnow()
        fn(dataset[:i])
        duration = datetime.datetime.utcnow() - start
        measures.append({"n": i, "time": duration.seconds})
        print(f"Duration {duration.seconds}")
    return pd.DataFrame(measures)


In [58]:
%%time
np.random.seed(SEED)
users_generator, items_generator = fit_synthetic_generators(real_dataset)

  random_state=random_state).fit(X).labels_


CPU times: user 3min 41s, sys: 50.2 s, total: 4min 32s
Wall time: 4min 49s


In [59]:
np.random.seed(SEED)
syn_dataset = generate_synthetic_data(users_generator, items_generator)
syn_dataset.head()

Unnamed: 0,Book_Title,Book_Author,Publisher,rating_Avg,rating_sum,Count_All_Rate,Year_Of_Publication,item_id,Age,Country,user_id
0,The Protein Power Lifeplan,Jack Kerouac,Dover Publications,4,-37,2,2000,0,37,sweden,0
1,The Protein Power Lifeplan,Jack Kerouac,Dover Publications,4,-37,2,2000,0,48,switzerland,1
2,The Protein Power Lifeplan,Jack Kerouac,Dover Publications,4,-37,2,2000,0,38,germany,2
3,The Protein Power Lifeplan,Jack Kerouac,Dover Publications,4,-37,2,2000,0,39,italy,3
4,The Protein Power Lifeplan,Jack Kerouac,Dover Publications,4,-37,2,2000,0,39,canada,4


In [60]:
def calculate_rating_matrices(data_loader, real_dataset, syn_dataset):
    p1, p2 = split_dataset(real_dataset)
    deepfm_1 = pretrain_deepfm_model(data_loader=data_loader, train_set=p1, test_set=syn_dataset)
    deepfm_2 = pretrain_deepfm_model(data_loader=data_loader, train_set=p2, test_set=syn_dataset)
    nn_syn_dataset = data_loader.load(syn_dataset)
    
    y1 = deepfm_1.predict(nn_syn_dataset.data)
    y2 = deepfm_2.predict(nn_syn_dataset.data)
    
    matrix_1 = to_rating_matrix(syn_dataset, y1)
    matrix_2 = to_rating_matrix(syn_dataset, y2)
    return matrix_1, matrix_2



In [61]:
data_loader = DeepFMDataLoader(sparse_features=attributes_names["sparse"], dense_features=attributes_names["dense"])

In [62]:
%%time
np.random.seed(SEED)
matrix_1, matrix_2 = calculate_rating_matrices(data_loader, real_dataset, syn_dataset)

cpu
Train on 121 samples, validate on 31 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.6530 - mse:  0.6530 - val_mse:  0.5470
Epoch 2/10
0s - loss:  0.5729 - mse:  0.5729 - val_mse:  0.4870
Epoch 3/10
0s - loss:  0.5080 - mse:  0.5080 - val_mse:  0.4402
Epoch 4/10
0s - loss:  0.4570 - mse:  0.4570 - val_mse:  0.4046
Epoch 5/10
0s - loss:  0.4177 - mse:  0.4177 - val_mse:  0.3713
Epoch 6/10
0s - loss:  0.3809 - mse:  0.3809 - val_mse:  0.3378
Epoch 7/10
0s - loss:  0.3441 - mse:  0.3441 - val_mse:  0.3043
Epoch 8/10
0s - loss:  0.3077 - mse:  0.3077 - val_mse:  0.2711
Epoch 9/10
0s - loss:  0.2719 - mse:  0.2719 - val_mse:  0.2385
Epoch 10/10
0s - loss:  0.2370 - mse:  0.2370 - val_mse:  0.2068
cpu
Train on 122 samples, validate on 31 samples, 1 steps per epoch
Epoch 1/10
0s - loss:  0.5708 - mse:  0.5708 - val_mse:  0.5696
Epoch 2/10
0s - loss:  0.4842 - mse:  0.4842 - val_mse:  0.4950
Epoch 3/10
0s - loss:  0.4130 - mse:  0.4130 - val_mse:  0.4307
Epoch 4/10
0s - loss:  0.3521 -

In [63]:
matrix_1

item_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.346836,0.338359,0.347897,0.337266,0.347657,0.327909,0.347032,0.347389,0.327288,0.345246,...,0.317865,0.348191,0.346392,0.337118,0.327928,0.337685,0.337359,0.347435,0.317859,0.347308
1,0.348118,0.338478,0.348816,0.338067,0.348511,0.328199,0.347718,0.348001,0.327929,0.344994,...,0.318365,0.348809,0.346925,0.337883,0.328507,0.338424,0.337587,0.348092,0.318359,0.347880
2,0.347331,0.338617,0.348321,0.337941,0.348107,0.328418,0.347565,0.347823,0.327830,0.345436,...,0.318233,0.348597,0.346888,0.337410,0.328347,0.338227,0.337455,0.348250,0.318227,0.347791
3,0.347598,0.338532,0.348653,0.337776,0.348349,0.328107,0.347555,0.347879,0.327491,0.345175,...,0.318076,0.348982,0.346569,0.337556,0.328376,0.338447,0.337295,0.347951,0.318069,0.347838
4,0.347968,0.338774,0.348928,0.338017,0.348639,0.328346,0.347804,0.348125,0.327766,0.345410,...,0.318374,0.349239,0.346828,0.337905,0.328644,0.338677,0.337618,0.348128,0.318368,0.348082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.347766,0.339243,0.349456,0.338055,0.348833,0.328276,0.348026,0.348618,0.327801,0.345699,...,0.318450,0.349459,0.347035,0.337940,0.328876,0.339010,0.338059,0.348192,0.318444,0.348315
96,0.337674,0.328865,0.338867,0.327937,0.338438,0.318254,0.337729,0.338173,0.317875,0.335466,...,0.308400,0.338854,0.336999,0.327784,0.318575,0.328461,0.327920,0.338032,0.308394,0.337948
97,0.347751,0.339214,0.349382,0.337792,0.348742,0.328084,0.347849,0.348506,0.327671,0.345681,...,0.318432,0.349322,0.346916,0.338018,0.328783,0.338789,0.338247,0.347766,0.318426,0.348149
98,0.347683,0.338199,0.348339,0.337412,0.348050,0.327731,0.347178,0.347532,0.327447,0.344810,...,0.318046,0.348353,0.346466,0.337636,0.328087,0.337828,0.337528,0.347274,0.318040,0.347379


## Create response function

In [64]:
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import models, evaluator, trainers, utils
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)



<module 'modules.trainers' from '/Users/vldpro/Workspace/university/recsys/modules/trainers.py'>

In [65]:
class ResponseFunction:
    def __init__(self, deepfm_matrix_1, deepfm_matrix_2):
        assert deepfm_matrix_1.shape == deepfm_matrix_2.shape
        self._deepfm_matrix_1 = deepfm_matrix_1
        self._deepfm_matrix_2 = deepfm_matrix_2
        
    def __call__(self, a1: float, a2: float):
        a3 = max(0.0, 1 - a1 - a2)
        return (
            a1 * self._deepfm_matrix_1
            + a2 * self._deepfm_matrix_2
            + a3 * npr.normal(0, 1, size=self._deepfm_matrix_1.shape)
        )
    

resp_function = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction, args=[matrix_1, matrix_2]
)

## Evaluation

In [66]:
%%time
_evaluators = [
    evaluator.TrainTestExecutorConfig(
        factory=trainers.AutoRecTrainTestExecutor,
        args={"config": {"epoch": 50}},
        model_name="autorec"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.SvdTrainTestExecutor,
        args={},
        model_name="svd"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.KnnTrainTestExecutor,
        args={},
        model_name="knn"
    )
]

np.random.seed(SEED)
_evaluator = evaluator.Evaluator(resp_function, n_proc=4)
_res = _evaluator.evaluate(
    _evaluators, 
    a_sample_rate=10,
    test_size=0.1,
    sample_sizes=[0.1]
)

Subprocess started.Subprocess started.Subprocess started.
Subprocess started.


Load data finished. Number of users: 100 Number of items: 100
Load data finished. Number of users: 100 Number of items: 100
Load data finished. Number of users: 100Load data finished. Number of users: Number of items: 100 
100 Number of items: 100
IAutoRec.IAutoRec.IAutoRec.


IAutoRec.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Epoch: 0000; Epoch: 0000; Epoch: 0000; Epoch: 0000; RMSE:0.9112774094228897; MAE:0.7271760487266331RMSE:0.9917435703405726; MAE:0.7714118467206817

Epoch: 0003; Epoch: 0003; RMSE:0.9112774094228897; MAE:0.7271760487266331RMSE:0.6987835147413282; MAE:0.553635586223501

RMSE:0.9917435703405726; MAE:0.7714118467206817Epoch: 0003; 
Epoch: 0006; RMSE:0.9917435703405726; MAE:0.7714118467206817RMSE:0.6987835147413282; MAE:0.553635586223501Epoch: 0006; 
Epoch: 0009; RMSE:0.9917435703405726; MAE:0.7714118467206817

Epoch: 0012; Epoch: 0006; RMSE:0.7805905768541495; MAE:0.6147580099384602RMSE:0.9112774094228897; MAE:0.7271760487266331
RMSE:0.6987835147413282; MAE:0.553635586223501
Epoch: 0003; 
Epoch: 0009; Epoch: 0009; RMSE:0.6987835147413282; MAE:0.553635586223501RMSE:0.9112774094228897; MAE:0.7271760487266331
RMSE:0.7805905768541495; MAE:0.6147580099384602Epoch: 0012; 

Epoch: 0006; RMSE:0.9917435703405726; MAE:0.7714118467206817Epoch: 0012; 
Epoch: 0015; RMSE:0.7805905768541495; MAE:0.614758

In [67]:
_res

In [68]:
_res.to_csv(generate_filename(base="evalution_result"))

## Visualization

In [69]:
error_surface = utils.group_points_by_minimum_error(_res)
error_surface

In [70]:
from modules import utils
importlib.reload(utils)

import plotly.express as px

for ss in [0.1]:
    fig = px.scatter_3d(
        error_surface[error_surface["sample_size"] == ss], 
        x='a1', 
        y='a2', 
        z='rmse',
        size="rmse",
        size_max=18, 
        opacity=1,
        color="model_name",
        color_continuous_scale=px.colors.sequential.thermal[::-1]
    )

    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
    )

    fig.show("notebook")