In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# Directory (relative to this notebook) that is added to the import path so the
# `torchctr` package can be imported, and under which the data file is looked up.
torchctr_root = '../'

import sys

# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if torchctr_root not in sys.path:
    sys.path.append(torchctr_root)

In [41]:
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.optim import Adam

In [42]:
# Notebook-wide pandas settings.
pd.set_option('mode.chained_assignment', None)  # default is 'warn'
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 100)

In [43]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


## Load data

In [44]:
import joblib

# Load the prepared samples from `<torchctr_root>/data/amazon_data.pkl`.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
df_samples = joblib.load(f'{torchctr_root}/data/amazon_data.pkl')

In [45]:
# Number of distinct reviewerID values in the samples.
len( df_samples['reviewerID'].unique() )

192403

In [46]:
# Number of distinct asin values in the samples.
len( df_samples['asin'].unique() )

63001

In [47]:
# Number of distinct brand values; `unique()` keeps NaN, so a missing brand
# counts as one value here.
len( df_samples['brand'].unique() )

3526

In [48]:
# Number of distinct categories values in the samples.
len( df_samples['categories'].unique() )

801

In [49]:
# sequence features from latest to oldest
# Each history list is reversed and written back to the same column.
# NOTE(review): not idempotent — running this cell a second time flips the
# order back again. Run it exactly once per freshly loaded `df_samples`.
df_samples['his_asin_seq'] = df_samples['his_asin_seq'].map(lambda x: x[::-1])

## Create PyTorch dataset

In [15]:
from torchctr.utils import auto_generate_feature_configs

## Manual alternative, kept for reference (hash buckets / hand-written configs)
# feat_configs = [
#     {"name": "reviewerID", "dtype": "category", "emb_dim": 17,  "hash_buckets": 'auto'},
#     {"name": "asin", "dtype": "category", "emb_dim": 15, 'min_freq': 3},
#
#     {"name": "price", "dtype": "numerical", "norm": 'std'},
#     {"name": "brand", "dtype": "category",  "emb_dim": 12, 'min_freq': 3},
#     {"name": "categories", "dtype": "category",  "emb_dim": 12, 'min_freq': 3},
#
#     {"name": "his_asin_seq", "dtype": "category", "emb_dim": 15, 'min_freq': 3, "islist": True, "maxlen": 256},
# ]

# Label column(s) used as the training target.
target_cols = ['label']

## Auto-generated configs: one entry per input column, inferred from the data
input_cols = ['reviewerID', 'asin', 'price', 'brand', 'categories', 'his_asin_seq']
feat_configs = auto_generate_feature_configs(df_samples[input_cols])
print(feat_configs)

[{'name': 'reviewerID', 'dtype': 'category', 'emb_dim': 17, 'min_freq': 3}, {'name': 'asin', 'dtype': 'category', 'emb_dim': 15, 'min_freq': 3}, {'name': 'price', 'dtype': 'numerical', 'norm': 'std', 'mean': np.float64(74.40153304932919), 'std': np.float64(123.75264929565961)}, {'name': 'brand', 'dtype': 'category', 'emb_dim': 11, 'min_freq': 3}, {'name': 'categories', 'dtype': 'category', 'emb_dim': 9, 'min_freq': 3}, {'name': 'his_asin_seq', 'dtype': 'category', 'islist': True, 'emb_dim': 15, 'min_freq': 3, 'max_len': 256}]


In [16]:
import polars as pl

# The transformer cell that follows is fed a polars frame. Convert only while
# `df_samples` is still a pandas frame: `pl.from_pandas` rejects a polars
# input, so without this guard re-running the cell raises.
if isinstance(df_samples, pd.DataFrame):
    df_samples = pl.from_pandas(df_samples)

In [37]:
from torchctr.transformer import FeatureTransformer

# Only column names are passed; the transformer auto-generates the remaining
# fields of each feature config.
transformer = FeatureTransformer(
    feat_configs=['reviewerID', 'asin', 'price', 'brand', 'categories', 'his_asin_seq'],
    category_min_freq=3,
    verbose=True,
)

# 80/20 shuffled split with reviewerID as the group column
# (presumably keeps each reviewer's rows on one side — TODO confirm).
split_frames = transformer.split(df_samples, test_size=0.2, shuffle=True, group_col='reviewerID')
df_train, df_test = split_frames
print(df_train.shape, df_test.shape)

# Fit on the training rows only, then apply the fitted transform to the test rows.
df_train = transformer.fit_transform(df_train)
df_test = transformer.transform(df_test)



(1351158, 10) (338030, 10)


2024-11-15 18:37:18 torchctr INFO - Auto-generated feature configurations: [{'name': 'reviewerID', 'dtype': 'category', 'emb_dim': 17, 'min_freq': 3}, {'name': 'asin', 'dtype': 'category', 'emb_dim': 15, 'min_freq': 3}, {'name': 'price', 'dtype': 'numerical', 'norm': 'std', 'mean': 74.4471722884851, 'std': 123.8221824638805}, {'name': 'brand', 'dtype': 'category', 'emb_dim': 11, 'min_freq': 3}, {'name': 'categories', 'dtype': 'category', 'emb_dim': 9, 'min_freq': 3}, {'name': 'his_asin_seq', 'dtype': 'category', 'islist': True, 'max_len': 256, 'emb_dim': 15, 'min_freq': 3}]
2024-11-15 18:37:18 torchctr INFO - Processing feature reviewerID...
2024-11-15 18:37:18 torchctr INFO - Feature reviewerID vocab size: None -> 153923
2024-11-15 18:37:18 torchctr INFO - Processing feature asin...
2024-11-15 18:37:18 torchctr INFO - Feature asin vocab size: None -> 62333
2024-11-15 18:37:18 torchctr INFO - Processing feature price...
2024-11-15 18:37:18 torchctr INFO - Feature price updated: mean=74

In [21]:
# Preview the first rows of the transformed test frame.
df_test.head()

reviewerID,asin,unixReviewTime,overall,title,price,brand,categories,label,his_asin_seq,_index
u32,u32,i64,f64,str,f64,u32,u32,i64,list[u32],i64
0,44949,1359331200,5.0,"""AMD FD8350FRHKBOX FX-8350 FX-…",0.922561,2553,108,1,[],0
0,45173,1361145600,4.0,"""TRENDnet 1-Port Print Server T…",-0.261893,719,27,1,[39436],1
0,30796,1361145600,4.0,,0.206122,719,700,1,"[39436, 61681]",2
0,19417,1361145600,5.0,"""Thermaltake Water 2.0 PRO/All …",-0.537453,719,80,1,"[39436, 61681, 45308]",3
0,36971,1389744000,5.0,"""Replacement Battery for PLANTR…",-0.521381,3065,363,1,"[39436, 61681, … 42890]",4


In [22]:
# Imported under an alias: a plain `from datasets import Dataset` would shadow
# the `torch.utils.data.Dataset` imported at the top of the notebook.
from datasets import Dataset as HFDataset

# Columns exposed as torch tensors: every configured feature plus the label column(s).
tensor_cols = [cfg['name'] for cfg in feat_configs] + target_cols

train_dataset = HFDataset.from_polars(df_train).with_format('torch', columns=tensor_cols)
test_dataset = HFDataset.from_polars(df_test).with_format('torch', columns=tensor_cols)

In [23]:
from torchctr.nn.functional import pad_sequences_to_maxlen

def collate_fn(batch):
    """Collate a list of per-sample dicts into ``(batch_features, batch_labels)``.

    Iterates over ``transformer.feat_configs`` in order:

    * ``type == 'dense'``: the feature's values become one float32 tensor; all
      dense tensors are stacked along dim 1 into ``batch_features['dense_features']``.
    * ``type == 'sparse'``: stored under the feature's own name. Features flagged
      ``islist`` go through ``pad_sequences_to_maxlen`` (padding value -100,
      ``max_length=128``); the others become a long tensor with one value per row.
    * any other ``type`` is skipped.

    ``batch_labels`` is a float32 tensor built from ``sample['label']``, one value per row.
    """
    dense_columns = []
    sparse_tensors = {}

    for cfg in transformer.feat_configs:
        kind = cfg['type']

        if kind == 'dense':
            values = [sample[cfg['name']] for sample in batch]
            dense_columns.append(torch.tensor(values, dtype=torch.float32))
            continue

        if kind != 'sparse':
            continue

        if cfg.get('islist'):
            sequences = [sample[cfg['name']] for sample in batch]
            # pad sequences
            encoded = pad_sequences_to_maxlen(sequences, batch_first=True, padding_value=-100, max_length=128)
        else:
            encoded = torch.tensor([[sample[cfg['name']]] for sample in batch], dtype=torch.long)
        sparse_tensors[cfg['name']] = encoded

    batch_features = {'dense_features': torch.stack(dense_columns, dim=1)}
    batch_features.update(sparse_tensors)

    label_rows = [[sample['label']] for sample in batch]
    batch_labels = torch.tensor(label_rows, dtype=torch.float32)

    return batch_features, batch_labels

# Both loaders share the batch size and collate function; only the training
# loader shuffles.
loader_kwargs = dict(batch_size=512, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [61]:
# Number of training batches per epoch.
print( len(train_dataloader) )

# Pull one shuffled 2-sample batch from the test set to inspect the collated tensors.
peek_loader = DataLoader(test_dataset, batch_size=2, num_workers=0, shuffle=True, collate_fn=collate_fn)
for features, labels in peek_loader:
    print(features)
    print(labels)
    break

2642
{'dense_features': tensor([[-0.4717],
        [ 3.0279]]), 'reviewerID': tensor([[0],
        [0]]), 'asin': tensor([[15335],
        [28189]]), 'brand': tensor([[2673],
        [ 982]]), 'categories': tensor([[260],
        [222]]), 'his_asin_seq': tensor([[22703,  8495, 28753,  8802, 36551, 47189,  8762, 31913, 53720,  8971,
         22570, 48729, 34835, 46590, 33885,  4333, 53178,  5877,  9980, 33010,
         56951, 49439, 59185, 35703, 20967, 59088, 56510, 49491, 15631, 58362,
         12246,  1024, 17604, 30655, 33966, 56054,  8261, 60505, 57148, 35790,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -10

## Train Model

In [62]:
from torchctr.models import DNN

# Hidden-layer widths passed to the DNN.
dnn_hidden_units = [128, 64, 32]

# Build the model from the fitted transformer's feature configs and place it
# on the selected device.
model = DNN(transformer.feat_configs, hidden_units=dnn_hidden_units).to(device)
print(model)

2024-11-15 16:31:57 torchctr INFO - Model Input: dense_size=1, sparse_size=67


DNN(
  (embeddings): ModuleDict(
    (reviewerID): Embedding(153923, 17)
    (asin): Embedding(62384, 15)
    (brand): Embedding(3504, 11)
    (categories): Embedding(800, 9)
    (his_asin_seq): Embedding(61926, 15)
  )
  (tower): Sequential(
    (0): Linear(in_features=68, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [63]:
# Adam with a very small weight decay.
optimizer = Adam(params=model.parameters(), lr=0.002, weight_decay=1e-9)

# Multiply the learning rate by 0.8 every `len(train_dataloader)` scheduler steps
# (once per epoch if the trainer steps the scheduler per batch — TODO confirm).
steps_per_epoch = len(train_dataloader)
lr_scd = lr_scheduler.StepLR(optimizer, step_size=steps_per_epoch, gamma=0.8)

In [64]:
from torchctr.trainer import Trainer

# Training setup: up to 5 epochs, early_stopping_rounds=3, checkpoints under ./ckpt/.
trainer_options = {
    'optimizer': optimizer,
    'lr_scheduler': lr_scd,
    'max_epochs': 5,
    'early_stopping_rounds': 3,
    'save_ckpt_path': './ckpt/',
}
trainer = Trainer(model, **trainer_options)

# Optionally add init_ckpt_path='./ckpt/' to the call below
# (presumably resumes from a saved checkpoint — TODO confirm).
model = trainer.fit(train_dataloader, eval_dataloader=test_dataloader, ret_model='final')

2024-11-15 16:32:29 torchctr INFO - [Validation] Epoch: 0/5, Validation Loss: {'loss': 0.7817239257702349}
2024-11-15 16:32:29 torchctr INFO - Learning rate: 0.002
2024-11-15 16:32:29 torchctr INFO - [Training] Epoch: 1/5 iter 0/2642, Training Loss: {'loss': 0.8893228769302368}
2024-11-15 16:32:36 torchctr INFO - [Training] Epoch: 1/5 iter 100/2642, Training Loss: {'loss': 0.5205774176120758}
2024-11-15 16:32:43 torchctr INFO - [Training] Epoch: 1/5 iter 200/2642, Training Loss: {'loss': 0.45305554926395414}
2024-11-15 16:32:51 torchctr INFO - [Training] Epoch: 1/5 iter 300/2642, Training Loss: {'loss': 0.42856981287399926}
2024-11-15 16:32:58 torchctr INFO - [Training] Epoch: 1/5 iter 400/2642, Training Loss: {'loss': 0.4170663586258888}
2024-11-15 16:33:05 torchctr INFO - [Training] Epoch: 1/5 iter 500/2642, Training Loss: {'loss': 0.4071468803882599}
2024-11-15 16:33:12 torchctr INFO - [Training] Epoch: 1/5 iter 600/2642, Training Loss: {'loss': 0.40109682058294616}
2024-11-15 16:33

In [65]:
# Reload the saved checkpoint and copy its model weights into `model`.
ckpt = trainer.load_ckpt('./ckpt')
restored_state = ckpt['model'].state_dict()
model.load_state_dict(restored_state)

  
2024-11-15 16:46:29 torchctr INFO - Loaded model state_dict from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded model.training from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded model.feat_configs from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded optimizer = Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differ... from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded lr_scheduler = <torch.optim.lr_scheduler.StepLR object at 0x7f1ebca7bef0> from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded logger = <Logger torchctr (INFO)> from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded ckpt_file_prefix = checkpoint from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded num_epoch = 4 from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded global_steps = 10568 from checkpoint.
2024-11-15 16:46:29 torchctr INFO - Loaded save_ckpt_path = ./ckpt/ from checkpoint.
2024-11-15 16:46:29 torchctr INF

<All keys matched successfully>

In [66]:
# Score the whole test set and collect per-sample predictions and labels as
# 1-D numpy arrays.
model.eval()  # the model contains Dropout and BatchNorm layers: switch them to inference behaviour
model_device = next(model.parameters()).device

test_preds = []
test_labels = []
with torch.no_grad():  # evaluation only: do not build an autograd graph for every batch
    for features, labels in test_dataloader:
        # Move each input tensor to the device the model's parameters are on.
        features = {name: tensor.to(model_device) for name, tensor in features.items()}
        outputs = model(features)
        # Bring each batch back to the CPU right away so device memory is not held.
        test_preds.append(outputs[:, 0].cpu())
        test_labels.append(labels[:, 0])
test_preds = torch.concat(test_preds, dim=0).numpy()
test_labels = torch.concat(test_labels, dim=0).numpy()

In [67]:
# Sanity check: predictions and labels should have the same shape.
print(test_preds.shape, test_labels.shape)

(336650,) (336650,)


In [68]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the model outputs against the test labels.
auc_score = roc_auc_score(y_true=test_labels, y_score=test_preds)
print("AUC Score:", auc_score)

AUC Score: 0.6764908327220651


## Generate and test the service

In [54]:
# Start the model server first: run the following in a terminal from the project root.
# !python -m torchctr.serving.serve --name dnn --path examples/ckpt/checkpoint.010568.ckpt --dep_paths examples

import os
# Probe the health endpoint of the served `dnn` model. `ret` is the status
# returned by os.system for the curl command; it is checked against 0 before
# the predict request is sent.
ret = os.system('curl http://localhost:8000/dnn/health')

{"status":"ok"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    15  100    15    0     0   2461      0 --:--:-- --:--:-- --:--:--  3000


In [66]:
def test_predict(df, name):
    ''' 
    Test the prediction of the model.
    First launch the server by `python -m torchctr.serve --name {name} --path path/to/model`
    '''
    import requests
    import json

    data = {
        'features': df.to_dict(orient='list')
    }
    print(f"Data: {data}")
    data_json = json.dumps(data)
    response = requests.post(
        f"http://localhost:8000/{name}/predict", 
        data=data_json, 
        headers={"Content-Type": "application/json"}
    )

    if response.status_code == 200:
        prediction = response.json()
        print(f"Prediction: {prediction}")
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")

In [67]:
# Only send a predict request when the earlier health-check command returned 0.
if ret == 0:
    test_predict(df_samples.sample(3), name='dnn')

Data: {'features': {'reviewerID': ['A2VXD6WHKCT5GC', 'A1TS6B7G1G5ESX', 'A11LO0DDZ9YT2A'], 'asin': ['B002RO8YK0', 'B0031RGEUE', 'B006QB1RPY'], 'unixReviewTime': [1317859200, 1382745600, 1361145600], 'overall': [5.0, 4.0, 5.0], 'title': ['50 Clear ClamShell CD DVD Case, Clam Shells', 'Olympus Stylus Tough 6020 14MP Digital Camera with 5x Wide Angle Zoom and 2.7 inch LCD (Green)', 'ASUS RT-N66U Dual-Band Wireless-N900 Gigabit Router'], 'price': [9.0, 188.99, 127.95], 'brand': ['mediaxpo', nan, 'Asus'], 'categories': ['Disc Jewel Cases', 'Cameras', 'Routers'], 'label': [1, 1, 1], 'his_asin_seq': [['B001NPEBGU', 'B0013EMKXC', 'B000R3CQM0', 'B0001FTVDQ'], ['B001V9KG0I'], ['B0013DZ9C2', 'B004V4N3NW', 'B004CQZVX4', 'B0040QE98O']]}}
Prediction: [[1.8661800622940063], [1.6356334686279297], [3.7728238105773926]]
