In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to local .py files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.optim import Adam

In [3]:
# Notebook-wide pandas settings: silence the chained-assignment warning
# (pandas' default is 'warn') and widen the display limits for large frames.
for option_name, option_value in (
    ('mode.chained_assignment', None),
    ('display.max_rows', 999),
    ('display.max_columns', 100),
):
    pd.set_option(option_name, option_value)

In [4]:
# Seed the Python, NumPy and PyTorch RNGs up front so that a fresh
# "Restart Kernel -> Run All" repeats the same shuffling and weight initialisation.
# NOTE(review): whether the project helpers (e.g. core.sample.traintest_split)
# draw from these global RNGs is not visible in this notebook - confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## Load data

In [5]:
import joblib

# Load the prepared samples frame from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only point this at an artefact you trust.
samples_path = 'data/amazon_data.pkl'
df_samples = joblib.load(samples_path)

In [6]:
# Number of distinct reviewer IDs (dropna=False keeps a missing value counted, as len(unique()) does).
df_samples['reviewerID'].nunique(dropna=False)

192403

In [7]:
# Number of distinct items (dropna=False keeps a missing value counted, as len(unique()) does).
df_samples['asin'].nunique(dropna=False)

63001

In [8]:
# Number of distinct brands (dropna=False keeps a missing value counted, as len(unique()) does).
df_samples['brand'].nunique(dropna=False)

3526

In [9]:
# Number of distinct category values (dropna=False keeps a missing value counted, as len(unique()) does).
df_samples['categories'].nunique(dropna=False)

801

In [10]:
# Reverse every history sequence so it is ordered from latest to oldest.
# The reversal overwrites the column, so running this cell a second time would
# flip the sequences back. A marker stored in DataFrame.attrs records that this
# particular frame has already been reversed, which makes the cell safe to re-run;
# reloading df_samples yields a fresh frame without the marker.
if not df_samples.attrs.get('his_asin_seq_reversed', False):
    df_samples['his_asin_seq'] = df_samples['his_asin_seq'].map(lambda seq: seq[::-1])
    df_samples.attrs['his_asin_seq_reversed'] = True

## Create PyTorch dataset

In [11]:
from core.utils import auto_generate_feature_configs

# Two hand-written alternatives to the auto-generated configuration are kept
# below for reference only; both are disabled.
#
# Alternative 1 - hash buckets:
# feat_configs = [
#     {"name": "reviewerID", "dtype": "category", "emb_dim": 12, "min_freq": 3, "hash_buckets": 1000000},
#     {"name": "asin", "dtype": "category", "emb_dim": 12, "min_freq": 3, "hash_buckets": 1000000},
#
#     {"name": "price", "dtype": "numerical", "norm": "std"},
#     {"name": "brand", "dtype": "category", "min_freq": 3, "emb_dim": 12},
#     {"name": "categories", "dtype": "category", "min_freq": 3, "emb_dim": 12},
#
#     {"name": "his_asin_seq", "dtype": "category", "islist": True, "min_freq": 3, "emb_dim": 12, "hash_buckets": 1000000},
# ]
#
# Alternative 2 - dynamic embedding:
# feat_configs = [
#     {"name": "reviewerID", "dtype": "category", "emb_dim": 12, "min_freq": 3},
#     {"name": "asin", "dtype": "category", "emb_dim": 12, "min_freq": 3},
#
#     {"name": "price", "dtype": "numerical", "norm": "std"},
#     {"name": "brand", "dtype": "category", "min_freq": 3, "emb_dim": 12},
#     {"name": "categories", "dtype": "category", "min_freq": 3, "emb_dim": 12},
#
#     {"name": "his_asin_seq", "dtype": "category", "islist": True, "min_freq": 3, "emb_dim": 12},
# ]

# Active path: let the helper derive one config per input column.
# NOTE(review): the configs are generated from the full df_samples, i.e. before the
# train/test split performed later in this notebook - confirm that any statistics
# stored in them (the printed price config carries a mean and std) are meant to
# include the held-out rows.
feature_cols = ['reviewerID', 'asin', 'price', 'brand', 'categories', 'his_asin_seq']
feat_configs = auto_generate_feature_configs(df_samples[feature_cols])
print(feat_configs)

# Columns used as prediction targets.
target_cols = ['label']

[{'name': 'reviewerID', 'dtype': 'category', 'emb_dim': 17, 'min_freq': 3}, {'name': 'asin', 'dtype': 'category', 'emb_dim': 15, 'min_freq': 3}, {'name': 'price', 'dtype': 'numerical', 'norm': 'std', 'mean': 74.40153304932919, 'std': 123.75264929566384}, {'name': 'brand', 'dtype': 'category', 'emb_dim': 11, 'min_freq': 3}, {'name': 'categories', 'dtype': 'category', 'emb_dim': 9, 'min_freq': 3}, {'name': 'his_asin_seq', 'dtype': 'category', 'islist': True, 'emb_dim': 15, 'min_freq': 3, 'max_len': 256}]


In [12]:
from core.sample import traintest_split

# Hold out 20% of the samples for evaluation, with shuffling enabled.
# group_id='reviewerID' presumably keeps each reviewer's rows on one side of the
# split - TODO confirm against core.sample.traintest_split.
split_options = dict(test_size=0.2, shuffle=True, group_id='reviewerID')
df_train, df_test = traintest_split(df_samples, **split_options)
print(len(df_train), len(df_test))

1352538 336650


In [13]:
# Disabled alternative: transform both frames explicitly with FeatureTransformer.
# The dataset cell further down is instead given the raw frames (is_raw=True),
# so these lines are kept only as a reference.
# from core.dataset import FeatureTransformer

# transformer = FeatureTransformer(feat_configs)

# df_train = transformer.transform(df_train, is_train=True, n_jobs=4)
# df_test = transformer.transform(df_test, is_train=False, n_jobs=4)

In [14]:
from core.dataset import DataFrameDataset

# Both datasets receive the untransformed frames (is_raw=True) and share the same
# feat_configs; only the training one is built with is_train=True and verbose logging.
shared_dataset_kwargs = dict(is_raw=True, n_jobs=1)
train_dataset = DataFrameDataset(
    df_train, feat_configs, target_cols,
    is_train=True, verbose=True, **shared_dataset_kwargs,
)
test_dataset = DataFrameDataset(
    df_test, feat_configs, target_cols,
    is_train=False, **shared_dataset_kwargs,
)

==> Feature transforming (is_train=True), note that feat_configs will be updated when is_train=True...
Processing feature reviewerID...
Converting category reviewerID to indices...
Feature reviewerID vocab size: None -> 153923
Processing feature asin...
Converting category asin to indices...
Feature asin vocab size: None -> 62384
Processing feature price...
Feature price mean: 74.40153304932919, std: 123.75264929566384, min: 0.01, max: 999.99
Processing feature brand...
Converting category brand to indices...
Feature brand vocab size: None -> 3503
Processing feature categories...
Converting category categories to indices...
Feature categories vocab size: None -> 800
Processing feature his_asin_seq...
Converting category his_asin_seq to indices...
Feature his_asin_seq vocab size: None -> 61925
==> Feature transforming (is_train=True) done...
==> Dense features: ['price']
==> Sparse features: ['reviewerID', 'asin', 'brand', 'categories']
==> Sequence dense features: []
==> Sequence spars

In [15]:
# Disabled debug check: compare the largest vocabulary index of feat_configs[3]
# with that feature's num_embeddings.
# max([v['idx'] for k,v in feat_configs[3]['vocab'].items()])
# feat_configs[3]['num_embeddings']

In [16]:
# Show the first five training rows.
# NOTE(review): the recorded output shows integer-encoded IDs and a standardised
# price, which suggests DataFrameDataset transformed df_train in place - confirm.
df_train.head(n=5)

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title,price,brand,categories,label,his_asin_seq
314370,69777,1870,1375660800,4.0,B+W 58mm Kaesemann Circular Polarizer with Mul...,0.117965,153,197,1,[102]
538816,47184,140,1342137600,5.0,OtterBox Defender Series Case with Screen Prot...,-0.076213,78,1,1,"[5869, 36203, 387]"
194675,111551,2570,1397088000,5.0,Olympus VN-702PC Voice Recorder,-0.124616,54,140,1,"[18766, 12848, 139]"
543055,10936,6521,1357516800,4.0,Manfrotto 701HDV Pro Fluid Video Mini Head,1.822898,1,153,1,"[26, 10157]"
355447,99991,3273,1365724800,4.0,Bear Motion Luxury Buffalo Hide Vintage Leathe...,-0.197261,550,1,1,"[30, 918, 9748]"


In [17]:
# Both loaders share batch size, worker count and collate function;
# only the training loader shuffles.
shared_loader_kwargs = dict(
    batch_size=512,
    num_workers=8,
    collate_fn=DataFrameDataset.collate_fn,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **shared_loader_kwargs)

In [18]:
# Number of batches per pass over the training loader.
print(len(train_dataloader))

# Pull a single one-sample batch through the collate function to inspect its layout.
peek_loader = DataLoader(
    train_dataset,
    batch_size=1,
    num_workers=0,
    shuffle=True,
    collate_fn=DataFrameDataset.collate_fn,
)
first_batch = next(iter(peek_loader), None)
if first_batch is not None:
    features, labels = first_batch
    print(features)
    print(labels)

2642
{'dense_features': tensor([[-0.4802]]), 'reviewerID': tensor([[117207]], dtype=torch.int32), 'asin': tensor([[4197]], dtype=torch.int32), 'brand': tensor([[178]], dtype=torch.int32), 'categories': tensor([[10]], dtype=torch.int32), 'his_asin_seq': [tensor([42973]), tensor([4086]), tensor([11370])]}
tensor([[1.]])


## Train Model

In [19]:
from model import DNN

# Hidden-layer widths handed to the DNN constructor.
dnn_hidden_units = [128, 64, 32]

# Build the network from the feature configs and place it on the selected device.
model = DNN(feat_configs, hidden_units=dnn_hidden_units).to(device)
print(model)

==> Model Input: dense_size=1, sparse_size=67
DNN(
  (embeddings): ModuleDict(
    (reviewerID): Embedding(153923, 17)
    (asin): Embedding(62384, 15)
    (brand): Embedding(3504, 11)
    (categories): Embedding(800, 9)
    (his_asin_seq): Embedding(61926, 15)
  )
  (tower): Sequential(
    (0): Linear(in_features=68, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [20]:
# Adam over all model parameters with a very small weight decay.
adam_options = {'lr': 0.002, 'weight_decay': 1e-9}
optimizer = Adam(model.parameters(), **adam_options)

# step_size equals the number of batches in one pass over train_dataloader, so the
# learning rate is multiplied by 0.8 once per epoch if the scheduler is stepped
# once per batch - TODO confirm how core.trainer.Trainer steps it.
steps_per_epoch = len(train_dataloader)
lr_scd = lr_scheduler.StepLR(optimizer, step_size=steps_per_epoch, gamma=0.8)

In [21]:
from core.trainer import Trainer

# Training set-up: up to 5 epochs, early stopping after 3 rounds,
# checkpoints written under ./ckpt/.
trainer_options = dict(
    optimizer=optimizer,
    lr_scheduler=lr_scd,
    max_epochs=5,
    early_stopping_rounds=3,
    save_ckpt_path='./ckpt/',
)
trainer = Trainer(model, **trainer_options)

# The original call carried a commented-out init_ckpt_path='./ckpt/' argument,
# presumably for resuming from an existing checkpoint - add it back if needed.
model = trainer.fit(
    train_dataloader,
    eval_dataloader=test_dataloader,
    ret_model='final',
)

INFO:Trainer:[Validation] Epoch: 0/5, Validation Loss: {'loss': 0.6943016727101115}
INFO:Trainer:Learning rate: 0.002
INFO:Trainer:[Training] Epoch: 1/5 iter 0/2642, Training Loss: {'loss': 0.7151992917060852}
INFO:Trainer:[Training] Epoch: 1/5 iter 100/2642, Training Loss: {'loss': 0.46210706382989886}
INFO:Trainer:[Training] Epoch: 1/5 iter 200/2642, Training Loss: {'loss': 0.4188847088813782}
INFO:Trainer:[Training] Epoch: 1/5 iter 300/2642, Training Loss: {'loss': 0.4051204830408096}
INFO:Trainer:[Training] Epoch: 1/5 iter 400/2642, Training Loss: {'loss': 0.3967913106828928}
INFO:Trainer:[Training] Epoch: 1/5 iter 500/2642, Training Loss: {'loss': 0.39070094853639603}
INFO:Trainer:[Training] Epoch: 1/5 iter 600/2642, Training Loss: {'loss': 0.3864480660359065}
INFO:Trainer:[Training] Epoch: 1/5 iter 700/2642, Training Loss: {'loss': 0.38305971090282714}
INFO:Trainer:[Training] Epoch: 1/5 iter 800/2642, Training Loss: {'loss': 0.3801479394733906}
INFO:Trainer:[Training] Epoch: 1/5 

In [22]:
# Read the checkpoint back from ./ckpt and copy the stored model's weights
# into the in-memory model.
ckpt = trainer.load_ckpt('./ckpt')
restored_state = ckpt['model'].state_dict()
model.load_state_dict(restored_state)

INFO:Trainer:Loaded model state_dict from checkpoint.
INFO:Trainer:Loaded model.training from checkpoint.
INFO:Trainer:Loaded model.feat_configs from checkpoint.
INFO:Trainer:Loaded optimizer = Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differ... from checkpoint.
INFO:Trainer:Loaded lr_scheduler = <torch.optim.lr_scheduler.StepLR object at 0x340936aa0> from checkpoint.
INFO:Trainer:Loaded logger = <Logger Trainer (INFO)> from checkpoint.
INFO:Trainer:Loaded ckpt_file_prefix = checkpoint from checkpoint.
INFO:Trainer:Loaded num_epoch = 4 from checkpoint.
INFO:Trainer:Loaded global_steps = 10568 from checkpoint.
INFO:Trainer:Loaded save_ckpt_path = ./ckpt/ from checkpoint.
INFO:Trainer:Loaded metadata_fn = ./ckpt//metadata.json from checkpoint.
INFO:Trainer:Loaded max_epochs = 5 from checkpoint.
INFO:Trainer:Loaded early_stopping_rounds = 3 from checkpoint.
INFO:Trainer:Checkpoint loaded from ./ckpt/checkpoint.010568.ckpt.


<All keys matched successfully>

In [23]:
# Score the whole test set and collect predictions and labels as flat NumPy arrays.
test_preds = []
test_labels = []
model.eval()  # switch dropout / batch-norm layers to inference behaviour

# Run without autograd: otherwise every batch's output keeps its computation graph
# alive inside test_preds until the final concat, wasting memory on a pure
# evaluation pass.
# NOTE(review): batches are passed to the model exactly as the loader yields them;
# nothing here moves them to `device` - confirm the model handles placement on GPU.
with torch.no_grad():
    for features, labels in test_dataloader:
        outputs = model(features)
        # Keep only the first output/label column and park it on the CPU right away.
        test_preds.append(outputs[:, 0].detach().cpu())
        test_labels.append(labels[:, 0].detach().cpu())

test_preds = torch.cat(test_preds, dim=0).numpy()
test_labels = torch.cat(test_labels, dim=0).numpy()

In [24]:
# Sanity check: predictions and labels should have the same length.
pred_shape, label_shape = test_preds.shape, test_labels.shape
print(pred_shape, label_shape)

(336650,) (336650,)


In [25]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the model outputs against the test labels.
auc_score = roc_auc_score(y_true=test_labels, y_score=test_preds)
print("AUC Score:", auc_score)

AUC Score: 0.6776512619734152


# Generate and test the service

In [77]:
# Start the service from a terminal first:
# !python -m core.serve --name dnn --path ckpt/checkpoint.013210.ckpt
# NOTE(review): the checkpoint named above (013210) differs from the one the
# recorded output shows being loaded earlier in this notebook (010568) - confirm
# which one should be served.

import os

# Probe the health endpoint; `ret` is the shell status and 0 means success.
# -f makes curl exit non-zero on HTTP errors (>= 400); without it an error page
#    still yields status 0 and the `ret == 0` check in the next cell proves nothing.
# -sS hides the progress meter but still prints error messages.
ret = os.system('curl -fsS http://localhost:8000/dnn/health')

{"status":"ok"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    15  100    15    0     0   4563      0 --:--:-- --:--:-- --:--:-- 15000


In [78]:
from core.serve import test_predict

# Only query the service when the preceding health check exited with status 0.
service_is_up = (ret == 0)
if service_is_up:
    sample_rows = df_samples.sample(3)
    test_predict(sample_rows, name='dnn')

Data: {'features': {'reviewerID': ['A9IP685WQXGQ2', 'A1LOKG5N99IFQ1', 'A2WKEA06KEAY3H'], 'asin': ['B005DF7BEK', 'B0058BG0UK', 'B004QKM5MG'], 'unixReviewTime': [1351900800, 1363305600, 1397088000], 'overall': [1.0, 5.0, 5.0], 'title': ['Fosmon High Resolution Monitor Cable (Male VGA to Male VGA) - 10 ft', 'Patriot 32GB Axle Series Cap-less USB 2.0 Flash Drive - PSF32GAUSB', 'Apple iPad 2 Leather Smart Cover - Black (MC947LL/A)'], 'price': [6.95, 17.95, 17.41], 'brand': [nan, 'Patriot', 'Apple'], 'categories': ['VGA Cables', 'USB Flash Drives', 'Cases'], 'label': [0, 1, 1], 'his_asin_seq': [['B000V0IE66', 'B002SN8YVE', '9625993428', 'B0026IBI1O'], ['B00132DG46', 'B00081A2KY'], ['B004HYHZJY', 'B008OUKZZI', 'B003C2QS90', 'B000V0ET92', 'B000V0BIWI', 'B008IZQCGK', 'B003U7WCEM', 'B00112BA78', 'B004G7D0EG', 'B0032UXY9Y', 'B001L1H0SC', 'B001202514', 'B000LTN8CC', 'B000GGQ2ZU']]}}
Prediction: {'prediction': [[2.167837619781494], [2.495988607406616], [3.760685920715332]]}
