In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.optim import Adam

In [20]:
# Notebook-wide configuration: pandas display options and RNG seeding.
pd.options.mode.chained_assignment = None  # default='warn'
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100

# Seed every RNG this notebook can draw from (Python, NumPy, PyTorch) so that
# shuffling, weight initialisation and dropout are repeatable across
# Restart & Run All. The trailing ';' hides the Generator repr that
# torch.manual_seed returns.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED);

In [21]:
# Pick the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


## Load data

In [22]:
import joblib

# Load the pre-built sample table from disk.
# NOTE: joblib.load unpickles the file, so only point it at a trusted artifact.
samples_path = 'data/amazon_data.pkl'
df_samples = joblib.load(samples_path)

In [23]:
# Number of distinct reviewer IDs (missing values, if any, count as one value).
df_samples['reviewerID'].nunique(dropna=False)

192403

In [24]:
# Number of distinct item IDs (missing values, if any, count as one value).
df_samples['asin'].nunique(dropna=False)

63001

In [25]:
# Number of distinct brands (missing values, if any, count as one value).
df_samples['brand'].nunique(dropna=False)

3526

In [26]:
# Number of distinct category values (missing values, if any, count as one value).
df_samples['categories'].nunique(dropna=False)

801

## Create PyTorch dataset

In [27]:
# Per-feature configuration consumed by the dataset and the model.
# Each entry names one input column and describes how it is treated.
feat_configs = [
    # User / item ID features
    dict(name="reviewerID", dtype="category", emb_dim=12, min_freq=3, hash_buckets=1000000),
    dict(name="asin", dtype="category", emb_dim=12, min_freq=3, hash_buckets=1000000),

    # Item side information
    dict(name="price", dtype="numerical", norm="std"),
    dict(name="brand", dtype="category", min_freq=3, emb_dim=12),
    dict(name="categories", dtype="category", min_freq=3, emb_dim=12),

    # List-valued feature (islist=True)
    dict(name="his_asin_seq", dtype="category", islist=True, min_freq=3, emb_dim=12, hash_buckets=1000000),
]

# Column(s) used as the prediction target.
target_cols = ['label']

In [28]:
from core.sample import traintest_split

# Shuffled 80/20 split, passing reviewerID as the group id.
df_train, df_test = traintest_split(
    df_samples,
    test_size=0.2,
    shuffle=True,
    group_id='reviewerID',
)
n_train, n_test = len(df_train), len(df_test)
print(n_train, n_test)

1352538 336650


In [29]:
# NOTE(review): disabled alternative that runs the feature transform as an
# explicit step. Presumably superseded by DataFrameDataset(..., is_raw=True)
# in the next cell — confirm, then delete this cell.
# from core.dataset import FeatureTransformer

# transformer = FeatureTransformer(feat_configs)

# df_train = transformer.transform(df_train, is_train=True, n_jobs=4)
# df_test = transformer.transform(df_test, is_train=False, n_jobs=4)

In [None]:
from core.dataset import DataFrameDataset

# Wrap both splits as datasets. Both are passed as raw frames (is_raw=True);
# only the training split is built with is_train=True.
train_dataset = DataFrameDataset(
    df_train,
    feat_configs,
    target_cols,
    is_raw=True,
    is_train=True,
    n_jobs=1,
    verbose=True,
)
test_dataset = DataFrameDataset(
    df_test,
    feat_configs,
    target_cols,
    is_raw=True,
    is_train=False,
    n_jobs=1,
)

In [None]:
# Show the first five training rows.
df_train.head(n=5)

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title,price,brand,categories,label,his_asin_seq
314370,720607,942600,1375660800,4.0,B+W 58mm Kaesemann Circular Polarizer with Mul...,0.118381,152,196,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
538816,885984,597768,1342137600,5.0,OtterBox Defender Series Case with Screen Prot...,-0.075831,76,0,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
194675,949944,277383,1397088000,5.0,Olympus VN-702PC Voice Recorder,-0.124243,53,139,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
543055,63023,273198,1357516800,4.0,Manfrotto 701HDV Pro Fluid Video Mini Head,1.823619,0,152,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
355447,604816,70388,1365724800,4.0,Bear Motion Luxury Buffalo Hide Vintage Leathe...,-0.196901,545,0,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."


In [None]:
# Batch both datasets; only the training loader reshuffles its samples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=512,
    num_workers=8,
    shuffle=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=512,
    num_workers=8,
    shuffle=False,
)

In [None]:
# Number of training batches per epoch.
print(len(train_dataloader))

# Pull a single random sample through a throw-away loader to inspect the
# (features, labels) structure the model will receive.
peek_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
for features, labels in peek_loader:
    print(features, labels, sep='\n')
    break

2642
{'dense_features': tensor([[-0.5916]]), 'reviewerID': tensor([[146253]], dtype=torch.int32), 'asin': tensor([[826536]], dtype=torch.int32), 'brand': tensor([[633]], dtype=torch.int32), 'categories': tensor([[14]], dtype=torch.int32), 'his_asin_seq': tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,

## Train Model

In [None]:
from model import DNN

# Widths of the hidden layers handed to the DNN.
dnn_hidden_units = [128, 64, 32]

# Build the network from the feature configs and place it on the selected device.
model = DNN(feat_configs, hidden_units=dnn_hidden_units).to(device)
print(model)

==> Model Input: dense_size=1, sparse_size=60
DNN(
  (embeddings): ModuleDict(
    (reviewerID): Embedding(1000000, 12)
    (asin): Embedding(1000000, 12)
    (brand): Embedding(3419, 12)
    (categories): Embedding(800, 12)
    (his_asin_seq): Embedding(1000000, 12)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=61, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
  )
  (logits): Linear(in_features=32, out_features=1, bias=True)
)


In [None]:
# Adam with a very small weight decay.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-9)

# step_size is the number of batches in one epoch, so the LR is multiplied by
# 0.8 once per epoch provided the trainer steps the scheduler once per batch
# — TODO confirm against core.trainer.Trainer.
steps_per_epoch = len(train_dataloader)
lr_scd = lr_scheduler.StepLR(optimizer, step_size=steps_per_epoch, gamma=0.8)

In [None]:
from core.trainer import Trainer

# Settings passed to the trainer alongside the model.
trainer_options = dict(
    optimizer=optimizer,
    lr_scheduler=lr_scd,
    max_epochs=5,
    early_stopping_rounds=3,
    save_ckpt_path='./ckpt/',
)
trainer = Trainer(model, **trainer_options)

# Train with the test loader as the evaluation set and keep the returned model.
# The original call left `init_ckpt_path='./ckpt/'` commented out as an
# optional extra argument.
model = trainer.fit(
    train_dataloader,
    eval_dataloader=test_dataloader,
    ret_model='final',
)

INFO:DNN:[Validation] Epoch: 0/5, Validation Loss: {'loss': 0.62591848538277}
INFO:DNN:Learning rate: 0.001
INFO:DNN:[Training] Epoch: 1/5 iter 0/2642, Training Loss: {'loss': 0.6200003027915955}
INFO:DNN:[Training] Epoch: 1/5 iter 100/2642, Training Loss: {'loss': 0.38717460542917254}
INFO:DNN:[Training] Epoch: 1/5 iter 200/2642, Training Loss: {'loss': 0.3737160398066044}
INFO:DNN:[Training] Epoch: 1/5 iter 300/2642, Training Loss: {'loss': 0.3683753237128258}
INFO:DNN:[Training] Epoch: 1/5 iter 400/2642, Training Loss: {'loss': 0.3657301820069552}
INFO:DNN:[Training] Epoch: 1/5 iter 500/2642, Training Loss: {'loss': 0.3627664979696274}
INFO:DNN:[Training] Epoch: 1/5 iter 600/2642, Training Loss: {'loss': 0.36207988058527313}
INFO:DNN:[Training] Epoch: 1/5 iter 700/2642, Training Loss: {'loss': 0.3600202789902687}
INFO:DNN:[Training] Epoch: 1/5 iter 800/2642, Training Loss: {'loss': 0.3585778087377548}
INFO:DNN:[Training] Epoch: 1/5 iter 900/2642, Training Loss: {'loss': 0.3577653518

In [None]:
# Load the saved checkpoint and copy its model weights into the current model.
ckpt = trainer.load_ckpt('checkpoint')
ckpt_state = ckpt['model'].state_dict()
model.load_state_dict(ckpt_state)

INFO:DNN:Loaded model state_dict from checkpoint.
INFO:DNN:Loaded model.training from checkpoint.
INFO:DNN:Loaded model.feat_configs from checkpoint.
INFO:DNN:Loaded optimizer = Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differ... from checkpoint.
INFO:DNN:Loaded lr_scheduler = <torch.optim.lr_scheduler.StepLR object at 0x5d65357e0> from checkpoint.
INFO:DNN:Loaded logger = <Logger DNN (INFO)> from checkpoint.
INFO:DNN:Loaded default_ckpt_prefix = checkpoint from checkpoint.
INFO:DNN:Loaded num_epoch = 4 from checkpoint.
INFO:DNN:Loaded global_steps = 10568 from checkpoint.
INFO:DNN:Loaded save_ckpt_path = ./ckpt/ from checkpoint.
INFO:DNN:Loaded max_epochs = 5 from checkpoint.
INFO:DNN:Loaded early_stopping_rounds = 3 from checkpoint.
INFO:DNN:Checkpoint loaded from ./ckpt//checkpoint.010568.ckpt.


<All keys matched successfully>

In [None]:
# Score the whole test set and collect predictions / labels as 1-D NumPy arrays.
model.eval()  # evaluation mode (e.g. disables the Dropout layers)

# Run on whatever device the model's parameters currently live on.
model_device = next(model.parameters()).device

batch_preds = []
batch_labels = []
# no_grad: without it every batch output keeps its autograd graph alive until
# the final concatenation, wasting memory across the whole test set.
with torch.no_grad():
    for features, labels in test_dataloader:
        # Batches arrive as a dict of tensors; put them next to the model.
        # On a CPU-only run this is a no-op.
        features = {name: tensor.to(model_device) for name, tensor in features.items()}
        outputs = model(features)
        # Keep only the first output / label column and accumulate on CPU.
        batch_preds.append(outputs[:, 0].cpu())
        batch_labels.append(labels[:, 0].cpu())

test_preds = torch.cat(batch_preds, dim=0).numpy()
test_labels = torch.cat(batch_labels, dim=0).numpy()

In [None]:
# Sanity check: predictions and labels should have the same length.
print(f"{test_preds.shape} {test_labels.shape}")

(336650,) (336650,)


In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the model scores against the test labels.
auc_score = roc_auc_score(y_true=test_labels, y_score=test_preds)
print("AUC Score:", auc_score)

AUC Score: 0.6675204951401648
