In [1]:
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.optim import Adam

In [2]:
pd.options.mode.chained_assignment = None  # default='warn'
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## Load data

In [4]:
import joblib

df_samples = joblib.load('data/amazon_data.pkl')

In [5]:
len( df_samples['reviewerID'].unique() )

192403

In [6]:
len( df_samples['asin'].unique() )

63001

In [7]:
len( df_samples['brand'].unique() )

3526

In [8]:
len( df_samples['categories'].unique() )

801

## Create pytorch dataset

In [9]:
feat_configs = [
    {"name": "reviewerID", "dtype": "category", "emb_dim": 12, "hash_buckets": 10000},
    {"name": "asin", "dtype": "category", "emb_dim": 12, "hash_buckets": 10000},
    
    {"name": "price", "dtype": "numerical", "norm": "std"},
    {"name": "brand", "dtype": "category", "emb_dim": 12},
    {"name": "categories", "dtype": "category", "emb_dim": 12},

    {"name": "his_asin_seq", "dtype": "category", "islist": True, "emb_dim": 12, "hash_buckets": 10000},
]

In [10]:
from sample import traintest_split

df_train, df_test = traintest_split(df_samples, test_size=0.2, shuffle=True, group_id='reviewerID')
print(len(df_train), len(df_test))

1352538 336650


In [14]:
from dataset import DataFrameDataset, feature_transform

df_train = feature_transform(df_train, feat_configs, is_train=True)
feat_configs

==> Feature transforming (is_train=True), note that feat_configs will be updated when is_train=True...
Processing feature reviewerID...
Processing feature asin...
Processing feature price...
Processing feature brand...
Processing feature categories...
Processing feature his_asin_seq...


[{'name': 'reviewerID',
  'dtype': 'category',
  'emb_dim': 12,
  'hash_buckets': 10000,
  'type': 'sparse',
  'vocab': {5640: {'idx': 0, 'freq_cnt': 663},
   728: {'idx': 1, 'freq_cnt': 591},
   1886: {'idx': 2, 'freq_cnt': 587},
   9605: {'idx': 3, 'freq_cnt': 556},
   1704: {'idx': 4, 'freq_cnt': 517},
   9290: {'idx': 5, 'freq_cnt': 500},
   189: {'idx': 6, 'freq_cnt': 496},
   2201: {'idx': 7, 'freq_cnt': 487},
   1093: {'idx': 8, 'freq_cnt': 440},
   7312: {'idx': 9, 'freq_cnt': 432},
   5126: {'idx': 10, 'freq_cnt': 429},
   6543: {'idx': 11, 'freq_cnt': 410},
   8530: {'idx': 12, 'freq_cnt': 402},
   9199: {'idx': 13, 'freq_cnt': 399},
   5519: {'idx': 14, 'freq_cnt': 383},
   3564: {'idx': 15, 'freq_cnt': 377},
   4237: {'idx': 16, 'freq_cnt': 373},
   4496: {'idx': 17, 'freq_cnt': 370},
   1124: {'idx': 18, 'freq_cnt': 369},
   3330: {'idx': 19, 'freq_cnt': 369},
   9667: {'idx': 20, 'freq_cnt': 366},
   1187: {'idx': 21, 'freq_cnt': 366},
   6534: {'idx': 22, 'freq_cnt': 365

In [15]:
df_train.head()

Unnamed: 0,reviewerID,asin,unixReviewTime,overall,title,price,brand,categories,label,his_asin_seq
314370,6403,889,1375660800,4.0,B+W 58mm Kaesemann Circular Polarizer with Mul...,0.118381,150,196,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
538816,5691,126,1342137600,5.0,OtterBox Defender Series Case with Screen Prot...,-0.075831,75,0,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
194675,6242,1016,1397088000,5.0,Olympus VN-702PC Voice Recorder,-0.124243,52,139,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
543055,2353,3007,1357516800,4.0,Manfrotto 701HDV Pro Fluid Video Mini Head,1.823619,0,152,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."
355447,5972,1266,1365724800,4.0,Bear Motion Luxury Buffalo Hide Vintage Leathe...,-0.196901,543,0,1,"[-100, -100, -100, -100, -100, -100, -100, -10..."


In [16]:
df_test = feature_transform(df_test, feat_configs, is_train=False)

==> Feature transforming (is_train=False) ...
Processing feature reviewerID...
Processing feature asin...
Processing feature price...
Processing feature brand...
Processing feature categories...
Processing feature his_asin_seq...


In [17]:
sparse_cols = [f['name'] for f in feat_configs if f['type'] == 'sparse' and not f.get('islist')]
seq_sparse_cols = [f['name'] for f in feat_configs if f['type'] == 'sparse' and f.get('islist')]
dense_cols = ['price',]
target_cols = ['label', ]

In [18]:
train_dataset = DataFrameDataset(
    df_train, 
    sparse_cols, 
    seq_sparse_cols, 
    dense_cols, 
    seq_dense_cols=None, 
    target_cols=target_cols, 
    padding_value=-100
).to(device)

test_dataset = DataFrameDataset(
    df_test, 
    sparse_cols, 
    seq_sparse_cols, 
    dense_cols, 
    seq_dense_cols=None, 
    target_cols=target_cols, 
    padding_value=-100
).to(device)

In [19]:
train_dataloader = DataLoader(train_dataset, batch_size=512, num_workers=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=8, shuffle=False)

In [20]:
print( len(train_dataloader) )
for features, labels in DataLoader(train_dataset,batch_size=1,shuffle=True):
    print(features)
    print(labels)
    break

2642
{'dense_features': tensor([[1.3307]]), 'reviewerID': tensor([[5624]], dtype=torch.int32), 'asin': tensor([[930]], dtype=torch.int32), 'brand': tensor([[2450]], dtype=torch.int32), 'categories': tensor([[7]], dtype=torch.int32), 'his_asin_seq': tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -

## Train Model

In [21]:
from model import DNN

dnn_hidden_units = [128,64,32]
model = DNN(feat_configs, hidden_units=dnn_hidden_units)
model = model.to(device)
print(model)

==> Model Input: dense_size=1, sparse_size=60
DNN(
  (embeddings): ModuleDict(
    (reviewerID): Embedding(10001, 12)
    (asin): Embedding(9985, 12)
    (brand): Embedding(3417, 12)
    (categories): Embedding(800, 12)
    (his_asin_seq): Embedding(9982, 12)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=61, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
  )
  (logits): Linear(in_features=32, out_features=1, bias=True)
)


In [22]:
optimizer = Adam(model.parameters(),  lr = 0.001, weight_decay = 1e-9)
optimizer_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)

In [23]:
from train import train_loop

train_loop(model, 
    train_dataloader,
    eval_dataloader=test_dataloader, 
    optimizer=optimizer,
    optimizer_scheduler=optimizer_scheduler,
    num_epochs = 5,
    early_stopping_rounds = 10,
    best_model_path = 'bestmodel.pth',
    final_model_path = 'finalmodel.pth',
    ret_model='final'
)

INFO:DNN:[Training] Epoch: 1/5 iter 0/2642, Training Loss: {'total': 0.6557846069335938}
INFO:DNN:[Training] Epoch: 1/5 iter 100/2642, Training Loss: {'total': 0.40360928684473035}
INFO:DNN:[Training] Epoch: 1/5 iter 200/2642, Training Loss: {'total': 0.38086355835199354}
INFO:DNN:[Training] Epoch: 1/5 iter 300/2642, Training Loss: {'total': 0.37184048374493917}
INFO:DNN:[Training] Epoch: 1/5 iter 400/2642, Training Loss: {'total': 0.3670854998379946}
INFO:DNN:[Training] Epoch: 1/5 iter 500/2642, Training Loss: {'total': 0.3646775381565094}
INFO:DNN:[Training] Epoch: 1/5 iter 600/2642, Training Loss: {'total': 0.36244674667716026}
INFO:DNN:[Training] Epoch: 1/5 iter 700/2642, Training Loss: {'total': 0.3603938809888704}
INFO:DNN:[Training] Epoch: 1/5 iter 800/2642, Training Loss: {'total': 0.3593648935854435}
INFO:DNN:[Training] Epoch: 1/5 iter 900/2642, Training Loss: {'total': 0.3585411932402187}
INFO:DNN:[Training] Epoch: 1/5 iter 1000/2642, Training Loss: {'total': 0.35776172640919

DNN(
  (embeddings): ModuleDict(
    (reviewerID): Embedding(10001, 12)
    (asin): Embedding(9985, 12)
    (brand): Embedding(3417, 12)
    (categories): Embedding(800, 12)
    (his_asin_seq): Embedding(9982, 12)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=61, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
  )
  (logits): Linear(in_features=32, out_features=1, bias=True)
)

In [24]:
test_preds = []
test_labels = []
model.eval()

for features, labels in test_dataloader:
    outputs = model(features)
    test_preds.append(outputs[:,0])
    test_labels.append(labels[:,0])
test_preds = torch.concat(test_preds, dim=0).detach().cpu().numpy()
test_labels = torch.concat(test_labels, dim=0).detach().cpu().numpy()

In [25]:
print(test_preds.shape, test_labels.shape)

(336650,) (336650,)


In [26]:
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(test_labels, test_preds)
print("AUC Score:", auc_score)

AUC Score: 0.6566158507325903
