In [1]:
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.optim import Adam

In [2]:
# Notebook-wide pandas settings: silence the chained-assignment warning
# (pandas' default for this option is 'warn') and widen the display limits.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 100)

In [3]:
# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


## Generate random data

In [4]:
# Size of the synthetic population.
n_users = 1000
n_items = 100

# Dedicated, seeded generator so that Restart & Run All regenerates exactly the
# same synthetic data, without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)

# Value pools the synthetic attributes are drawn from.
# The category vocabulary has its own name because `item_cate` (below) is the
# per-item category column; previously one name was rebound from vocabulary to
# column halfway through the cell.
ITEM_CATEGORIES = ['book', 'movie', 'clothes', '3C', 'shoes', 'makeup', 'groceries']
EDUCATION_LEVELS = ["High School", "Undergraduate", "Graduate", "PhD", "unknown"]
GENDERS = ["Male", "Female"]

# Randomly generated user data: one row per user.
user_id = list(range(n_users))
education = [rng.choice(EDUCATION_LEVELS) for _ in range(n_users)]
gender = [rng.choice(GENDERS) for _ in range(n_users)]
age = [rng.randint(18, 80) for _ in range(n_users)]
income = [rng.randint(1000, 100000) for _ in range(n_users)]
# Variable-length click history: between 1 and 5 categories per user.
clk_cate_seq = [
    [rng.choice(ITEM_CATEGORIES) for _ in range(rng.randint(1, 5))]
    for _ in range(n_users)
]

user_df = pd.DataFrame({
    "user_id": user_id,
    "education": education,
    "gender": gender,
    "age": age,
    "income": income,
    "clk_cate_seq": clk_cate_seq
})

# Randomly generated item data: one row per item.
item_id = list(range(n_items))
item_cate = [rng.choice(ITEM_CATEGORIES) for _ in range(n_items)]

item_df = pd.DataFrame({
    "item_id": item_id,
    "item_cate": item_cate
})

# Randomly generated click data: one 0/1 label for every (user, item) pair,
# ordered user-major. Built as a list of records and converted once.
click_data = [
    {"user_id": user, "item_id": item, "click": rng.choice([0, 1])}
    for user in user_id
    for item in item_id
]

click_df = pd.DataFrame(click_data)

In [5]:
# Build the sample table: attach user attributes to every click row first,
# then attach the item attributes.
df_samples = pd.merge(
    pd.merge(click_df, user_df, on="user_id"),
    item_df,
    on="item_id",
)
df_samples.head()

Unnamed: 0,user_id,item_id,click,education,gender,age,income,clk_cate_seq,item_cate
0,0,0,0,Graduate,Female,50,26481,"[clothes, groceries, makeup, clothes, groceries]",3C
1,1,0,0,PhD,Male,65,20503,[book],3C
2,2,0,0,unknown,Female,59,61327,"[clothes, movie]",3C
3,3,0,1,PhD,Female,26,95470,[book],3C
4,4,0,1,Graduate,Male,43,57121,"[movie, 3C, movie]",3C


## Create PyTorch dataset

In [6]:
# One entry per model input feature. Key order inside each entry is kept as
# declared here because the configs are displayed further down the notebook.
feat_configs = [
    dict(name="education", dtype="category", emb_dim=8),
    dict(name="gender", dtype="category", emb_dim=8),
    dict(name="age", dtype="numerical", norm="std"),
    dict(name="income", dtype="numerical", hash_buckets=10, emb_dim=8),
    dict(name="item_cate", dtype="category", emb_dim=8, hash_buckets=10),
    dict(name="clk_cate_seq", dtype="category", islist=True, emb_dim=8, hash_buckets=5),

    dict(name="item_id", dtype="category", emb_dim=8, hash_buckets=100),
]

In [7]:
from core.sample import traintest_split

# Split the joined samples into train and test frames with test_size=0.2.
# NOTE(review): group_id='user_id' presumably keeps all rows of one user in the
# same split, and shuffle=True is called without a visible seed — confirm both
# against core.sample.traintest_split.
df_train, df_test = traintest_split(df_samples, test_size=0.2, shuffle=True, group_id='user_id')
# Sanity check: row counts of the two splits.
print(len(df_train), len(df_test))

80000 20000


In [8]:
from core.dataset import DataFrameDataset, feature_transform

# Transform the training split. With is_train=True the helper also updates
# feat_configs in place (see its own log message below), so this cell must run
# before the test-split transform.
# NOTE(review): df_train is overwritten with its transformed version, so
# re-running this cell alone feeds already-transformed data back into the
# helper — re-run from the split cell instead.
df_train = feature_transform(df_train, feat_configs, is_train=True)
# Display the updated configs.
feat_configs

==> Feature transforming (is_train=True), note that feat_configs will be updated when is_train=True...
Processing feature education...
Processing feature gender...
Processing feature age...
Processing feature income...
Processing feature item_cate...
Processing feature clk_cate_seq...
Processing feature item_id...


[{'name': 'education',
  'dtype': 'category',
  'emb_dim': 8,
  'type': 'sparse',
  'vocab': {'other': {'idx': 0, 'freq_cnt': 18000},
   'high school': {'idx': 1, 'freq_cnt': 16100},
   'undergraduate': {'idx': 2, 'freq_cnt': 15800},
   'graduate': {'idx': 3, 'freq_cnt': 15600},
   'phd': {'idx': 4, 'freq_cnt': 14500}}},
 {'name': 'gender',
  'dtype': 'category',
  'emb_dim': 8,
  'type': 'sparse',
  'vocab': {'male': {'idx': 0, 'freq_cnt': 41500},
   'female': {'idx': 1, 'freq_cnt': 38500},
   'other': {'idx': 2, 'freq_cnt': 0}}},
 {'name': 'age',
  'dtype': 'numerical',
  'norm': 'std',
  'type': 'dense',
  'mean': 49.87125,
  'std': 18.02668121016653,
  'min': 18,
  'max': 80},
 {'name': 'income',
  'dtype': 'numerical',
  'hash_buckets': 10,
  'emb_dim': 8,
  'type': 'sparse',
  'mean': 51158.99,
  'std': 29512.678079928733,
  'min': 1043,
  'max': 99902,
  'vocab': [nan,
   -inf,
   1043.0,
   12380.0,
   21450.0,
   32786.0,
   45138.0,
   57558.0,
   69077.0,
   79825.0,
   9111

In [9]:
# Preview the first five transformed training rows.
df_train.head(n=5)

Unnamed: 0,user_id,item_id,click,education,gender,age,income,clk_cate_seq,item_cate
9219,219,0,0,1,1,0.561875,9,"[-100, -100, 1, 3, 2]",5
4377,377,55,1,2,1,0.506402,8,"[-100, -100, -100, 2, 0]",2
38323,323,25,1,3,0,-1.102324,4,"[-100, 3, 0, 2, 0]",0
78984,984,35,1,2,1,-0.824958,10,"[-100, -100, 0, 3, 1]",0
47801,801,0,0,1,1,0.284509,8,"[-100, -100, -100, -100, 0]",0


In [10]:
# Transform the test split with is_train=False.
# NOTE(review): presumably this reuses the statistics/vocabularies stored in
# feat_configs by the training pass instead of refitting — confirm in
# core.dataset.feature_transform. Like the training cell, it overwrites its
# own input, so it is not safe to re-run on its own.
df_test = feature_transform(df_test, feat_configs, is_train=False)

==> Feature transforming (is_train=False) ...
Processing feature education...
Processing feature gender...
Processing feature age...
Processing feature income...
Processing feature item_cate...
Processing feature clk_cate_seq...
Processing feature item_id...


In [11]:
# Column groups handed to DataFrameDataset in the next cell.
sparse_cols = [
    "item_id",
    "education",
    "gender",
    "income",
    "item_cate",
]
seq_sparse_cols = ["clk_cate_seq"]
dense_cols = ["age"]
target_cols = ["click"]

In [12]:
def _as_device_dataset(frame):
    """Wrap a transformed frame in a DataFrameDataset and call `.to(device)` on it.

    Both splits share the same column groups and the same padding value
    (-100), so the constructor call lives in one place.
    """
    dataset = DataFrameDataset(
        frame,
        sparse_cols,
        seq_sparse_cols,
        dense_cols,
        seq_dense_cols=None,
        target_cols=target_cols,
        padding_value=-100,
    )
    return dataset.to(device)


train_dataset = _as_device_dataset(df_train)
test_dataset = _as_device_dataset(df_test)

In [13]:
# Both datasets have already been passed through `.to(device)`.
# NOTE(review): if that places their tensors on a CUDA device, worker processes
# cannot safely serve them (PyTorch advises against returning CUDA tensors from
# multi-process loading), so load in the main process on GPU. CPU runs keep the
# original four workers.
num_loader_workers = 0 if device.type == 'cuda' else 4

train_dataloader = DataLoader(train_dataset, batch_size=128, num_workers=num_loader_workers, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=num_loader_workers, shuffle=False)

In [14]:
# Number of mini-batches in the training loader.
print(len(train_dataloader))

# Draw one shuffled batch of two rows to inspect the feature dict and labels.
peek_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
for features, labels in peek_loader:
    print(features, labels, sep="\n")
    break

625
{'dense_features': tensor([[ 0.8947],
        [-1.7680]]), 'item_id': tensor([[ 8],
        [14]], dtype=torch.int32), 'education': tensor([[2],
        [0]], dtype=torch.int32), 'gender': tensor([[1],
        [1]], dtype=torch.int32), 'income': tensor([[4],
        [9]], dtype=torch.int32), 'item_cate': tensor([[0],
        [4]], dtype=torch.int32), 'clk_cate_seq': tensor([[-100,    2,    2,    1,    1],
        [   3,    2,    0,    2,    0]], dtype=torch.int32)}
tensor([[1.],
        [1.]])


## Train Model

In [15]:
from model import DNN

# Widths passed to DNN as `hidden_units`.
dnn_hidden_units = [128, 64, 32]

# Build the network from the fitted feature configs and move it to `device`.
model = DNN(feat_configs, hidden_units=dnn_hidden_units).to(device)
print(model)

==> Model Input: dense_size=1, sparse_size=48
DNN(
  (embeddings): ModuleDict(
    (education): Embedding(5, 8)
    (gender): Embedding(3, 8)
    (income): Embedding(13, 8)
    (item_cate): Embedding(7, 8)
    (clk_cate_seq): Embedding(5, 8)
    (item_id): Embedding(65, 8)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=49, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
  )
  (logits): Linear(in_features=32, out_features=1, bias=True)
)


In [18]:
# Adam with a very small weight decay; the learning rate is multiplied by 0.95
# every time the scheduler is stepped.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=1e-9,
)
optimizer_scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=1,
    gamma=0.95,
)

In [19]:
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('DNN')
logger.setLevel(logging.INFO)

def train_model(model: nn.Module, 
    train_dataloader: torch.utils.data.DataLoader, 
    eval_dataloader: torch.utils.data.DataLoader,
    optimizer: optim.Optimizer,
    num_epochs: int,
    early_stopping_rounds: int = None,
    best_model_path=None,
    final_model_path=None,
    ret_model='final',
    scheduler=None,
):
    """Train `model` with BCE-with-logits loss and evaluate it after every epoch.

    Args:
        model: Module called as ``model(features)``; its output is fed to
            ``nn.BCEWithLogitsLoss`` together with the batch labels.
        train_dataloader: Sized iterable of ``(features, labels)`` training batches.
        eval_dataloader: Sized iterable of ``(features, labels)`` evaluation batches.
        optimizer: Optimizer that updates the model parameters.
        num_epochs: Maximum number of epochs to run.
        early_stopping_rounds: If set, training stops as soon as an epoch's
            evaluation loss exceeds the mean evaluation loss of the previous
            ``early_stopping_rounds`` epochs.
        best_model_path: If set, the state dict is saved here whenever the
            evaluation loss improves on the best value seen so far.
        final_model_path: If set, the state dict is saved here after training.
        ret_model: ``'best'`` reloads the checkpoint from ``best_model_path``
            before returning (only if one was written during this call); any
            other value returns the model as it is after the last epoch.
        scheduler: Optional learning-rate scheduler, stepped once per epoch.
            When omitted, a module-level ``optimizer_scheduler`` is used if one
            exists (legacy behaviour); otherwise no scheduler is stepped.

    Returns:
        The trained model (the same object that was passed in).
    """
    if scheduler is None:
        # Backward compatibility: this function used to step the notebook
        # global `optimizer_scheduler` directly and raised NameError without it.
        scheduler = globals().get('optimizer_scheduler')

    # Build the loss module once instead of once per batch.
    criterion = nn.BCEWithLogitsLoss(reduction='mean')

    eval_losses = []
    best_eval_loss = None

    for epoch in range(1, num_epochs+1):
        model.train()
        train_loss = {'total': 0., }
    
        # Training 
        for k, (features, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()   # zero the parameter gradients
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()        # compute gradients
            optimizer.step()       # adjust parameters based on the calculated gradients 
            batch_loss = loss.item()
            train_loss['total'] += batch_loss  # track the loss value 
            
            if k % 100 == 0:
                logger.info(f'[Training] Epoch: {epoch}/{num_epochs} iter {k}/{len(train_dataloader)}, Training Loss: {batch_loss}')
                
        if scheduler is not None:
            scheduler.step()
                
        # Turn the summed batch losses into a per-batch average.
        for _type, _value in train_loss.items():
            train_loss[_type] = _value / len(train_dataloader)

        # Validation
        with torch.no_grad(): 
            model.eval()
            eval_loss = {'total': 0.}

            for features, labels in eval_dataloader:                
                outputs = model(features)
                loss = criterion(outputs, labels)
                eval_loss['total'] += loss.item()  # track the loss value 
                
            for _type, _value in eval_loss.items():
                eval_loss[_type] = _value / len(eval_dataloader)
                

        logger.info(f'[Validation] Epoch: {epoch}/{num_epochs}, Training Loss: {train_loss}, Validation Loss: {eval_loss}')

        if early_stopping_rounds:
            if len(eval_losses) >= early_stopping_rounds:
                eval_loss_his_avg = np.mean([v['total'] for v in eval_losses[-early_stopping_rounds:]])
                if eval_loss['total'] > eval_loss_his_avg:
                    # The stopping epoch is worse than the recent average, so it
                    # is neither recorded nor considered for the best checkpoint.
                    logger.info(f'Early stopping at epoch {epoch}...')
                    break
        eval_losses.append(eval_loss)

        if best_model_path:
            if best_eval_loss is None or eval_loss['total'] < best_eval_loss:
                best_eval_loss = eval_loss['total']
                torch.save(model.state_dict(), best_model_path)

    if final_model_path:
        torch.save(model.state_dict(), final_model_path)
        
    # Reload only a checkpoint written by this call; otherwise a missing file
    # would raise, or a stale file from an earlier run would be loaded silently.
    if ret_model == 'best' and best_model_path and best_eval_loss is not None:
        model.load_state_dict(torch.load(best_model_path))

    return model

# Run training for three epochs, evaluating on the test loader after each one.
# The call is left as the cell's final expression so the returned model is displayed.
train_model(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    optimizer=optimizer,
    num_epochs=3,
    early_stopping_rounds=10,
    best_model_path='bestmodel.pth',
    final_model_path='finalmodel.pth',
    ret_model='final',
)

INFO:DNN:[Training] Epoch: 1/3 iter 0/625, Training Loss: 0.6918329000473022
INFO:DNN:[Training] Epoch: 1/3 iter 100/625, Training Loss: 0.6912421584129333
INFO:DNN:[Training] Epoch: 1/3 iter 200/625, Training Loss: 0.6936115026473999
INFO:DNN:[Training] Epoch: 1/3 iter 300/625, Training Loss: 0.6942870020866394
INFO:DNN:[Training] Epoch: 1/3 iter 400/625, Training Loss: 0.695564866065979
INFO:DNN:[Training] Epoch: 1/3 iter 500/625, Training Loss: 0.6914029121398926
INFO:DNN:[Training] Epoch: 1/3 iter 600/625, Training Loss: 0.6914725303649902
INFO:DNN:[Validatoin] Epoch: 1/3, Training Loss: {'total': 0.6934184716224671}, Validation Loss: {'total': 0.6931441327568831}
INFO:DNN:[Training] Epoch: 2/3 iter 0/625, Training Loss: 0.6925989389419556
INFO:DNN:[Training] Epoch: 2/3 iter 100/625, Training Loss: 0.6943463683128357
INFO:DNN:[Training] Epoch: 2/3 iter 200/625, Training Loss: 0.6943218111991882
INFO:DNN:[Training] Epoch: 2/3 iter 300/625, Training Loss: 0.6908657550811768
INFO:DNN:

DNN(
  (embeddings): ModuleDict(
    (education): Embedding(5, 8)
    (gender): Embedding(3, 8)
    (income): Embedding(13, 8)
    (item_cate): Embedding(7, 8)
    (clk_cate_seq): Embedding(5, 8)
    (item_id): Embedding(65, 8)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=49, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
  )
  (logits): Linear(in_features=32, out_features=1, bias=True)
)