In [3]:
#import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
%matplotlib inline
from statsmodels.stats.proportion import proportions_ztest

from sklearn.metrics import auc
import xgboost as xgb


from sklearn.model_selection import train_test_split

import os


In [4]:
df = pd.read_csv('../../dataset/criteo-uplift-v2.1/criteo-uplift-v2.1.csv')

## CTR modeling

In [2]:
# !pip install deepctr_torch
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-0.10.0-py3-none-any.whl (529 kB)
[K     |████████████████████████████████| 529 kB 781 kB/s eta 0:00:01
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.10.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


### modeling defined

In [3]:
import torch.nn as nn
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

import torchmetrics
from torchmetrics.functional import auc
from torchmetrics.classification import BinaryF1Score
from torchmetrics.classification import BinaryPrecision
from torchmetrics import Recall

In [5]:
class SimpleCustomBatch:
    def __init__(self, data):
        transposed_data = list(zip(*data))
        self.inp = torch.stack(transposed_data[0], 0)
        self.tgt = torch.stack(transposed_data[1], 0)

    # custom memory pinning method on custom type
    def pin_memory(self):
        self.inp = self.inp.pin_memory()
        self.tgt = self.tgt.pin_memory()
        return self

def collate_wrapper(batch):
    return SimpleCustomBatch(batch)

In [6]:
X = df[['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'treatment']]
Y = df[['conversion', 'visit', 'exposure']]

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.1, random_state=42)


In [7]:
train_data = pd.concat([X, Y], axis = 1)

In [12]:
train_data.columns

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'treatment', 'conversion', 'visit', 'exposure'],
      dtype='object')

In [13]:
train_data['type'] = train_data['conversion'] + train_data['visit']

In [17]:
train_data0 = train_data[train_data['type']==0].sample(frac = 0.05)
train_data1 = train_data[train_data['type']==1].sample(frac = 1)
train_data2 = train_data[train_data['type']==2].sample(frac = 1)

train_data = pd.concat([train_data0, train_data1, train_data2], axis = 0)

In [20]:
train_x, train_y = train_data[['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'treatment']], train_data[['conversion', 'visit', 'exposure']]

In [21]:
train_x, test_x, train_y, test_y = torch.tensor(np.array(train_x)).to(torch.float), torch.tensor(np.array(test_x)).to(torch.float), torch.tensor(np.array(train_y)).to(torch.float), torch.tensor(np.array(test_y)).to(torch.float)
train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)

train_loader = DataLoader(train_dataset, batch_size=512, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=512, pin_memory=True)

In [29]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output1_size, output2_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.leakyrelu = nn.LeakyReLU()
        self.fc_mid = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size+output2_size, output1_size)
#         self.fc2 = nn.Linear(output2_size, output1_size)
        self.fc3 = nn.Linear(hidden_size, output2_size)
    
    def forward(self, x):
        out = self.leakyrelu(self.fc1(x))
        out = self.leakyrelu(self.fc_mid(out))
        out2 = self.fc3(out)
        out1 = self.fc2(self.leakyrelu(torch.cat((out, out2), 1)))
#         out1 = self.fc2(out2)
        return out1, out2

In [30]:
model = Net(train_x.shape[1], 2*train_x.shape[1], 2, 2)
loss_fct = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr = 1e-4)

log_interval = 1000
epochs = 10

In [31]:
model.train()
metric = torchmetrics.Accuracy()
F1 = BinaryF1Score()
Precision = BinaryPrecision()
recall = Recall(average='macro', num_classes=2)
for _ in range(epochs):
    for batch_ndx, sample in enumerate(train_loader):
        pred_1, pred_2 = model(sample[0])


        loss1 = loss_fct(pred_1, sample[1][:,0].to(torch.long))
        loss2 = loss_fct(pred_2, sample[1][:,1].to(torch.long))


        loss = loss1 + loss2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_ndx % log_interval == 0:
            loss = loss.item()
    #         logger.info(f"eval loss: {loss:>7f}   eval accuracy: {accuracy:>7f}  curent step: [{current_step}]")

            acc1 = metric(pred_1, sample[1][:,0].to(torch.long))
            acc2 = metric(pred_2, sample[1][:,1].to(torch.long))

            prediction1 = torch.max(F.softmax(pred_1),1)[1]
            prediction2 = torch.max(F.softmax(pred_2),1)[1]  

            auc1 = auc(prediction1, sample[1][:,0], reorder=True)
            auc2 = auc(prediction2, sample[1][:,1], reorder=True)

            F1_1 = F1(prediction1, sample[1][:,0])
            F1_2 = F1(prediction2, sample[1][:,1])

            Precision1 = Precision(prediction1, sample[1][:,0])
            Precision2 = Precision(prediction2, sample[1][:,1])

            recall1 = recall(prediction1, sample[1][:,0].to(torch.int))
            recall2 = recall(prediction2, sample[1][:,1].to(torch.int))

            print(f"Accuracy1 on batch {batch_ndx}: {acc1: .5f}, auc1:{auc1: .5f}, F1_1:{F1_1: .5f}, precision1:{Precision1: .5f}, recall1:{recall1: .5f}")
            print(f"Accuracy2 on batch {batch_ndx}: {acc2: .5f}, auc2:{auc2: .5f}, F1_2:{F1_2: .5f}, precision2:{Precision2: .5f}, recall2:{recall2: .5f}")
            print(' ')

    #     break
    
    
# metric on all batches using custom accumulation
acc = metric.compute()
print(f"Accuracy on all data: {acc}")

metric.reset()



Accuracy1 on batch 0:  1.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 1.00000
Accuracy2 on batch 0:  0.96875, auc2: 0.00000, F1_2: 0.00000, precision2: 0.00000, recall2: 0.48438
 
Accuracy1 on batch 1000:  1.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 1.00000
Accuracy2 on batch 1000:  1.00000, auc2: 0.00000, F1_2: 0.00000, precision2: 0.00000, recall2: 1.00000
 
Accuracy1 on batch 2000:  1.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 1.00000
Accuracy2 on batch 2000:  1.00000, auc2: 0.00000, F1_2: 1.00000, precision2: 1.00000, recall2: 1.00000
 
Accuracy1 on batch 0:  0.00195, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 0.00098
Accuracy2 on batch 0:  0.00000, auc2: 0.00000, F1_2: 0.00000, precision2: 0.00000, recall2: 0.00000
 
Accuracy1 on batch 1000:  1.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 1.00000
Accuracy2 on batch 1000:  1.00000, auc2: 0.00000, F1_2: 0.00000, precision2: 

In [32]:
model.eval()
metric = torchmetrics.Accuracy()
F1 = BinaryF1Score()
Precision = BinaryPrecision()
recall = Recall(average='macro', num_classes=2)

for batch_ndx, sample in enumerate(test_loader):
    with torch.no_grad():
        pred_1, pred_2 = model(sample[0])
        loss1 = loss_fct(pred_1, sample[1][:,0].to(torch.long))
        loss2 = loss_fct(pred_2, sample[1][:,1].to(torch.long))
        
        loss = loss1 + loss2
        
        if batch_ndx % log_interval == 0:

            acc1 = metric(pred_1, sample[1][:,0].to(torch.long))
            acc2 = metric(pred_2, sample[1][:,1].to(torch.long))

            prediction1 = torch.max(F.softmax(pred_1),1)[1]
            prediction2 = torch.max(F.softmax(pred_2),1)[1]  

            auc1 = auc(prediction1, sample[1][:,0], reorder=True)
            auc2 = auc(prediction2, sample[1][:,1], reorder=True)

            F1_1 = F1(prediction1, sample[1][:,0])
            F1_2 = F1(prediction2, sample[1][:,1])

            Precision1 = Precision(prediction1, sample[1][:,0])
            Precision2 = Precision(prediction2, sample[1][:,1])

            recall1 = recall(prediction1, sample[1][:,0].to(torch.int))
            recall2 = recall(prediction2, sample[1][:,1].to(torch.int))

            print(f"Accuracy1 on batch {batch_ndx}: {acc1: .5f}, auc1:{auc1: .5f}, F1_1:{F1_1: .5f}, precision1:{Precision1: .5f}, recall1:{recall1: .5f}")
            print(f"Accuracy2 on batch {batch_ndx}: {acc2: .5f}, auc2:{auc2: .5f}, F1_2:{F1_2: .5f}, precision2:{Precision2: .5f}, recall2:{recall2: .5f}")
            print(' ')
            
# metric on all batches using custom accumulation
acc = metric.compute()
print(f"Accuracy on all data: {acc}")

metric.reset()



Accuracy1 on batch 0:  0.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 0.00000
Accuracy2 on batch 0:  0.04883, auc2: 0.00000, F1_2: 0.09311, precision2: 0.04883, recall2: 0.50000
 
Accuracy1 on batch 1000:  0.00391, auc1: 0.00000, F1_1: 0.00778, precision1: 0.00391, recall1: 0.50000
Accuracy2 on batch 1000:  0.04883, auc2: 0.00000, F1_2: 0.09311, precision2: 0.04883, recall2: 0.50000
 
Accuracy1 on batch 2000:  0.00000, auc1: 0.00000, F1_1: 0.00000, precision1: 0.00000, recall1: 0.00000
Accuracy2 on batch 2000:  0.04297, auc2: 0.00000, F1_2: 0.08240, precision2: 0.04297, recall2: 0.50000
 
Accuracy on all data: 0.02408854104578495
