# Evaluate Model Performance and Compute ROC_AUC

In this notebook, I evaluate the performance of the trained wide and deep model by computing the ROC_AUC score on the test dataset. I chose the ROC_AUC score as the evaluation metric because it measures ranking quality and does not depend on the threshold used to turn predicted probabilities into predicted outcomes. The intuition behind the ROC_AUC score is *the probability that a randomly chosen positive target ranks higher than a randomly chosen negative target*.

A real-world implementation of the CTR prediction project is to select a handful of applications that the user is most likely to click on. In other words, you don't have to predict the probability of click for all the tens of thousands of applications. You only need to make sure the user is interested in the top ranking handful of applications.

The main part of this notebook is copied from the training code. Then we load the saved model and output the predicted probability for each record in the test dataset and compute the ROC_AUC score.

In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
import math
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

In [2]:
# The Spark feature-extraction step names every feature destined for the deep
# part of the model as <feature_name>SEP<num_of_unique_values>SEP<embedding_dim>,
# so the column name itself carries the embedding configuration.


# Number of wide-feature columns at the front of every row
WIDE_DIM = 453

# Deep (embedding) features, in the order they follow the label column
EMBEDDING_INPUTS = [
    'device_modelSEP8251SEP256',
    'app_idSEP8552SEP256',
    'site_idSEP4737SEP256',
    'site_domainSEP7745SEP256',
    'app_domainSEP559SEP128',
]

# Full column layout: wide features, then the label, then the deep features
COLUMNS = (
    [f'wide_feature_{idx}' for idx in range(WIDE_DIM)]
    + ['label']
    + EMBEDDING_INPUTS
)

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
NUM_WORKERS = 6
EPOCHS = 15

In [3]:
class WideAndDeep(nn.Module):
    """Wide & Deep model that returns one raw logit per input row.

    The deep branch embeds each categorical feature, concatenates the
    embeddings and passes them through a stack of ReLU-activated linear
    layers. The wide branch is the (dropout-regularised) wide feature vector.
    Both are concatenated and fed to a final linear layer with one output.

    Args:
        wide_dim: width of the wide feature vector.
        embedding_inputs: feature specs of the form
            ``<feature_name>SEP<num_of_unique_values>SEP<embedding_dim>``;
            position ``i`` in this list corresponds to column ``i`` of ``X_d``.
        hidden_layers: output sizes of the linear layers in the deep branch
            (must contain at least one entry).
        dropout_p: dropout probability applied to the wide features.
    """

    def __init__(self, wide_dim, embedding_inputs, hidden_layers, dropout_p=0.5):
        super().__init__()
        self.wide_dim = wide_dim
        self.embedding_inputs = embedding_inputs
        self.deep_feature_dim = 0
        self.hidden_layers = hidden_layers

        # For each deep feature, create an embedding layer named
        # "<feature_name>_emb_layer" and accumulate the total embedding width.
        for embedding_input in self.embedding_inputs:
            col_name, vocab_size, embed_dim = embedding_input.split('SEP')
            setattr(self, col_name+'_emb_layer', nn.Embedding(int(vocab_size), int(embed_dim)))
            self.deep_feature_dim += int(embed_dim)

        # A series of hidden layers that take the concatenated embeddings as input
        self.linear_layer_1 = nn.Linear(self.deep_feature_dim, self.hidden_layers[0])
        # NOTE(review): bn_1 is registered but never applied in forward(). It is
        # kept so the module's state_dict keys stay unchanged - presumably
        # existing checkpoints contain them; confirm before removing.
        self.bn_1 = nn.BatchNorm1d(self.hidden_layers[0])
        for i, hidden_layer in enumerate(self.hidden_layers[1:]):
            # hidden_layers[i] is the previous layer's width because the
            # enumeration runs over hidden_layers[1:].
            setattr(self, f'linear_layer_{i+2}', nn.Linear(self.hidden_layers[i], hidden_layer))

        self.dropout = nn.Dropout(p=dropout_p)

        # Final dense layer that combines the wide features and the deep features
        self.fc = nn.Linear(self.wide_dim+self.hidden_layers[-1], 1)

    def forward(self, X_w, X_d):
        """Compute logits.

        Args:
            X_w: wide features; last dimension must be ``wide_dim``.
            X_d: deep feature ids, indexed as ``X_d[:, i]`` for the i-th entry
                of ``embedding_inputs``; values are cast to ``long``.

        Returns:
            Tensor whose last dimension is 1 (no sigmoid applied).
        """
        embeddings = []
        for i, embedding_input in enumerate(self.embedding_inputs):
            # Derive the layer name exactly as __init__ does (first SEP field).
            # Filtering out "digit-looking" fragments instead would silently
            # skip any feature whose name consists only of digits.
            col_name = embedding_input.split('SEP')[0]
            emb_layer = getattr(self, col_name+'_emb_layer')
            embeddings.append(emb_layer(X_d[:, i].long()))

        deep_out = torch.cat(embeddings, dim=-1) # concatenate the embeddings of all deep features

        for i in range(len(self.hidden_layers)):
            deep_out = F.relu(getattr(self, f'linear_layer_{i+1}')(deep_out))

        X_w = self.dropout(X_w) # dropout on the wide features for regularization
        fc_input = torch.cat([X_w, deep_out], dim=-1) # concatenate the wide and processed deep features
        return self.fc(fc_input)

In [4]:
# Split the dataset into <num_workers> parts so that each worker processes its own disjoint part of the dataset
# https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset

class CtrDataset(IterableDataset):
    """Stream a CTR CSV file chunk by chunk as ``(X_w, X_d, label)`` arrays.

    The dataset covers the half-open row range ``[start, end)`` of the file.
    Initially that is the whole file; ``worker_init_fn`` narrows ``start`` and
    ``end`` inside each DataLoader worker so every worker reads its own part.

    Args:
        chunksize: number of rows read (and yielded) per chunk.
        train: read ``./train_full.csv`` when True, else ``./validation_full.csv``.
    """

    def __init__(self, chunksize=10000, train=True):
        super().__init__()
        self.train = train
        if self.train:
            self.num_lines = 28303473 # wc -l ./train_full.csv
            self.path = './train_full.csv'
        else:
            self.num_lines = 12125494 # wc -l ./validation_full.csv
            self.path = './validation_full.csv'
        self.chunksize = chunksize
        self.start = 0
        # `end` is used as an exclusive bound everywhere (break on
        # `offset >= end`, `size = end - offset`), so it must be
        # start + num_lines; subtracting 1 would leave the last row out of range.
        self.end = self.start + self.num_lines

    def process_data(self, data):
        """Yield float32 arrays for every chunk that falls inside [start, end).

        Args:
            data: iterable of DataFrame chunks, each holding ``chunksize`` rows
                except possibly the last one of the file.

        Yields:
            ``(X_w, X_d, label)`` numpy arrays (size-1 dimensions squeezed out).
        """
        for i, chunk in enumerate(data):
            # Absolute row index of this chunk's first row. It must be derived
            # from the nominal chunksize: chunk.shape[0] is smaller for a short
            # final chunk and would under-estimate the position.
            offset = self.start + i * self.chunksize
            remaining = self.end - offset
            if remaining <= 0:
                break

            chunk.columns = COLUMNS

            # Don't read past the end of this partition
            size = min(self.chunksize, remaining)
            rows = chunk.iloc[:size]

            X_w = rows.iloc[:, :WIDE_DIM].values.astype(np.float32).squeeze()
            X_d = rows[EMBEDDING_INPUTS].values.astype(np.float32).squeeze()
            label = rows['label'].values.astype(np.float32).squeeze()
            yield X_w, X_d, label

    def __iter__(self):
        # Skip straight to this partition's first row; process_data() stops
        # the iteration once `end` is reached.
        self.df = pd.read_csv(self.path,
                              header=None,
                              chunksize=self.chunksize,
                              skiprows=self.start,
                             )
        return self.process_data(self.df)

def worker_init_fn(worker_id):
    """Restrict this worker's dataset copy to its own slice of [start, end).

    The overall range is cut into ``num_workers`` consecutive slices of
    ``ceil((end - start) / num_workers)`` rows; the worker's id (taken from
    ``get_worker_info()``) selects the slice. The last slice is clipped to
    the overall end.
    """
    info = torch.utils.data.get_worker_info()
    ds = info.dataset  # the dataset copy living in this worker process
    range_lo, range_hi = ds.start, ds.end

    # Rows per worker, rounded up so the slices cover the whole range
    share = int(math.ceil((range_hi - range_lo) / float(info.num_workers)))

    ds.start = range_lo + info.id * share
    ds.end = min(ds.start + share, range_hi)

In [5]:
# A helper function to get the total number of batches

def get_total(dset, dl):
    """Estimate how many batches ``dl`` yields for ``dset`` (for progress bars).

    Each worker is assumed to process ``ceil(rows / num_workers)`` rows in
    chunks of ``dset.chunksize``. The result can slightly overestimate the
    real count when the last worker's share is smaller than the others'.

    Args:
        dset: object exposing ``start``, ``end`` and ``chunksize``.
        dl: object exposing ``num_workers``.

    Returns:
        Estimated number of batches as an int.
    """
    # num_workers == 0 means the data is loaded in the main process, i.e. one
    # consumer; treat it as 1 instead of dividing by zero.
    n_workers = max(int(dl.num_workers), 1)
    rows_per_worker = int(math.ceil((dset.end - dset.start) / float(n_workers)))
    return int(math.ceil(rows_per_worker / dset.chunksize)) * n_workers

In [6]:
# Score every record of the validation file with the trained model and collect
# the true labels / predicted probabilities for the ROC_AUC computation.
val_dset = CtrDataset(train=False, chunksize=8192)
val_dl = DataLoader(val_dset,
                    batch_size=1,  # each dataset item is already a whole chunk of rows
                    num_workers=1,
                    worker_init_fn=worker_init_fn,
                   )
val_total = get_total(val_dset, val_dl)

y_true = np.zeros(val_dl.dataset.num_lines)
y_proba = np.zeros(val_dl.dataset.num_lines)

model = WideAndDeep(wide_dim=WIDE_DIM, embedding_inputs=EMBEDDING_INPUTS, hidden_layers=[512, 256, 128],
                    dropout_p=0.7,
                   )
# Load the weights from the trained model (15 epochs). map_location lets a
# checkpoint saved on a GPU load on a CPU-only machine as well.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load('./saved_model.pt', map_location=DEVICE))

pbar = tqdm(val_dl, total=val_total)
model.eval()
model.to(DEVICE)

use_amp = DEVICE.type == 'cuda'  # mixed precision only makes sense on CUDA
end = 0  # running write position into y_true / y_proba
for X_w, X_d, label in pbar:
    # Drop only the DataLoader's batch dimension. A bare .squeeze() would also
    # collapse the row dimension of a chunk that holds a single record.
    X_w = X_w.reshape(-1, WIDE_DIM).to(DEVICE, non_blocking=True)
    X_d = X_d.reshape(-1, len(EMBEDDING_INPUTS)).to(DEVICE, non_blocking=True)
    label = label.reshape(-1)  # only needed on the CPU
    start = end
    end = start + label.shape[0]

    # Both context managers must be listed with a comma:
    # `no_grad() and autocast()` evaluates to the autocast object alone, so
    # gradient tracking would stay enabled.
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_amp):
        outputs = model(X_w, X_d)
        proba = torch.sigmoid(outputs)

    y_true[start:end] = label.numpy()
    y_proba[start:end] = proba.reshape(-1).cpu().numpy()

# Keep only the rows that were actually scored, so unused zero padding can
# never leak into the metric.
y_true = y_true[:end]
y_proba = y_proba[:end]

100%|███████████████████████████████████████| 1481/1481 [07:25<00:00,  3.33it/s]


The ROC_AUC score of the wide and deep model is 0.7497, a ~5.4% relative improvement over the gradient boosting tree model (0.7112).

In [7]:
# ROC_AUC over all scored records: true labels vs. predicted click probabilities
roc_auc_score(y_true=y_true, y_score=y_proba)

0.7496930952066014