<a href="https://colab.research.google.com/github/usamabaig1/Attribution_Modeling/blob/attributionModeller/Attribution_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount the user's Google Drive at /gdrive (Colab only) so the dataset stored
# there can be copied into the runtime by the next cell.
drive.mount('/gdrive')
# %cd /gdrive/MyDrive/Criteo

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import shutil, sys                                                                                                                                                    
# Copy the full Criteo attribution dataset from Drive into /content/.
# The commented-out lines are alternatives that use the 300-campaign subset.
# shutil.copy( '/gdrive/My Drive/Criteo/criteo_attribution_dataset300Camps.tsv', '/content/')
shutil.copy( '/gdrive/My Drive/Criteo/criteo_attribution_dataset.tsv', '/content/')
# !cp -r 'criteo_attribution_dataset300Camps.tsv' .

'/content/criteo_attribution_dataset.tsv'

In [3]:
# NOTE: %pylab populates the interactive namespace from numpy and matplotlib;
# later cells use `np` and `plt` without importing them, so they depend on
# this magic having been run first.
%pylab inline
import pandas as pd
# Use the ggplot look for every figure in the notebook.
plt.style.use('ggplot')
from scipy.optimize import minimize
# NOTE(review): set_trace is a debugging helper and is not used in the cells shown here.
from IPython.core.debugger import set_trace

Populating the interactive namespace from numpy and matplotlib


In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

In [5]:
# Alternative: read the gzip-compressed download directly.
# DATA_FILE='criteo_attribution_dataset.tsv.gz'
# df = pd.read_csv(DATA_FILE, sep='\t', compression='gzip')

# Path is relative to the working directory (the file was copied to /content/ above).
DATA_FILE='criteo_attribution_dataset.tsv'
# DATA_FILE='criteo_attribution_dataset300Camps.tsv'
# The file is tab-separated; the whole dataset is loaded into memory.
df = pd.read_csv(DATA_FILE, sep='\t')

In [6]:
# Disabled: one-off code that built the 300-campaign subset file referenced in
# the earlier cells.
# filteredCampaigns = df['campaign'].unique()[0:300] #selecting only 300 campaigns out of 675 (12515351 out of 16468027)
# df = df[df['campaign'].isin(filteredCampaigns)]
# df.to_csv('criteo_attribution_dataset300Camps.tsv', sep = '\t')
# Sanity check: (rows, columns) of the loaded frame.
df.shape
# !ls

(16468027, 22)

Here is a detailed description of the fields (they are tab-separated in the file):

  * **timestamp**: timestamp of the impression (starting from 0 for the first impression). The dataset is sorted according to timestamp.
  *  **uid** a unique user identifier
  * **campaign** a unique identifier for the campaign
  * **conversion** 1 if there was a conversion in the 30 days after the impression (independently of whether this impression was last click or not)
  * **conversion_timestamp** the timestamp of the conversion or -1 if no conversion was observed
  *	**conversion_id**	a unique identifier for each conversion (so that timelines can be reconstructed if needed). -1 if there was no conversion
  * **attribution** 1 if the conversion was attributed to Criteo, 0 otherwise
  * **click** 1 if the impression was clicked, 0 otherwise
  *	**click_pos** the position of the click before a conversion (0 for first-click)
  * **click_nb** number of clicks. More than 1 if there were several clicks before a conversion
  * **cost** the price paid by Criteo for this display (**disclaimer:** not the real price, only a transformed version of it)
  *	 **cpo** the cost-per-order  in case of attributed conversion (**disclaimer:** not the real price, only a transformed version of it)
  * **time\_since\_last\_click** the time since the last click (in s) for the given impression
  * **cat[1-9]** contextual features associated with the display. They can be used to learn the click/conversion models. We do not disclose the meaning of these features, but it is not relevant for this study. Each column is a categorical variable. In the experiments, they are mapped to a fixed-dimensionality space using the Hashing Trick (see paper for reference).



# 1. Data Exploration






In [7]:
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# NOTE: a cell only displays its last expression, so the head() and shape
# results below are computed and discarded; the visible output is df.dtypes.
df.head()
df.shape
df.dtypes
# df.isnull().values.any()

timestamp                  int64
uid                        int64
campaign                   int64
conversion                 int64
conversion_timestamp       int64
conversion_id              int64
attribution                int64
click                      int64
click_pos                  int64
click_nb                   int64
cost                     float64
cpo                      float64
time_since_last_click      int64
cat1                       int64
cat2                       int64
cat3                       int64
cat4                       int64
cat5                       int64
cat6                       int64
cat7                       int64
cat8                       int64
cat9                       int64
dtype: object

In [8]:
# df[(df['uid']== 7306395) & (df['campaign']== 29427842)]
# a user can interact with the same campaign for more than one conversion

In [9]:
# df.nunique()

In [10]:
# Bucket each impression into a 0-based day index (86400 seconds per day).
df['day'] = (df['timestamp'] // 86400).astype(int)

In [11]:
# df.click_pos.hist(bins = len(df.click_pos.unique()))

In [12]:
# df.click_nb.hist(bins = len(df.click_nb.unique()))

In [13]:
# df.day.hist(bins=len(df.day.unique()))

# 2. Data Manipulation


In [14]:
# Time elapsed between the impression and its conversion; -1 when the
# impression did not lead to a conversion.
df['gap_click_sale'] = (df['conversion_timestamp'] - df['timestamp']).where(df['conversion'] == 1, -1)

In [32]:
# Last-click credit: the attribution flag, kept only on the final click of the sequence.
df['last_click'] = df['attribution'].where(df['click_pos'] == df['click_nb'] - 1, 0)
# First-click credit: the attribution flag, kept only on the first click (position 0).
df['first_click'] = df['attribution'].where(df['click_pos'] == 0, 0)
# All-clicks credit: every attributed impression gets the full flag.
df['all_clicks'] = df['attribution']
# Uniform credit: the attribution flag divided by the number of clicks.
df['uniform'] = df['attribution'].div(df['click_nb'].astype(float))

In [33]:
# Categorical model inputs: the campaign id plus contextual features cat1..cat8
# (cat9 is not included).
FEATURES = ['campaign'] + ['cat%d' % idx for idx in range(1, 9)]
# Cost columns and the attribution-credit columns derived above.
INFOS = [
    'cost', 'cpo', 'time_since_last_click',
    'last_click', 'first_click', 'all_clicks', 'uniform',
]

# 3. Attribution Model


In [34]:
def bootstrap_method(data, sample_count, statistic, alpha):
  '''
    Percentile-bootstrap confidence interval for `statistic` over `data`.

    Draws `sample_count` resamples of `data` (same size, with replacement),
    evaluates `statistic` on each resample, sorts the results and returns the
    values found at the alpha/2 and 1 - alpha/2 positions.

    Args:
      data: 1-D array-like of observations (e.g. a DataFrame column).
      sample_count: number of bootstrap resamples; must be >= 1.
      statistic: callable mapping a 1-D numpy array to a scalar (e.g. np.mean).
      alpha: significance level in [0, 1]; 0.05 yields a 95% interval.

    Returns:
      Tuple (lower, upper) of statistic values bounding the interval.

    Raises:
      ValueError: if sample_count < 1 or alpha is outside [0, 1].
  '''
  if sample_count < 1:
    raise ValueError("sample_count must be at least 1")
  if not 0 <= alpha <= 1:
    raise ValueError("alpha must be between 0 and 1")

  # Convert once instead of on every iteration.
  values = np.asarray(data)
  # The values must be sorted for positional indexing to select percentiles.
  statistic_values = sorted(
      statistic(np.random.choice(values, len(values), replace=True))
      for _ in range(sample_count)
  )

  # Clamp so that alpha == 0 (upper index == sample_count) stays in range.
  last_index = sample_count - 1
  lower_index = min(int((alpha / 2) * sample_count), last_index)
  upper_index = min(int((1 - alpha / 2) * sample_count), last_index)
  return (statistic_values[lower_index], statistic_values[upper_index])


In [15]:
class weightedExp(nn.Module):
  '''
    Exponential-decay activation with a learnable rate.

    forward(input) evaluates exp(-weight * input) element-wise, where
    `weight` is a scalar nn.Parameter initialised from the constructor
    argument.
  '''
  def __init__(self, weight = 1e-3):
    super().__init__()
    # Wrapping the scalar in a Parameter exposes it through model.parameters().
    initial_rate = torch.tensor(weight)
    self.weight = nn.Parameter(initial_rate)

  def forward(self, input):
    decay_exponent = -1 * self.weight * input
    return torch.exp(decay_exponent)


def nllh_loss(input, pred, target, lambd):
  '''
    Negative log-likelihood loss, averaged over the elements of `pred`.

    Each element contributes
      target * input * lambd  -  (1 - target) * log(1 - pred)
    and the contributions are summed, then divided by the element count.
  '''
  positive_term = target * input * lambd
  negative_term = (1 - target) * torch.log(1 - pred)
  total = (positive_term - negative_term).sum()
  element_count = pred.data.nelement()

  return total / element_count

In [48]:
def optize_lambda(tts, attribution, verbose = False, learning_rate = 0.0001, epochs = 10):
  '''
    Fit the decay rate of a weightedExp model by minimising nllh_loss with Adam.

    Args:
      tts: array-like of time values fed to the model as input.
      attribution: array-like of targets, same length as `tts`.
      verbose: when True, print the epoch index and current weight each epoch.
      learning_rate: Adam step size (default keeps the previous hard-coded value).
      epochs: number of full-batch optimisation steps (default keeps the
        previous hard-coded value).

    Returns:
      The fitted weight as a Python float.
  '''
  # Plain tensors are sufficient; the autograd Variable wrapper is deprecated.
  x_data = torch.Tensor(tts)
  y_data = torch.Tensor(attribution)
  model = weightedExp()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # Training mode only needs to be set once, not on every epoch.
  model.train()
  for epoch in range(epochs):
    optimizer.zero_grad()

    pred = model(x_data)
    # The model's own weight is passed as lambd so the loss and the
    # prediction share the same parameter.
    loss = nllh_loss(x_data, pred, y_data, model.weight)
    loss.backward()
    optimizer.step()
    if verbose:
      print( "Epoch:", epoch)
      print( "Lambda:", model.weight)

  return model.weight.item()

In [16]:
# df_view is another name for the same DataFrame object (no copy is made);
# the training cell below filters through it.
df_view = df
df_view.shape

(16468027, 24)

In [49]:
# Training window: the `learning_duration` days immediately before `test_day`.
test_day = 20
learning_duration = 21
df_train = df_view.loc[
    df_view['day'].ge(test_day - learning_duration) & df_view['day'].lt(test_day)
]
# Keep only the impressions that are the last click of their sequence.
df_conv = df_train.loc[df_train['click_pos'].eq(df_train['click_nb'] - 1)]

# Model input: impression-to-conversion gap; target: the attribution flag.
x = df_conv['gap_click_sale'].to_numpy()
y = df_conv['attribution'].to_numpy()

lamb = optize_lambda(x, y)
lamb


1.8000719137489796e-07

In [None]:
# NOTE(review): repeats the earlier `df_view = df` cell; this cell has no
# execution count, so it appears not to have been run.
df_view = df
df_view.shape