<a href="https://colab.research.google.com/github/usamabaig1/Attribution_Modeling/blob/attributionModeller/Attribution_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /gdrive. This uses the google.colab helper, so the
# cell only works when the notebook is executed inside Colab.
from google.colab import drive
drive.mount('/gdrive')
# %cd /gdrive/MyDrive/Criteo

In [None]:
# Copy the 300-campaign subset of the dataset from the mounted Drive into
# /content/ (presumably the notebook's working directory, since the file is
# later opened by bare name -- TODO confirm).
import shutil, sys
shutil.copy( '/gdrive/My Drive/Criteo/criteo_attribution_dataset300Camps.tsv', '/content/')
# Alternative: copy the full (unfiltered) dataset instead.
# shutil.copy( '/gdrive/My Drive/Criteo/criteo_attribution_dataset.tsv', '/content/')
# !cp -r 'criteo_attribution_dataset300Camps.tsv' .

In [None]:
%pylab inline
import pandas as pd
plt.style.use('ggplot')
from scipy.optimize import minimize
from IPython.core.debugger import set_trace

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

In [None]:
# Alternative inputs kept for reference: the gzip-compressed full dataset ...
# DATA_FILE='criteo_attribution_dataset.tsv.gz'
# df = pd.read_csv(DATA_FILE, sep='\t', compression='gzip')

# ... or the uncompressed full dataset.
# DATA_FILE='criteo_attribution_dataset.tsv'
# Active input: the tab-separated 300-campaign subset, loaded into `df`.
DATA_FILE='criteo_attribution_dataset300Camps.tsv'
df = pd.read_csv(DATA_FILE, sep='\t')

In [None]:
# One-off preprocessing kept for reference: how the 300-campaign subset file
# was produced (300 campaigns out of 675; 12515351 rows out of 16468027).
# NOTE(review): to_csv is called without index=False, so the saved file
# presumably carries the old index as an extra unnamed column -- confirm.
# filteredCampaigns = df['campaign'].unique()[0:300] #selecting only 300 campaigns out of 675 (12515351 out of 16468027)
# df = df[df['campaign'].isin(filteredCampaigns)]
# df.to_csv('criteo_attribution_dataset300Camps.tsv', sep = '\t')
# Show (rows, columns) of the loaded frame.
df.shape
# !ls



  * **timestamp**: timestamp of the impression (starting from 0 for the first impression). The dataset is sorted according to timestamp.
  *  **uid** a unique user identifier
  * **campaign** a unique identifier for the campaign
  * **conversion** 1 if there was a conversion in the 30 days after the impression (independently of whether this impression was last click or not)
  * **conversion_timestamp** the timestamp of the conversion or -1 if no conversion was observed
  *	**conversion_id**	a unique identifier for each conversion (so that timelines can be reconstructed if needed). -1 if there was no conversion
  * **attribution** 1 if the conversion was attributed to Criteo, 0 otherwise
  * **click** 1 if the impression was clicked, 0 otherwise
  *	**click_pos** the position of the click before a conversion (0 for first-click)
  * **click_nb** number of clicks. More than 1 if there were several clicks before a conversion
  * **cost** the price paid by Criteo for this display (**disclaimer:** not the real price, only a transformed version of it)
  *	 **cpo** the cost-per-order  in case of attributed conversion (**disclaimer:** not the real price, only a transformed version of it)
  * **time\_since\_last\_click** the time since the last click (in s) for the given impression
  *	 **cat[1-9]** contextual features associated to the display. Can be used to learn the click/conversion models. We do not disclose the meaning of these features but it is not relevant for this study. Each column is a categorical variable. In the experiments, they are mapped to a fixed dimensionality space using the Hashing Trick (see paper for reference).



# 1. Data Exploration






In [None]:
# Show every column when a frame is rendered.
pd.set_option('display.max_columns', None)
# A notebook cell only renders its last expression automatically, so the
# intermediate views are passed to display() (available in IPython kernels)
# instead of being silently discarded.
display(df.head())
display(df.shape)
df.dtypes
# df.isnull().values.any()

In [None]:
# df[(df['uid']== 7306395) & (df['campaign']== 29427842)]
# a user can interact with the same campaign for more than one conversion

In [None]:
# df.nunique()

In [None]:
# Bucket every impression into a whole-day index: the timestamp divided by
# 86400 (seconds in a day), rounded down.
df['day'] = np.floor(df['timestamp'].div(86400.)).astype(int)

In [None]:
# df.click_pos.hist(bins = len(df.click_pos.unique()))

In [None]:
# df.click_nb.hist(bins = len(df.click_nb.unique()))

In [None]:
# df.day.hist(bins=len(df.day.unique()))

# 2. Data Manipulation


In [None]:
# Time between an impression and its conversion; rows without a conversion
# (conversion != 1) keep the sentinel value -1.
df['gap_click_sale'] = np.where(
    df['conversion'] == 1,
    df['conversion_timestamp'] - df['timestamp'],
    -1,
)
# df['gap_click_sale_day'] = np.floor((df['gap_click_sale']/86400)).astype(int)

In [None]:
# Label columns, one per attribution scheme. Each is derived from the
# `attribution` flag, so non-attributed rows end up with 0 (or -0.0).

# last_click: attributed AND this impression holds the final click position.
df['last_click'] = (df['click_pos'] == df['click_nb'] - 1).astype(int) * df['attribution']
# first_click: attributed AND this impression holds click position 0.
df['first_click'] = (df['click_pos'] == 0).astype(int) * df['attribution']
# all_clicks: the raw attribution flag.
df['all_clicks'] = df['attribution']
# uniform: the attribution flag divided by the number of clicks.
df['uniform'] = df['attribution'] / df['click_nb'].astype(float)

In [None]:
# Categorical columns fed to the feature hasher.
FEATURES = [
    'campaign',
    'cat1', 'cat2', 'cat3', 'cat4',
    'cat5', 'cat6', 'cat7', 'cat8',
]
# Per-row side information kept alongside the test slice for scoring.
INFOS = [
    'cost',
    'cpo',
    'time_since_last_click',
    'last_click',
    'first_click',
    'all_clicks',
    'uniform',
]

# 3. Attribution Model


In [None]:
def bootstrap_method(data, sample_count, statistic, alpha):
  """Returns a bootstrap estimate of the 100*(1-alpha)% CI for statistic.

  Args:
    data: 1-D array-like of observations to resample.
    sample_count: number of bootstrap resamples to draw.
    statistic: callable applied to each resample (e.g. np.mean, np.sum).
    alpha: significance level; 0.05 gives a 95% interval.

  Returns:
    (lower, upper): the alpha/2 and 1-alpha/2 empirical quantiles of the
    statistic over the resamples.
  """
  values = np.asarray(data)
  # Each resample has the same size as the data and is drawn with replacement.
  statistic_values = np.sort([
      statistic(np.random.choice(values, len(values), replace=True))
      for _ in range(sample_count)
  ])
  # Clamp both positions so extreme alphas (e.g. 0) cannot index past the end.
  last = sample_count - 1
  lower = min(int((alpha/2)*sample_count), last)
  upper = min(int((1-alpha/2)*sample_count), last)
  return (statistic_values[lower], statistic_values[upper])


In [None]:
class weightedExp(nn.Module):
  '''
    Exponential-decay activation exp(-weight * x) whose decay rate `weight`
    is a learnable parameter.
  '''
  def __init__(self, weight = 1e-3):
    super().__init__()
    # Registered as a Parameter so optimizers returned model.parameters() update it.
    self.weight = nn.Parameter(torch.tensor(weight))

  def forward(self, input):
    scaled = self.weight * input
    return torch.exp(-scaled)


def nllh_loss(input, pred, target, lambd):
  '''
    Implements the NLLH loss as mentioned in the paper.

    Per element the loss is
      target * input * lambd - (1 - target) * log(1 - pred)
    and the mean over all elements of `pred` is returned.

    input:  tensor of time gaps the model was evaluated on
    pred:   model output for `input` (exp(-lambd * input) in this notebook)
    target: tensor of labels weighting the two terms
    lambd:  decay rate (tensor/Parameter or float)
  '''
  # When pred == 1 (e.g. a zero gap) log(1 - pred) is -inf, and for a
  # positive example the product 0 * -inf is NaN, which poisons the whole
  # loss and its gradient. Clamping to the smallest positive normal number
  # keeps the log finite; values of 1 - pred above that are untouched.
  tiny = torch.finfo(pred.dtype).tiny
  complement = torch.clamp(1 - pred, min=tiny)
  nllh = (target*input*lambd) - ((1-target)*torch.log(complement))
  return nllh.sum() / pred.numel()

In [None]:
def optize_lambda(tts, attribution, verbose = False, learning_rate = 0.0001, epochs = 10):
  """Fit the decay rate (lambda) of the weightedExp model with Adam.

  Args:
    tts: array-like of time gaps used as model input.
    attribution: array-like of labels, same length as `tts`.
    verbose: when True, print the epoch number and current lambda each step.
    learning_rate: Adam step size. The default gives the closest
      approximation to the value of lambda mentioned in the paper.
    epochs: number of full-batch optimisation steps (same note applies).

  Returns:
    The fitted lambda as a Python float.
  """
  # torch.Tensor(...) converts the inputs to the default float dtype; the
  # deprecated autograd.Variable wrapper is not needed around plain tensors.
  x_data = torch.Tensor(tts)
  y_data = torch.Tensor(attribution)

  model = weightedExp()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # Training mode only needs to be set once, not on every step.
  model.train()
  for epoch in range(0, epochs):
    optimizer.zero_grad()

    pred = model(x_data)
    # The model's own weight is passed as lambda so both loss terms share it.
    loss = nllh_loss(x_data, pred, y_data, model.weight)
    loss.backward()
    optimizer.step()
    if verbose:
      print( "Epoch:", epoch)
      print( "Lambda:", model.weight)

  return model.weight.item()

In [None]:
# Fit the decay rate `lamb` of the attribution model on a training window.
df_view = df
# NOTE: not the last expression of the cell, so this value is not rendered.
df_view.shape
test_day =20
learning_duration = 21
# Training window: the `learning_duration` days strictly before `test_day`.
df_train = df_view[(df_view.day >= test_day - learning_duration) & (df_view.day < test_day)]
df_conv = df_train[df_train.click_pos == df_train.click_nb - 1] # selecting those impressions which were clicked last
#only finding the chance of conversion given the time since last click
# NOTE(review): the model input is `gap_click_sale` (conversion time minus
# impression time), not the `time_since_last_click` column -- confirm intent.
x = df_conv.gap_click_sale.values
y = df_conv.attribution.values 

lamb = optize_lambda(x, y)
lamb
# Value recorded from a previous run:
# Lambda Value : 1.8000719137489796e-07

# AA attributions on full dataset

In [None]:
#@title
def compute_aa_attributions(test_info, normalize=True):
    """Spread each conversion's credit over its clicks using `bf_pred`.

    Only rows with ``all_clicks == 1`` receive credit; every other row gets
    0.0. With ``normalize=True`` a click's credit is its ``bf_pred`` divided
    by the sum of ``bf_pred`` over the same ``conversion_id`` (0.0 when that
    sum is not positive). With ``normalize=False`` the credit is the raw
    ``bf_pred``.

    Args:
        test_info: DataFrame with at least the columns ``all_clicks``,
            ``conversion_id`` and ``bf_pred``. It is not modified.
        normalize: whether credits within one conversion are rescaled to
            sum to 1.

    Returns:
        Float Series named ``aa_attribution`` with the same index and length
        as ``test_info``.
    """
    converted_mask = (test_info['all_clicks'] == 1).to_numpy()
    # Filled by position so the result always lines up with test_info's rows.
    attribution = np.zeros(len(test_info), dtype=float)

    # With no converted click there is nothing to attribute; return all zeros.
    if converted_mask.any():
        converted = test_info.loc[converted_mask, ['conversion_id', 'bf_pred']]
        weights = converted['bf_pred'].to_numpy(dtype=float)
        if normalize:
            # Sum of bf_pred per conversion, broadcast back onto each click.
            totals = (converted.groupby('conversion_id')['bf_pred']
                      .transform('sum')
                      .to_numpy(dtype=float))
            has_mass = totals > 0.0
            safe_totals = np.where(has_mass, totals, 1.0)  # avoid dividing by 0
            weights = np.where(has_mass, weights / safe_totals, 0.0)
        # Missing predictions carry no credit.
        attribution[converted_mask] = np.where(np.isnan(weights), 0.0, weights)

    return pd.Series(attribution, index=test_info.index, name='aa_attribution')

In [None]:
# compute the bid factor from aa attribution for each display
gap_test = df.time_since_last_click.values
# 1.0 where a previous click exists (time_since_last_click >= 0), else 0.0
previous_tslc_mask = (df.time_since_last_click >=0).astype(float)
# exp(-lambda * gap), zeroed for displays without a previous click.
# Multiply by the mask's array so no pandas index alignment is involved.
attr_pred = np.exp(-lamb*gap_test) * previous_tslc_mask.values
bf_pred = 1 - attr_pred
df['bf_pred'] = bf_pred
df['AA_normed'] = compute_aa_attributions(df, normalize=True)
df['AA_not_normed'] = compute_aa_attributions(df, normalize=False)
# Register the new columns only once so re-running this cell does not leave
# duplicate names in INFOS (duplicates would select duplicate columns later).
INFOS += [col for col in ('bf_pred', 'AA_normed', 'AA_not_normed') if col not in INFOS]

Validation Methods

In [None]:
def get_train_test_slice(df_view, test_day, learning_duration, label, features=None,
                         hash_space=2**24, nrows=None, infos=None):
    """Build hashed train/test matrices for one test day.

    The test slice is the rows with ``day == test_day``; the training slice
    is the ``learning_duration`` days strictly before it.

    Args:
        df_view: DataFrame with a ``day`` column, the feature and label
            columns, the ``infos`` columns and ``last_click``.
        test_day: day index used as the test slice.
        learning_duration: number of days before ``test_day`` used to train.
        label: name of the target column.
        features: feature columns to hash; defaults to FEATURES.
        hash_space: number of hashed features; ``None`` falls back to 2**24.
        nrows: optional cap on the rows kept in each slice.
        infos: side-information columns returned for the test slice;
            defaults to INFOS.

    Returns:
        ``(X_train, y_train), (X_test, y_test), test_info, lc_mean`` where
        ``lc_mean`` is the mean of ``last_click`` over the training slice.
    """
    if features is None:
        features = FEATURES
    if infos is None:
        infos = INFOS
    # Callers such as evaluate_day_for_bidder forward hash_space=None by default.
    if hash_space is None:
        hash_space = 2**24

    df_test = df_view[df_view.day == test_day]
    df_train = df_view[(df_view.day >= test_day - learning_duration) & (df_view.day < test_day)]
    if nrows is not None:
        df_test = df_test[:nrows]
        df_train = df_train[:nrows]

    def to_dict_values(frame):
        # One {"<column><value>": 1} dict per row, i.e. a one-hot token per feature.
        return [{name + str(value): 1 for name, value in zip(features, row)}
                for row in frame.values]

    # NOTE(review): FeatureHasher is not imported in the visible cells --
    # presumably sklearn.feature_extraction.FeatureHasher; confirm the import.
    # `non_negative` was removed from scikit-learn; `alternate_sign=False` is
    # the documented way to keep hashed values non-negative.
    hasher = FeatureHasher(n_features=hash_space, alternate_sign=False)

    X_train_h = hasher.fit_transform(to_dict_values(df_train[features]))
    X_test_h = hasher.transform(to_dict_values(df_test[features]))
    y_train = df_train[label]
    y_test = df_test[label]
    return (X_train_h, y_train), (X_test_h, y_test), df_test[infos], df_train.last_click.mean()

Compute Utilities

In [None]:
from scipy.special import gammainc
def empirical_utility(a, v, c, p):
    """Revenue and cost of the displays whose bid beats the cost.

    A display counts as won when ``p*v > c``.

    Args:
        a: attribution credit per display.
        v: value per display (callers pass the ``cpo`` column).
        c: cost per display.
        p: predicted probability per display.

    Returns:
        ``(revenue, cost)``: ``a*v`` on won displays and 0 elsewhere, and
        ``-c`` on won displays and 0 elsewhere (cost is negative).
    """
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    won = np.array(p*v > c, dtype=int)
    return (a*v)*won, -c*won

def expected_utility(a, v, c, p, beta=1000):
    """Closed-form utility built from two incomplete-gamma terms.

    Returns ``a*v*gammainc(beta*c+1, beta*p*v)`` minus
    ``((beta*c+1)/beta)*gammainc(beta*c+2, beta*p*v)``; arguments follow the
    same convention as ``empirical_utility`` (credit, value, cost,
    prediction), with ``beta`` scaling both gamma arguments.
    """
    scaled_bid = beta*p*v
    revenue_term = a*v*gammainc(beta*c+1, scaled_bid)
    cost_term = ((beta*c+1)/beta)*gammainc(beta*c+2, scaled_bid)
    return revenue_term - cost_term

In [None]:
def evaluate_utility(y_pred, utilities, betas, test_info):
    """Score predictions under every (utility, beta) combination.

    For each attribution column named in `utilities` and each value in
    `betas`, an infinite beta selects `empirical_utility` and a finite one
    selects `expected_utility`. Results are stored under the key
    ``'utility-<name>-beta<beta>'`` (``'inf'`` for the infinite case).
    """
    results = {}
    for name in utilities:
        credit = test_info[name]
        for beta in betas:
            if np.isinf(beta):
                suffix = 'inf'
                value = empirical_utility(credit, test_info.cpo, test_info.cost, y_pred)
            else:
                suffix = str(beta)
                value = expected_utility(credit, test_info.cpo, test_info.cost, y_pred, beta=beta)
            results['utility-'+name+'-beta'+suffix] = value
    return results

def get_naive_baseline(y_train, X_test):
    """Constant prediction: the training-label mean, once per test row."""
    n_test_rows = X_test.shape[0]
    base_rate = np.mean(y_train)
    return base_rate*np.ones(n_test_rows)

In [None]:
def evaluate_day_for_bidder(df_view, test_day, learning_duration, bidder, utilities, betas,
                            hash_space=None, features=None, clf=None, AA_bidder_label=None, recalibrate=True):
    """Train one bidder on the days before `test_day` and score it on `test_day`.

    Args:
        df_view: full DataFrame; sliced by `get_train_test_slice`.
        test_day: day index used as the test slice.
        learning_duration: number of days before `test_day` used for training.
        bidder: name of the label column to learn, or 'AA' to learn
            `AA_bidder_label` and modulate predictions by `bf_pred`.
        utilities: attribution column names forwarded to `evaluate_utility`.
        betas: beta values forwarded to `evaluate_utility`.
        hash_space: forwarded to `get_train_test_slice`.
        features: forwarded to `get_train_test_slice`.
        clf: classifier exposing `fit` and `predict_proba`. It is used
            unconditionally, so the default of None cannot actually be used.
        AA_bidder_label: label column used when `bidder == 'AA'`.
        recalibrate: when True, rescale predictions so their mean equals the
            training-slice mean of `last_click`.

    Returns:
        (score, bid_profile): `score` holds 'nllh', 'nllh_naive', 'won',
        'n_auctions' and the utility entries; `bid_profile` holds the test
        rows' `time_since_last_click` and the final bids.
    """
    score = dict()
    bid_profile = dict()
    label = bidder
    if bidder == 'AA':
        label = AA_bidder_label
    # get data slice
    (X_train, y_train), (X_test, y_test), test_info, y_train_lc_mean = get_train_test_slice(df_view,
                                                                           test_day,
                                                                           learning_duration,
                                                                           label=label, 
                                                                           hash_space = hash_space,
                                                                           features=features)           
    
    # learn the model
    clf.fit(X_train, y_train)
    
    # get test predictions (column 1 of predict_proba)
    y_pred = clf.predict_proba(X_test)[:,1]            
    
    # if aa bidder: modulate the bids by bid_factor computed from attribution model
    if bidder == 'AA':
        y_pred *= test_info['bf_pred']
    
    # compute the loss (summed, not averaged; taken BEFORE recalibration)
    # NOTE(review): log_loss is not imported in the visible cells --
    # presumably sklearn.metrics.log_loss; confirm.
    loss = log_loss(y_test, y_pred, normalize=0)
    
    # loss of baseline model
    baseline_loss = log_loss(y_test, get_naive_baseline(y_train, X_test), normalize=0)
    score['nllh'] = loss
    score['nllh_naive'] = baseline_loss
    
    # do we recalibrate output? (i.e recalibrate mean prediction). This is usually done by a control system.
    # The target mean is always the last_click mean of the training slice,
    # whichever label the bidder was trained on.
    if recalibrate:
        y_pred *= (y_train_lc_mean / y_pred.mean())
    
    #how many displays are won? (bid = prediction * cpo must exceed the cost)
    won = (y_pred*test_info.cpo > test_info.cost).astype(int)
    score['won'] = np.sum(won)
    score['n_auctions'] = y_pred.shape[0]
    
    # compute the scores on this slice
    score.update(evaluate_utility(y_pred, utilities, betas, test_info))
    
    #store bid profiles
    bid_profile['time_since_last_click'] = test_info.time_since_last_click
    bid_profile['bid'] = y_pred
    
    return score, bid_profile

In [None]:
def merge_utility_score(score):
    """Collapse per-display utility arrays in `score` into totals, in place.

    For every key containing 'utility':
      * keys containing 'inf' hold a ``(revenue, cost)`` pair; the total and
        the separate ``~revenue`` / ``~cost`` sums are stored;
      * other keys hold a single array whose sum is stored.
    A ``<key>-delta`` entry with half the width of the bootstrap 95%
    interval of the summed utility is added for each of them.

    Args:
        score: dict mutated in place; non-utility entries are left untouched.
    """
    updates = dict()
    for k, v in score.items():
        if 'utility' not in k:
            continue
        if 'inf' in k:
            revenue, cost = v
            updates[k] = np.sum(cost) + np.sum(revenue)
            updates[k+'~revenue'] = np.sum(revenue)
            updates[k+'~cost'] = np.sum(cost)
            # Per-display net utility, used for the bootstrap below.
            v = revenue + cost
        else:
            updates[k] = np.sum(v)
        # The resampling helper defined in this notebook is `bootstrap_method`;
        # the previous call to `bootstrap` referenced an undefined name.
        bounds = bootstrap_method(v, 100, np.sum, .05)
        updates[k+'-delta'] = (bounds[1]-bounds[0])/2.
    # Applied after the loop so the dict is not resized while iterating.
    score.update(updates)

In [None]:
def update_score(partial_score, score):
    """Accumulate one day's `partial_score` into the running `score`, in place.

    Plain metrics are added up; utility arrays are concatenated; 'inf'
    utilities, stored as ``(revenue, cost)`` pairs, are concatenated
    component-wise. Each entry is also printed as it is merged.
    """
    for name, value in partial_score.items():
        # Plain metric: running sum.
        if 'utility' not in name:
            print('\t\t', name, value)
            score[name] = score.get(name, 0) + value
            continue

        # Finite-beta utility: a single per-display array.
        if 'inf' not in name:
            print('\t\t', name, np.sum(value))
            score[name] = np.append(score.get(name, np.array([])), value)
            continue

        # Infinite-beta utility: a (revenue, cost) pair of arrays.
        revenue, cost = value
        print('\t\t', name, np.sum(cost)+np.sum(revenue))
        seen_revenue, seen_cost = score.get(name, (np.array([]), np.array([])))
        score[name] = (np.append(seen_revenue, revenue), np.append(seen_cost, cost))

# Evaluation

In [None]:
from datetime import datetime, timedelta
def evaluate_slices(df_view,
                    bidders=['last_click', 'first_click', 'AA'],
                    utilities=['last_click','first_click', 'AA_normed', 'AA_not_normed'],
                    betas=[np.inf, 10, 1000],
                    test_days=[22],
                    learning_duration=21,
                    hash_space=2**24,
                    features=None,
                    AA_bidder_label='all_clicks',
                    clf = LogisticRegression(solver='lbfgs', n_jobs=4),
                    recalibrate = True):
    """Evaluate every bidder on every test day and tabulate the results.

    Each bidder is trained and scored once per day in `test_days` through
    `evaluate_day_for_bidder`; the daily scores are accumulated with
    `update_score` and summarised with `merge_utility_score`.

    Returns:
        (scores, bid_profiles): two DataFrames with one row per bidder.
    """
    all_scores, all_profiles = [], []
    for bidder in bidders:
        print ('*'*80)
        print("EVALUATING BIDDER:", bidder)
        bidder_score, bidder_profile = dict(), dict()

        for test_day in test_days:
            started_at = datetime.now()
            print('\t- day:', test_day)
            day_score, day_profile = evaluate_day_for_bidder(
                df_view, test_day, learning_duration, bidder,
                utilities, betas,
                hash_space=hash_space, features=features, clf=clf,
                AA_bidder_label=AA_bidder_label, recalibrate=recalibrate
            )
            update_score(day_score, bidder_score)
            # Concatenate this day's bid profile onto the bidder's running profile.
            for key, values in day_profile.items():
                previous = bidder_profile.get(key, np.array([]))
                bidder_profile[key] = np.append(previous, values)
            print('\t- took', datetime.now() - started_at)

        bidder_score['bidder'] = bidder
        bidder_profile['bidder'] = bidder
        # Relative improvement of the model's log loss over the naive baseline.
        naive_nllh = bidder_score['nllh_naive']
        bidder_score['nllh_comp_vn'] = (naive_nllh - bidder_score['nllh']) / np.abs(naive_nllh)
        bidder_score['win_rate'] = bidder_score['won'] / bidder_score['n_auctions']
        merge_utility_score(bidder_score)

        all_scores.append(bidder_score)
        all_profiles.append(bidder_profile)
    return pd.DataFrame(all_scores), pd.DataFrame(all_profiles)

# Results

In [None]:
# Full run: all three bidders, all four utilities, test days 22..28.
# Disabled by default; flip the condition to execute it.
if False:
    scores, bid_profiles = evaluate_slices(
        df,
        bidders=['last_click', 'first_click', 'AA'],
        utilities=['last_click', 'first_click', 'AA_normed', 'AA_not_normed'],
        test_days=range(22,29),
        learning_duration=21,
        hash_space = 2**18,
        AA_bidder_label='all_clicks',
    )

In [None]:
# Quick debug run: two bidders, two utilities, a single test day (22),
# a 5-day training window and a small hash space.
if True:
    scores, bid_profiles = evaluate_slices(
        df,
        bidders=['last_click', 'AA'],
        utilities=['last_click', 'AA_normed'],
        test_days=range(22,23),
        learning_duration=5,
        hash_space = 2**13,
        AA_bidder_label='all_clicks',
    )

In [None]:
# Render the per-bidder score table returned by evaluate_slices.
scores