In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import os

from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import keras

import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')

In [None]:
from google.colab import files
uploaded = files.upload()

Saving criteo_attribution_dataset.zip to criteo_attribution_dataset.zip


In [None]:
import zipfile
import io
zf = zipfile.ZipFile(io.BytesIO(uploaded['criteo_attribution_dataset.zip']), "r")
zf.extractall()

In [2]:
#DATA_FILE='criteo_attribution_dataset.tsv.gz'
df_Criteo_Attribution = pd.read_csv('/content/criteo_attribution_dataset.tsv.gz', sep='\t')
df_Criteo_Attribution.shape

(16468027, 22)

In [3]:
dfCriteo_Attribution= df_Criteo_Attribution.sample(frac=0.25)

In [4]:
dfCriteo_Attribution.shape

(4117007, 22)

In [5]:
dfCriteo_Attribution['day'] = np.floor(dfCriteo_Attribution.timestamp / 86400.).astype(int)

In [6]:
# Initial data preparation

def add_derived_columns(df):
    df_ext = df.copy()
    df_ext['jid'] = df_ext['uid'].map(str) + '_' + df_ext['conversion_id'].map(str)
    
    min_max_scaler = MinMaxScaler()
    for cname in ('timestamp', 'time_since_last_click'):
        x = df_ext[cname].values.reshape(-1, 1) 
        df_ext[cname + '_norm'] = min_max_scaler.fit_transform(x)
    
    return df_ext

def filter_journeys_by_length(df, min_touchpoints):
    if min_touchpoints <= 1:
        return df
    else:
        grouped = df.groupby(['jid'])['uid'].count().reset_index(name="count")
        return df[df['jid'].isin( grouped[grouped['count'] >= min_touchpoints]['jid'].values )]

def sample_campaigns(df, n_campaigns):    
    campaigns = np.random.choice( df['campaign'].unique(), n_campaigns, replace = False )
    return df[ df['campaign'].isin(campaigns) ]

def balance_conversions(df):
    df_minority = df[df.conversion == 1]
    df_majority = df[df.conversion == 0]
    
    df_majority_jids = np.array_split(df_majority['jid'].unique(), 100 * df_majority.shape[0]/df_minority.shape[0] )
    
    df_majority_sampled = pd.DataFrame(data=None, columns=df.columns)
    for jid_chunk in df_majority_jids:
        df_majority_sampled = pd.concat([df_majority_sampled, df_majority[df_majority.jid.isin(jid_chunk)]])
        if df_majority_sampled.shape[0] > df_minority.shape[0]:
            break
    
    return pd.concat([df_majority_sampled, df_minority]).sample(frac=1).reset_index(drop=True)

def map_one_hot(df, column_names, result_column_name):
    mapper = {} 
    for i, col_name in enumerate(column_names):
        for val in df[col_name].unique():
            mapper[str(val) + str(i)] = len(mapper)
         
    df_ext = df.copy()
    
    def one_hot(values):
        v = np.zeros( len(mapper) )
        for i, val in enumerate(values): 
            v[ mapper[str(val) + str(i)] ] = 1
        return v    
    
    df_ext[result_column_name] = df_ext[column_names].values.tolist()
    df_ext[result_column_name] = df_ext[result_column_name].map(one_hot)
    
    return df_ext

In [7]:
n_campaigns = 400

df1 = add_derived_columns(dfCriteo_Attribution)
df2 = sample_campaigns(df1, n_campaigns)
df3 = filter_journeys_by_length(df2, 2)
df4 = balance_conversions(df3)
df5 = map_one_hot(df4, ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat8'], 'cats')
df6 = map_one_hot(df5, ['campaign'], 'campaigns').sort_values(by=['timestamp_norm'])

print(df6.shape[0])
print([df6[df6.conversion == 0].shape[0], df6[df6.conversion == 1].shape[0]])

101547
[51028, 50519]


**LAST TOUCH** **ATRRIBUTE** **MODEL**

Last-touch attribution assigns 100% of the credit to the last marketing touchpoint. This model would give all the credit to the striker (Player E in the image below). You could argue that the last touch is all that matters because it resulted in the actual conversion, but it doesn’t tell the whole story. Your customers are likely engaging with your brand across multiple touchpoints on various channels before they convert.

In [8]:
def last_touch_attribution(df):
    
    def count_by_campaign(df):
        counters = np.zeros(n_campaigns)
        for campaign_one_hot in df['campaigns'].values:
            campaign_id = np.argmax(campaign_one_hot)
            counters[campaign_id] = counters[campaign_id] + 1
        return counters
        
    campaign_impressions = count_by_campaign(df)
    
    df_converted = df[df['conversion'] == 1]
    idx = df_converted.groupby(['jid'])['timestamp_norm'].transform(max) == df_converted['timestamp_norm']
    campaign_conversions = count_by_campaign(df_converted[idx])
        
    return campaign_conversions / campaign_impressions
    
lta = last_touch_attribution(df6)



In [9]:
import plotly.express as px
campaign_idx = range(150, 250)
fig = px.bar(lta, x=range(len(lta[campaign_idx])), y=lta[campaign_idx],title='LTA',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**FIRST** **TOUCH** **ATTRIBUTION** **MODEL**

With a first-touch attribution model, your first marketing interaction receives 100% of the credit.

If you’re mainly focused on widening top of your funnel, this is a useful model. It highlights the channels that first introduced a customer to your brand.

The problem? It ignores all subsequent touches, and therefore provides no insight into your down-funnel metrics. A Facebook ad may send you a lot of website traffic, but probably isn’t the sole influencer in a conversion.

In [10]:
def first_touch_attribution(df):
    
    def count_by_campaign(df):
        counters = np.zeros(n_campaigns)
        for campaign_one_hot in df['campaigns'].values:
            campaign_id = np.argmax(campaign_one_hot)
            counters[campaign_id] = counters[campaign_id] + 1
        return counters
        
    campaign_impressions = count_by_campaign(df)
    
    df_converted = df[df['conversion'] == 1]
    idx = df_converted.groupby(['jid'])['timestamp_norm'].transform(min) == df_converted['timestamp_norm']
    campaign_conversions = count_by_campaign(df_converted[idx])
        
    return campaign_conversions / campaign_impressions
    
fta = first_touch_attribution(df6)

In [11]:
fig = px.bar(fta, x=range(len(fta[campaign_idx])), y=fta[campaign_idx],title='First-Touch Attribution',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**Logistic** **Regression** **Attribute** **Model**

In [12]:
def features_for_logistic_regression(df):

    def pairwise_max(series):
        return np.max(series.tolist(), axis = 0).tolist()
    
    aggregation = {
        'campaigns': pairwise_max,
        'cats': pairwise_max,
        'click': 'sum',
        'cost': 'sum',
        'conversion': 'max'
    }
    
    df_agg = df.groupby(['jid']).agg(aggregation)
    
    df_agg['features'] = df_agg[['campaigns', 'cats', 'click', 'cost']].values.tolist()
    
    return (
        np.stack(df_agg['features'].map(lambda x: np.hstack(x)).values),
        df_agg['conversion'].values
    )

In [13]:
x, y = features_for_logistic_regression(df6)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.20, random_state = 1)

In [15]:
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
score = logisticRegr.score(x_test, y_test)
print(score)

0.832821143208359


In [16]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.constraints import NonNeg
m = np.shape(x)[1]
    
model = Sequential()  
model.add(Dense(1, input_dim=m, activation='sigmoid', name = 'contributions')) 

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) 
history = model.fit(x_train, y_train, batch_size=128, epochs=10, verbose=1, validation_data=(x_val, y_val)) 
score = model.evaluate(x_test, y_test, verbose=0) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.46588951349258423
Test accuracy: 0.8094652891159058


In [17]:
# Visualization of the attribution scores
from sklearn.utils.extmath import softmax

keras_logreg = model.get_layer('contributions').get_weights()[0].flatten()[0:n_campaigns]
keras_logreg = softmax([keras_logreg]).flatten()

fig = px.bar(keras_logreg, x=range(len(keras_logreg[campaign_idx])), y=keras_logreg[campaign_idx],title='Logirthmic Attribution',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**LINEAR** **ATTRIBUTION** 

Linear attribution is a multi-touch attribution model which splits conversion credit equally across each touchpoint or interaction along a customers journey. Simply, this attribution model gives a participation award to every marketing channel a business used.


In [18]:
def Linear_attribution(df):
    
    def count_by_campaign(df):
        counters = np.zeros(n_campaigns)
        for campaign_one_hot in df['campaigns'].values:
            campaign_id = np.argmax(campaign_one_hot)
            counters[campaign_id] = counters[campaign_id] + 1
        return counters
        
    campaign_impressions = count_by_campaign(df)
    
    df_converted = df[df['conversion'] == 1]
    df_converted['linear'] = df_converted.conversion / (df_converted.click_nb).astype(float)
    
    def linear_attr_by_campaign(df_conv):
        counters = np.zeros(n_campaigns)
        for idx in range(len(df_conv)):
            campaign_id = np.argmax(df_conv.iloc[idx, 27])
            counters[campaign_id] = counters[campaign_id] + df_conv.iloc[idx, 28]
        
        return counters
    
    campaign_conversions = linear_attr_by_campaign(df_converted)
    
    return campaign_conversions / campaign_impressions
    
linear_a = Linear_attribution(df6)

In [19]:
fig = px.bar(linear_a, x=range(len(linear_a[campaign_idx])), y=linear_a[campaign_idx],title='Linear Attribution',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**U**-**SHAPED** **ATTRIBUTION** **MODEL**

U-Shaped Attribution is an attribution model which emphasizes and credits the first and last touchpoint a user encounters with more credit, than the touchpoints encountered in the middle of the customer journey. Specifically the first and last touch touchpoint encountered is given 40% of the conversion credit. The remaining 20% is distributed equally among all other touchpoints encountered in the journey.

In [20]:
def UShape_attribution(df):
    
    def count_by_campaign(df):
        counters = np.zeros(n_campaigns)
        for campaign_one_hot in df['campaigns'].values:
            campaign_id = np.argmax(campaign_one_hot)
            counters[campaign_id] = counters[campaign_id] + 1
        return counters
        
    campaign_impressions = count_by_campaign(df)
    
    df_converted = df[df['conversion'] == 1]
    
    def calc_attribution(click_pos,click_nb):
        default_att = 0.5
        extreme_touch_att = 0.4
        intermed_att = 0.2

        if(click_nb == 2):
            return default_att
        else:
            if(click_pos == click_nb - 1 or click_pos ==0):
                return extreme_touch_att
            else:
                return intermed_att/(click_nb -2)

    
    def UShape_Attr_Value(df_converted):
        df_converted['U_Shape'] = df_converted.apply(lambda val: calc_attribution(val.click_pos,val.click_nb),axis=1)
        return df_converted
        
    def UShape_by_campaign(df_converted):
        counters = np.zeros(n_campaigns)
        for idx in range(len(df_converted)):
            campaign_id = np.argmax(df_converted.iloc[idx, 27])
            counters[campaign_id] = counters[campaign_id] + df_converted.iloc[idx, 28]
        return counters
    
    
    #df_converted['U_Shape'] = df_converted.apply(lambda val: calc_attribution(val.click_pos,val.click_nb),axis=1)
    df_converted = UShape_Attr_Value(df_converted) 
    
    campaign_conversions = UShape_by_campaign(df_converted)   #count conversions by campaign using U-Shape model
        
    return campaign_conversions / campaign_impressions
    
UShape_attr = UShape_attribution(df6)

In [21]:
fig = px.bar(UShape_attr, x=range(len(UShape_attr[campaign_idx])), y=UShape_attr[campaign_idx],title='U-Shape Attribution',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**TIME** **DECAY** **ATTRIBUTION** **MODEL**

Time-decay attribution is a multi-touch attribution model that gives some credit to all the channels that led to your customer converting, with that amount of credit being less (decaying) the further back in time the channel was interacted with.

In [22]:
df6=df6
df6['conversion_day'] = np.floor(df6.conversion_timestamp / 86400.).astype(int)

In [23]:
df6.columns

Index(['timestamp', 'uid', 'campaign', 'conversion', 'conversion_timestamp',
       'conversion_id', 'attribution', 'click', 'click_pos', 'click_nb',
       'cost', 'cpo', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4',
       'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'day', 'jid', 'timestamp_norm',
       'time_since_last_click_norm', 'cats', 'campaigns', 'conversion_day'],
      dtype='object')

In [24]:
def TimeDecay_attribution(df):
    
    def count_by_campaign(df):
        counters = np.zeros(n_campaigns)
        #print('here 1')
        for campaign_one_hot in df['campaigns'].values:
            campaign_id = np.argmax(campaign_one_hot)
            counters[campaign_id] = counters[campaign_id] + 1
        return counters
    
        
    campaign_impressions = count_by_campaign(df)
    
    df_converted = df[df['conversion'] == 1]
    
    def calc_attribution(conversion_day,click_day):
        rel_pos =  conversion_day - click_day
        attribution = pow(2, -(rel_pos/7))    # using formulae 2^-x/7
        return attribution
    
    def TimeDecay_Attr_Value(df_converted):
        df_converted['TimeDecay'] = df_converted.apply(lambda val: calc_attribution(val.conversion_day,val.day),axis=1)
        return df_converted
        
    def Normalised_TimeDecay(journey_id, Time_Decay):
        attr_sum = df_TDecay_Sum[df_TDecay_Sum.jid == journey_id]['TDsum']
        return Time_Decay / attr_sum       #normalising the attributions by dividing each with the sum of attributions
            
    def TimeDecay_by_campaign(df_converted):
        counters = np.zeros(n_campaigns)
        for idx in range(len(df_converted)):
            campaign_id = np.argmax(df_converted.iloc[idx, 27])
            counters[campaign_id] = counters[campaign_id] + df_converted.iloc[idx, 29]
        return counters
        
        
    
    df_converted = TimeDecay_Attr_Value(df_converted) 
    
    #df_TDecay_Sum = df_converted.groupby(['jid'])['TimeDecay'].sum().reset_index(name="TDsum")
    #df_converted = df_converted.merge(df_TDecay_Sum, on='jid', how='left')
    #df_converted['TimeDecay_Norm'] = df_converted.TimeDecay / df_converted.TDsum
    #df_converted.apply(lambda val: Normalised_TimeDecay(val.jid,val.TimeDecay),axis=1)
    
    campaign_conversions = TimeDecay_by_campaign(df_converted)   #count conversions by campaign using U-Shape model
        
    return campaign_conversions / campaign_impressions
    
TimeDecay_attr = TimeDecay_attribution(df6)

In [25]:
TimeDecay_attr

array([0.1698217 , 0.40584832, 0.46622564, 0.30058069, 0.30092397,
       0.38171802, 0.0595881 , 0.06360239, 0.1290632 , 0.55499364,
       0.24585282, 0.28814621, 0.37729752, 0.24449768, 0.25081371,
       0.26925637, 0.15589014, 0.17481309, 0.25442802, 0.07733091,
       0.17652504, 0.24442659, 0.14887188, 0.14717132, 0.24191776,
       0.19254759, 0.48978377, 0.25682579, 0.43250788, 0.25948367,
       0.09716268, 0.41863876, 0.39047025, 0.48327481, 0.1225202 ,
       0.28766778, 0.23023439, 0.13825268, 0.37125244, 0.39123621,
       0.16713189, 0.1271003 , 0.07838612, 0.44169673, 0.26896428,
       0.23907775, 0.36612177, 0.18582581, 0.18629067, 0.25985637,
       0.22085787, 0.24743445, 0.17623187, 0.20943168, 0.13874871,
       0.25842736, 0.53760447, 0.14587749, 0.38696534, 0.59330318,
       0.13383666, 0.03954741, 0.20799293, 0.20203997, 0.3188792 ,
       0.1206504 , 0.21818922, 0.30596049, 0.39475407, 0.20668363,
       0.31364796, 0.57189765, 0.27178111, 0.23882418, 0.13225

In [26]:

fig = px.bar(TimeDecay_attr, x=range(len(TimeDecay_attr[campaign_idx])), y=TimeDecay_attr[campaign_idx],title='Time-Decay Attribution',labels={'x':'Campaign ID','y':'Return per impression'})
fig.show()

**Saving attributes as .txt file**

In [27]:
np.savetxt("lta.txt", lta)
np.savetxt("Linear.txt", linear_a)
np.savetxt("UShape.txt", UShape_attr)
np.savetxt("TimeDecay.txt", TimeDecay_attr)
np.savetxt("keras_logreg.txt", keras_logreg)
np.savetxt("fta.txt", fta)


**Budget Optimization & Return On Investment**

In [28]:
def get_campaign_id(x_journey_step):
    return np.argmax(x_journey_step[0:n_campaigns])

In [29]:
# Key assumption: If one of the campaigns in a journey runs out of budget, 
# then the conversion reward is fully lost for the entire journey
# including both past and future campaigns

def simulate_budget_roi(df, budget_total, attribution, verbose=False):
    budgets = np.ceil(attribution * (budget_total / np.sum(attribution)))
    
    if(verbose):
        print(budgets)
    
    blacklist = set()
    conversions = set()
    for i in range(df.shape[0]):
        campaign_id = get_campaign_id(df.loc[i]['campaigns']) 
        jid = df.loc[i]['jid']
        if jid not in blacklist:
            if budgets[campaign_id] >= 1:
                budgets[campaign_id] = budgets[campaign_id] - 1
                if(df.loc[i]['conversion'] == 1):
                    conversions.add(jid)
            else:
                blacklist.add(jid)
        
        if(verbose):
            if(i % 10000 == 0):
                print('{:.2%} : {:.2%} budget spent'.format(i/df.shape[0], 1.0 - np.sum(budgets)/budget_total ))
        
        if(np.sum(budgets) < budget_total * 0.02):
            break
            
    return len(conversions.difference(blacklist))

In [31]:
pitches = [0, 0.5, 1.5]
attributions = [lta,fta,linear_a, UShape_attr, TimeDecay_attr, keras_logreg]

for i, pitch in enumerate(pitches):
    print('Pitch of Budget Allocation - ' + str(pitch))
    for j, attribution in enumerate(attributions):
        reward = simulate_budget_roi(df6, 10000, attribution**pitch)
        print('{} {} : {}'.format(i, j, reward))

Pitch of Budget Allocation - 0
0 0 : 461
0 1 : 461
0 2 : 461
0 3 : 461
0 4 : 461
0 5 : 461
Pitch of Budget Allocation - 0.5
1 0 : 0
1 1 : 0
1 2 : 0
1 3 : 0
1 4 : 0
1 5 : 488
Pitch of Budget Allocation - 1.5
2 0 : 0
2 1 : 0
2 2 : 0
2 3 : 0
2 4 : 0
2 5 : 530
