Import libraries

In [217]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import matplotlib.pyplot as plt
from collections import defaultdict
import random  
from random import sample 

#import squarify
import scipy
sns.set_style('darkgrid')
sns.set_palette('mako')

pd.options.display.max_rows = 4000
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from datetime import timedelta

**Import Data Set**

For this updated walkthrough I have updated the data set to be more aligned with the data formats we’d likely encounter in an actual production environment. The data set can be downloaded here.
Each row of the data set describes a single visit by a single customer and contains the following information:

    Cookie: Randomly generated customer id enabling us to tie subsequent visits back to the same customer
    Timestamp: Date and time when the visit took place
    Interaction: Categorical variable indicating the type of interaction that took place
    Conversion: Boolean variable indicating whether a conversion took place
    Conversion Value: Value of the potential conversion event
    Channel: The marketing channel that brought the customer to our site
        


In [218]:
df = pd.read_csv('fictional channel attribution data.csv')

In [219]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,335290,F7ffEk7h3non7iDh9BkCiDCB3,2018-07-03T16:34:30Z,impression,0,0.0,Google
1,243753,Cfk3B9k07Bonhn0AD7A79B7iE,2018-07-21T07:58:35Z,impression,0,0.0,Facebook
2,79455,793BEiFfCE0hF03DonnoEBCkB,2018-07-24T12:54:46Z,impression,0,0.0,Facebook
3,164893,AEi3FiEiADB7An3EkiBC3BDif,2018-07-05T08:54:43Z,impression,0,0.0,Google
4,115293,97hhf337DEBhfoA7ii7kE9C7D,2018-07-29T20:21:01Z,impression,0,0.0,Facebook
5,360312,FkEfDfnAkC39nf79oFAo900ii,2018-07-27T12:52:41Z,impression,0,0.0,Facebook
6,419843,hD93BhDoho09oED0Bk3kfnD3C,2018-07-23T15:11:36Z,impression,0,0.0,Google
7,392137,fh9ohCB99hB0EBffh9703DEni,2018-07-04T10:42:33Z,impression,0,0.0,Facebook
8,213568,BkFnFn7F733Do30fBiFFk3DFn,2018-07-20T15:08:58Z,impression,0,0.0,Facebook
9,395921,finBof39FhDoF77CfChEDBCof,2018-07-09T18:01:45Z,impression,0,0.0,Facebook


In [213]:
df.groupby('channel', as_index=False).agg({"conversion": "sum"})

Unnamed: 0,channel,conversion
0,Facebook,5411
1,Google,3274
2,Instagram,2476
3,LinkedIn,1528


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423000 entries, 0 to 422999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        423000 non-null  int64  
 1   cookie            423000 non-null  object 
 2   time              423000 non-null  object 
 3   interaction       423000 non-null  object 
 4   conversion        423000 non-null  int64  
 5   conversion_value  423000 non-null  float64
 6   channel           423000 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 22.6+ MB


In [72]:
# Headline numbers for the raw touchpoint table.
# `nunique()` counts distinct non-null values, which is what
# `value_counts().count()` computed.
print("Number of touchpoints in the dataset :", df['cookie'].count(), 'touchpoints')
print("Number of customers :", df['cookie'].nunique(), 'customers')
print("Number of conversions  :", df['conversion'].sum(), 'conversions')
print("Number of channels  :", df['channel'].nunique(), 'channels')

Number of touchpoints in the dataset : 423000 touchpoints
Number of customers : 187697 customers
Number of conversions  : 12689 conversions
Number of channels  : 4 channels


**Data Preprocessing**

To get our data into a format that’s ideal for applying the Markov Models algorithm, we’ll need to do a bit of preprocessing.
We’ll start by building, for each user, the ordered list of channels they interacted with (their path), which we will then extend with the outcome of the journey:

In [173]:
#data prep

# Order rows by cookie (descending) and, within each cookie, by time (ascending),
# so each customer's touchpoints are contiguous and in visit order.
# NOTE(review): `time` is an object (string) column per df.info(), so this sort is
# lexicographic. That matches chronological order only while every value uses the
# same ISO-8601 layout seen in df.head() — confirm, or parse with pd.to_datetime.
df = df.sort_values(['cookie', 'time'],
                    ascending=[False, True])

# 1-based position of each row within its cookie's sequence of visits.
df['visit_order'] = df.groupby('cookie').cumcount() + 1

In [174]:
# Build one channel list per cookie. `unique()` keeps channels in order of first
# appearance and drops repeats, so a journey Facebook -> Google -> Facebook is
# stored as [Facebook, Google].
df_paths = df.groupby('cookie')['channel'].aggregate(
    lambda x: x.unique().tolist()).reset_index()
    


In [175]:
# Take the conversion flag from each cookie's last row in `df` (the last visit,
# given the earlier sort by time) and attach it to the per-cookie path table.
# NOTE(review): a conversion recorded on any earlier row of a cookie is ignored
# here. The outputs in this notebook show 12689 conversions in the raw data but
# only 12634 once paths are built — confirm this loss is intended.
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]
df_paths = pd.merge(df_paths, df_last_interaction, how='left', on='cookie')

In [176]:
df_paths

Unnamed: 0,cookie,channel,conversion
0,00000FkCnDfDDf0iC97iC703B,"[Facebook, LinkedIn]",0
1,0000nACkD9nFkBBDECD3ki00E,[Google],0
2,0003EfE37E93D0BC03iBhBBhF,[Google],0
3,00073CFE3FoFCn70fBhB3kfon,[Facebook],0
4,00079hhBkDF3k3kDkiFi9EFAD,[Google],0
...,...,...,...
187692,kfFk9B3hkAiiC7D33BA3DnEDF,[Instagram],1
187693,kfFk9k90nh9DfAfCinF39CEhE,[Google],1
187694,kfFki3F7hDkhfoDA3A9h7hh33,"[Facebook, Google]",0
187695,kfFnDiEo00D77EE33oFCEA0Co,[Facebook],0


In [177]:
def f(c, p):
    """Wrap a channel list with the 'Start' marker and its terminal state.

    Parameters
    ----------
    c : conversion flag; ``0`` yields the 'Null' terminal state, any other
        value yields 'Conversion'.
    p : list of channel names, in visit order.

    Returns
    -------
    list
        ``['Start', <channels...>, 'Null' | 'Conversion']``
    """
    terminal_state = 'Null' if c == 0 else 'Conversion'
    return ['Start'] + p + [terminal_state]
     
# Build the full path for every cookie: Start -> visited channels -> Null/Conversion.
df_paths['path'] = df_paths.apply(lambda x: f(x['conversion'], x['channel']), axis = 1)
df_paths.head()

Unnamed: 0,cookie,channel,conversion,path
0,00000FkCnDfDDf0iC97iC703B,"[Facebook, LinkedIn]",0,"[Start, Facebook, LinkedIn, Null]"
1,0000nACkD9nFkBBDECD3ki00E,[Google],0,"[Start, Google, Null]"
2,0003EfE37E93D0BC03iBhBBhF,[Google],0,"[Start, Google, Null]"
3,00073CFE3FoFCn70fBhB3kfon,[Facebook],0,"[Start, Facebook, Null]"
4,00079hhBkDF3k3kDkiFi9EFAD,[Google],0,"[Start, Google, Null]"


**Markov Chains**

We can now move onto the actual Markov Chain method.
The algorithm for Markov Chains can be summarized in 2 steps:

1. Calculate the transition probabilities between all channels (states) in our state-space

2. Calculate the removal effects 

We’ll start by:

Defining a list of all user journeys, the total number of conversions and the base-level conversion rate.

In [198]:
# One path (list of states) per cookie; reused by the functions below.
list_of_paths = df_paths['path']

# Count the 'Conversion' markers across all paths.
total_conversions = sum(path.count('Conversion') for path in list_of_paths)
print('the total conversions are:', total_conversions)

# Share of all user journeys that contain a conversion.
base_conversion_rate = total_conversions / len(list_of_paths)
print('the base conversion rate is :', round(base_conversion_rate * 100, 2) , '%')

the total conversions are: 12634
the base conversation rate is : 6.73 %


Next, we’ll define a function that identifies all potential state transitions and outputs a dictionary containing these. We’ll use this as an input when calculating transition probabilities:

In [199]:
def transition_states(list_of_paths):
    """Count every observed state-to-state transition across all user paths.

    Parameters
    ----------
    list_of_paths : iterable of list of str
        One path per user, e.g. ``['Start', 'Facebook', 'Null']``.

    Returns
    -------
    dict
        Maps ``'origin>destination'`` to the number of times that step occurs.
        Every ordered pair of states seen in the paths is present as a key;
        pairs that never occur have a count of 0. Steps leaving the absorbing
        states 'Conversion' and 'Null' are not counted.
    """
    absorbing_states = ('Conversion', 'Null')
    unique_states = set(x for element in list_of_paths for x in element)
    transition_counts = {x + '>' + y: 0 for x in unique_states for y in unique_states}

    # Walk each path once, pairing every state with its successor. States are
    # compared by exact name (a substring test would let 'Google' also match
    # 'Google Ads' and double-count), and zip() stops at the last element, so a
    # path that ends on a channel cannot index past its end.
    for user_path in list_of_paths:
        for origin, destination in zip(user_path, user_path[1:]):
            if origin not in absorbing_states:
                transition_counts[origin + '>' + destination] += 1

    return transition_counts


trans_states = transition_states(list_of_paths)


Once we have the transition counts for every pair of states, the next goal is to turn them into transition probabilities.


In [203]:
def transition_prob(trans_dict):
    """Convert transition counts into per-origin transition probabilities.

    Parameters
    ----------
    trans_dict : dict
        Maps ``'origin>destination'`` to a count, as produced by
        ``transition_states``.

    Returns
    -------
    collections.defaultdict
        Maps ``'origin>destination'`` to
        ``count / (sum of counts leaving origin)``. Transitions with a zero
        count, and transitions leaving the absorbing states 'Conversion' and
        'Null', are omitted. (The ``defaultdict(dict)`` container type is kept
        from the original implementation for compatibility.)
    """
    absorbing_states = ('Conversion', 'Null')

    # First pass: total outgoing count per origin. The origin is parsed from the
    # key itself, so the function depends only on its argument and a state whose
    # name is a suffix of another ('Search' vs 'Paid Search') cannot be mixed up.
    outgoing_totals = defaultdict(int)
    for key, count in trans_dict.items():
        if count > 0:
            outgoing_totals[key.split('>')[0]] += count

    # Second pass: normalise each positive count by its origin's total.
    probabilities = defaultdict(dict)
    for key, count in trans_dict.items():
        origin = key.split('>')[0]
        if origin in absorbing_states or count <= 0:
            continue
        probabilities[key] = float(count) / float(outgoing_totals[origin])

    return probabilities


The above should leave us with a dictionary containing all transitions and their respective transition probabilities given our historical data.

In [225]:
# Compute the probabilities from the counted transitions so that `trans_prob`
# exists on a fresh kernel; it is displayed here and passed to
# transition_matrix() further down.
trans_prob = transition_prob(trans_states)
trans_prob

defaultdict(dict,
            {'Start>Facebook': 0.39663393661060115,
             'Start>LinkedIn': 0.1422025924761717,
             'Start>Instagram': 0.14486646030570546,
             'Start>Google': 0.3162970106075217,
             'Facebook>LinkedIn': 0.027351196466841116,
             'Facebook>Null': 0.8249523426122168,
             'Facebook>Instagram': 0.029695584735427497,
             'Facebook>Conversion': 0.06217365049670246,
             'Facebook>Google': 0.05582722568881206,
             'LinkedIn>Facebook': 0.0666687082746371,
             'LinkedIn>Null': 0.7817725240399338,
             'LinkedIn>Instagram': 0.016230783364978258,
             'LinkedIn>Conversion': 0.048049243584246955,
             'LinkedIn>Google': 0.08727874073620383,
             'Instagram>Facebook': 0.07284706467352457,
             'Instagram>LinkedIn': 0.018429312863225286,
             'Instagram>Null': 0.7909065481555148,
             'Instagram>Conversion': 0.07300245516984182,
          


The final step is then to identify removal effects for each of our marketing channels. To do this we’ll make use of linear algebra and matrix manipulations, therefore let’s turn our above transition probabilities dictionary into a data frame:

In [204]:
def transition_matrix(list_of_paths, transition_probabilities):
    """Arrange transition probabilities into a square state-by-state DataFrame.

    Parameters
    ----------
    list_of_paths : iterable of list of str
        User paths; the distinct states found in them define the rows and
        columns of the matrix.
    transition_probabilities : dict
        Maps ``'origin>destination'`` to a probability.

    Returns
    -------
    pandas.DataFrame
        Rows are origin states, columns are destination states. The absorbing
        states 'Conversion' and 'Null' transition to themselves with
        probability 1.0; every cell not covered by
        ``transition_probabilities`` is 0.0.
    """
    absorbing_states = ('Conversion', 'Null')

    # dict.fromkeys() de-duplicates while keeping first-seen order, so the matrix
    # layout is the same on every run (set iteration order is not).
    states = list(dict.fromkeys(x for element in list_of_paths for x in element))

    # Allocate the full zero matrix once instead of growing it row by row.
    trans_matrix = pd.DataFrame(0.0, index=states, columns=states)

    # Single-step .at writes; chained `df.loc[a][b] = v` may write to a temporary.
    for state in states:
        if state in absorbing_states:
            trans_matrix.at[state, state] = 1.0

    for key, value in transition_probabilities.items():
        origin, destination = key.split('>')
        trans_matrix.at[origin, destination] = value

    return trans_matrix


trans_matrix = transition_matrix(list_of_paths, trans_prob)

In [205]:
trans_matrix

Unnamed: 0,Start,Facebook,LinkedIn,Null,Instagram,Conversion,Google
Start,0.0,0.39663,0.1422,0.0,0.14487,0.0,0.3163
Facebook,0.0,0.0,0.02735,0.82495,0.0297,0.06217,0.05583
LinkedIn,0.0,0.06667,0.0,0.78177,0.01623,0.04805,0.08728
Null,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Instagram,0.0,0.07285,0.01843,0.79091,0.0,0.073,0.04481
Conversion,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Google,0.0,0.08028,0.04475,0.7958,0.02849,0.05068,0.0


The transition matrix we just produced actually holds a great deal of valuable information on its own. Given today’s typical multi-touch conversion journeys this information can prove to be extremely valuable and allows us to optimize our multi-channel customer journeys for conversion.

However we are still not done

In [206]:

def removal_effects(df, conversion_rate):
    """Compute the removal effect of every channel in a transition matrix.

    For each channel, the channel's row and column are dropped, the probability
    that used to flow into it is redirected to 'Null', and the probability of
    reaching 'Conversion' from 'Start' is recomputed for the reduced chain.

    Parameters
    ----------
    df : pandas.DataFrame
        Square transition matrix (rows = origin, columns = destination) that
        contains the states 'Start', 'Null' and 'Conversion'.
    conversion_rate : float
        Baseline conversion rate the reduced chain is compared against.

    Returns
    -------
    dict
        Maps each channel to ``1 - removal_cvr / conversion_rate``.
    """
    absorbing_states = ['Null', 'Conversion']
    channels = [channel for channel in df.columns
                if channel not in ['Start'] + absorbing_states]

    removal_effects_dict = {}
    for channel in channels:
        # drop() returns a new frame, so the caller's matrix is left untouched.
        removal_df = df.drop(channel, axis=1).drop(channel, axis=0)

        # Whatever probability used to go to the removed channel is now lost;
        # add it to 'Null' so that every row sums to 1 again.
        lost_mass = 1.0 - removal_df.sum(axis=1)
        removal_df['Null'] = removal_df['Null'] + lost_mass

        # Split the reduced chain into transient->absorbing (R) and
        # transient->transient (Q) parts, with Q's columns ordered like its rows.
        transient_rows = removal_df.drop(absorbing_states, axis=0)
        removal_to_conv = transient_rows[absorbing_states]
        removal_to_non_conv = transient_rows.drop(absorbing_states, axis=1)
        removal_to_non_conv = removal_to_non_conv.loc[:, removal_to_non_conv.index]

        # Absorption probabilities B solve (I - Q) B = R; solving the system
        # avoids forming the explicit inverse.
        q_matrix = removal_to_non_conv.to_numpy(dtype=float)
        r_matrix = removal_to_conv.to_numpy(dtype=float)
        absorption = pd.DataFrame(
            np.linalg.solve(np.identity(len(q_matrix)) - q_matrix, r_matrix),
            index=removal_to_non_conv.index,
            columns=absorbing_states)

        # Look the result up by label rather than by column position.
        removal_cvr = absorption.at['Start', 'Conversion']
        removal_effects_dict[channel] = 1 - removal_cvr / conversion_rate

    return removal_effects_dict


removal_effects_dict = removal_effects(trans_matrix, base_conversion_rate)

We can now iteratively go through each of our channels and assess the impact it would have on overall conversion if we were to remove a channel from our state-space. We’ll do this and add the resulting removal effects to an output dictionary.
The resulting removal effects dictionary can then be used to calculate the Markov Chain attributions for each of our marketing channels

In [207]:
def markov_chain_allocations(removal_effects, total_conversions):
    """Distribute the total conversions across channels by removal effect.

    Parameters
    ----------
    removal_effects : dict
        Maps channel name to its removal effect.
    total_conversions : number
        Number of conversions to distribute.

    Returns
    -------
    dict
        Maps each channel to its removal effect divided by the sum of all
        removal effects, multiplied by ``total_conversions``.
    """
    effect_total = np.sum(list(removal_effects.values()))

    allocations = {}
    for channel, effect in removal_effects.items():
        share = effect / effect_total
        allocations[channel] = share * total_conversions
    return allocations


attributions = markov_chain_allocations(removal_effects_dict, total_conversions)

In [227]:

pd.DataFrame(attributions.items(), columns=['Channel','Number of conversions'])

Unnamed: 0,Channel,Number of conversions
0,Facebook,5070
1,LinkedIn,1675
2,Instagram,2283
3,Google,3606
