In [2]:
#Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

#Classifier libraries
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

In [3]:
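#Read in dataset. NOTE(review): no loading code is present in this cell; `population` must be defined here before the cells below can run.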

In [None]:
# Order the event log by user id. The column label must be the string 'id'
# (the bare name `id` is Python's builtin function, not a column).
# A stable sort keeps each user's rows in the order they already have in
# `population`, so the event lists built below preserve that order.
# NOTE(review): assumes `population` rows are already chronological per user — confirm.
data_sorted = population.sort_values(by='id', kind='stable')

# Collect each user's events into one list, indexed by id.
# 'final_event_name' is used because later cells read
# grouped['final_event_name'] and population['final_event_name'].
grouped = data_sorted.groupby('id')['final_event_name'].apply(list)

In [None]:
# Promote the per-user Series of event lists to a DataFrame so that
# further columns can be attached to it
grouped = pd.DataFrame(grouped)

# One row per distinct (id, conversion) pair taken from the sorted event log;
# the first occurrence of each pair is the one kept
id_and_flag = data_sorted[['id', 'conversion']]
conversion = id_and_flag.drop_duplicates(keep='first')

# Left-join the conversion flag onto the per-user event lists, keyed on id
grouped = grouped.merge(conversion, how='left', on='id')

In [None]:
# Build each user's journey directly as a list of states:
# 'Start', then the user's events in order, then a terminal state —
# 'Null' when conversion == 0, otherwise 'Conversion'.
# Assembling the list directly (instead of joining on ', ' and splitting the
# string again) keeps an event name that itself contains ', ' as one state.
paths = [
    ['Start', *events, 'Null' if converted == 0 else 'Conversion']
    for events, converted in zip(grouped['final_event_name'], grouped['conversion'])
]

# Wrap in a Series aligned to grouped's index so each row receives its own list
grouped['path'] = pd.Series(paths, index=grouped.index, dtype=object)

In [None]:
# Series holding one list of states per row of `grouped`
list_of_paths = grouped['path']

# Number of 'Conversion' entries found across all paths
conversions_per_path = map(lambda steps: steps.count('Conversion'), grouped['path'].tolist())
total_conversions = sum(conversions_per_path)

# Conversions divided by the number of paths
path_total = len(list_of_paths)
base_conversion_rate = total_conversions / path_total

In [None]:
#initialize dictionary: transition_counts[current_state][next_state] -> number of observed moves
transition_counts = defaultdict(lambda: defaultdict(int))

#count every pair of adjacent states in every path
for sequence in grouped.path:
    # zip(sequence, sequence[1:]) yields the len(sequence) - 1 consecutive
    # (current_state, next_state) pairs, so the last state is never a source
    for current_state, next_state in zip(sequence, sequence[1:]):
        transition_counts[current_state][next_state] += 1

#print once, after all paths have been processed
print(dict(transition_counts))

In [None]:
# Plain nested dict copy of the counts: transition_matrix_counts[state][next_state] -> count
transition_matrix_counts = defaultdict(dict)

#Get counts for each state conversion
for state, transitions in transition_counts.items():
    for next_state, count in transitions.items():
        transition_matrix_counts[state][next_state] = count
print(dict(transition_matrix_counts))

In [None]:
# All states of the chain: the synthetic 'Start', every observed event name, and
# the two terminal states. Listing the special states explicitly guarantees the
# 'Start' row and the 'Conversion' / 'Null' columns exist even if a transition
# into them never occurs. dict.fromkeys removes duplicates while keeping order.
event_states = sorted(population['final_event_name'].unique())
states = list(dict.fromkeys(['Start', *event_states, 'Conversion', 'Null']))

# Square table of zeros: rows are the current state, columns the next state
transition_df_counts = pd.DataFrame(0.0, index=states, columns=states)

# Fill in the observed transition counts
for state, transitions in transition_matrix_counts.items():
    for next_state, count in transitions.items():
        transition_df_counts.loc[state, next_state] = count

# A label absent from `states` is added by the .loc assignment above and leaves
# NaN in the other cells of the new row/column; replace those with 0
transition_df_counts = transition_df_counts.fillna(0)

In [None]:
# Row totals must be taken over the state columns only. Excluding the derived
# columns makes this cell safe to re-run: otherwise a second run would add the
# previous 'Total' and 'No Conversion' values into the new 'Total'.
derived_columns = ['Total', 'No Conversion']
state_counts = transition_df_counts.drop(columns=derived_columns, errors='ignore')

# Total transitions out of each state
transition_df_counts['Total'] = state_counts.sum(axis=1)
# Transitions out of each state that did not go to 'Conversion'
transition_df_counts['No Conversion'] = transition_df_counts['Total'] - transition_df_counts['Conversion']

In [None]:
# transition_matrix[state][next_state] -> share of the moves out of `state`
# that went to `next_state`
transition_matrix = defaultdict(dict)

for state, transitions in transition_counts.items():
    # Denominator: every move observed out of this state
    total_transitions = sum(transitions.values())
    for next_state, count in transitions.items():
        probability = count / total_transitions
        transition_matrix[state][next_state] = probability

print(dict(transition_matrix))

In [None]:
# All states of the chain: the synthetic 'Start', every observed event name, and
# the two terminal states. Listing the special states explicitly guarantees the
# 'Start' row and the 'Conversion' / 'Null' columns exist even if a transition
# into them never occurs. dict.fromkeys removes duplicates while keeping order.
event_states = sorted(population['final_event_name'].unique())
states = list(dict.fromkeys(['Start', *event_states, 'Conversion', 'Null']))

# Square table of zeros: rows are the current state, columns the next state
transition_df = pd.DataFrame(0.0, index=states, columns=states)

# Fill in the transition probabilities, rounded to 3 decimals
for state, transitions in transition_matrix.items():
    for next_state, prob in transitions.items():
        transition_df.loc[state, next_state] = round(prob, 3)

In [None]:
# Replace missing cells of the probability table with 0 (modifies transition_df itself)
transition_df.fillna(value=0, inplace=True)

# Annotated green heatmap of the transition probabilities on a 20x10 inch figure
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(transition_df, annot=True, cmap="Greens", ax=ax)

In [None]:
# Annotated green heatmap of the transition counts on a 20x10 inch figure;
# fmt=".0f" prints each cell without decimals
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(transition_df_counts, annot=True, fmt=".0f", cmap="Greens", ax=ax)

In [None]:
#Removal effect: conversion rate recomputed from the count table with one
#event's row and column dropped
removal_effects_dict = {}

# States that are never removed; they get no entry in the result
protected_states = ('Start', 'Null', 'Conversion')

for event in transition_df_counts.index:
    if event in protected_states:
        continue
    # Count table without the removed event as either source or target
    new_df = transition_df_counts.drop(index=event, columns=event)
    # Remaining transitions into 'Conversion' divided by the 'Total' stored for
    # 'Start' (the 'Total' column is read as-is; it is not recomputed after the drop)
    new_conversion = new_df['Conversion'].sum() / new_df.loc['Start', 'Total']
    # Record inside the loop body for removable events only, so a protected
    # state can never pick up the value computed for the previous event
    removal_effects_dict[event] = new_conversion

In [None]:
# Two-column frame: column 0 holds the removed stage, column 1 the value stored for it
removal_df = pd.DataFrame(list(removal_effects_dict.items()), columns=[0, 1])
# Keep the 20 rows with the smallest values in column 1
removal_df_filtered = removal_df.sort_values(1, ascending=True).head(20)

# Bar chart of the value per removed stage
removal_df_filtered.plot.bar(x=0, y=1, figsize=(20, 5),
                             title="Conversion Rate based on Dropout Stage",
                             ylabel="Conversion Rate",
                             xlabel="Stage Removed from Model")

In [None]:
# Columns of transition_df that count as dropping out
dropout_state = ['Null']

# For every row label of transition_df, the sum of its values in the dropout columns
dropout_counts = {}
for state in transition_df.index:
    dropout_values = transition_df.loc[state, dropout_state]
    dropout_counts[state] = dropout_values.sum()

In [None]:
#Removal effect: sum of the 'Conversion' column of the probability table with
#one event's row and column dropped
removal_effects_dict = {}

# States that are never removed; they get no entry in the result
protected_states = ('Start', 'Null', 'Conversion')

for event in transition_df.index:
    if event in protected_states:
        continue
    # Probability table without the removed event as either source or target
    new_df = transition_df.drop(index=event, columns=event)
    new_conversion = new_df['Conversion'].sum()
    # Record inside the loop body for removable events only, so a protected
    # state can never pick up the value computed for the previous event
    removal_effects_dict[event] = new_conversion