In [43]:
import pandas as pd
import numpy as np
import os
from statsmodels.stats import weightstats as stests
from statsmodels.stats.proportion import proportions_ztest
from dotenv import load_dotenv

In [44]:
load_dotenv()

directory = os.getenv('DIR')
df_final_demo_csv = os.getenv('CSV1')
df_final_experiment_clients_csv = os.getenv('CSV2')
df_final_web_data_pt_1_csv = os.getenv('CSV3')
df_final_web_data_pt_2_csv = os.getenv('CSV4')

## General Overview & Data Cleaning

In [45]:
df_client = pd.read_csv(directory+df_final_demo_csv)
df_client.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [46]:
df_client.isnull().sum()

client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64

In [47]:
#Checking for the null rows
df_client[df_client["clnt_age"].isnull()]

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
4164,7402828,,,,,,,,
8316,355337,,,,,,,,
8677,8412164,,,,,,,,
9583,4666211,8.0,106.0,,F,2.0,42550.55,4.0,7.0
13444,2222915,,,,,,,,
18066,4876926,,,,,,,,
25961,5277910,,,,,,,,
28432,7616759,,,,,,,,
35323,8191345,,,,,,,,
43518,1227228,,,,,,,,


In [48]:
df_client=df_client[df_client["clnt_age"].notnull()]

In [49]:
df_interaction1 = pd.read_csv(directory+df_final_web_data_pt_1_csv)
df_interaction1.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [50]:
df_interaction2=pd.read_csv(directory+df_final_web_data_pt_2_csv)
df_interaction2.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,763412,601952081_10457207388,397475557_40440946728_419634,confirm,2017-06-06 08:56:00
1,6019349,442094451_91531546617,154620534_35331068705_522317,confirm,2017-06-01 11:59:27
2,6019349,442094451_91531546617,154620534_35331068705_522317,step_3,2017-06-01 11:58:48
3,6019349,442094451_91531546617,154620534_35331068705_522317,step_2,2017-06-01 11:58:08
4,6019349,442094451_91531546617,154620534_35331068705_522317,step_1,2017-06-01 11:57:58


In [51]:
df_interaction1.isnull().sum()

client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

In [52]:
df_interaction2.isnull().sum()

client_id       0
visitor_id      0
visit_id        0
process_step    0
date_time       0
dtype: int64

In [53]:
df_variation = pd.read_csv(directory+df_final_experiment_clients_csv)
df_variation.head()

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


In [54]:
df_variation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   client_id  70609 non-null  int64 
 1   Variation  50500 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [55]:
df_variation.isnull().sum()

client_id        0
Variation    20109
dtype: int64

In [56]:
# Create a list of the files in order of appending
# Merge all the dataframes in df_comb
df_combined = [df_interaction1,df_interaction2]

# Pandas will automatically append based on similar column names
df_new_interaction = pd.concat(df_combined)

In [57]:
df_new_interaction_merged = df_new_interaction.merge(df_variation, how="inner", on="client_id")

In [58]:
df_new_interaction_merged.isnull().sum()

client_id            0
visitor_id           0
visit_id             0
process_step         0
date_time            0
Variation       128522
dtype: int64

In [59]:
df_new_interaction_merged.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test


In [60]:
df_new_interaction_dropped=df_new_interaction_merged.dropna(how='any')

In [61]:
df_interaction_client = df_new_interaction_dropped.merge(df_client, how="inner", on="client_id")

In [62]:
df_interaction_client=df_interaction_client.sort_values(['visit_id','date_time'],ascending=True)

In [63]:
df_interaction_client["Variation"].value_counts()

Variation
Test       177787
Control    143408
Name: count, dtype: int64

In [64]:
display(df_interaction_client)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,2.0,63130.44,6.0,9.0
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,2.0,63130.44,6.0,9.0
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95624,6627522,730634087_44272418812,999988789_76411676596_272843,start,2017-04-21 23:49:11,Test,18.0,227.0,58.0,F,2.0,1056775.54,6.0,9.0
95623,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:49:22,Test,18.0,227.0,58.0,F,2.0,1056775.54,6.0,9.0
95622,6627522,730634087_44272418812,999988789_76411676596_272843,step_2,2017-04-21 23:50:16,Test,18.0,227.0,58.0,F,2.0,1056775.54,6.0,9.0
95621,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:51:00,Test,18.0,227.0,58.0,F,2.0,1056775.54,6.0,9.0


## Design Check for Statistic Analyze

In [65]:
print(df_interaction_client.dtypes)

client_id             int64
visitor_id           object
visit_id             object
process_step         object
date_time            object
Variation            object
clnt_tenure_yr      float64
clnt_tenure_mnth    float64
clnt_age            float64
gendr                object
num_accts           float64
bal                 float64
calls_6_mnth        float64
logons_6_mnth       float64
dtype: object


In [66]:
print(df_interaction_client['Variation'].unique())

['Test' 'Control']


In [67]:
df_interaction_client['Variation_numeric'] = df_interaction_client['Variation'].map({'Test': 1, 'Control': 0})

In [68]:
mean_variation = df_interaction_client['Variation_numeric'].mean()

print(f"Mean of Variation (Test=1, Control=0): {mean_variation}")

Mean of Variation (Test=1, Control=0): 0.5535173337069381


In [69]:
#Splitting the data into control group and test group
df_control = df_interaction_client[df_interaction_client["Variation"]=='Control']
df_test = df_interaction_client[df_interaction_client["Variation"]=='Test']

In [70]:
#Extracting the demogaphic data into series for Z-test
df_control_age=df_control['clnt_age']
df_control_tenure=df_control['clnt_tenure_mnth']
df_control_num_accts=df_control['num_accts']
df_control_bal=df_control['bal']
df_control_calls=df_control['calls_6_mnth']
df_control_logons=df_control['logons_6_mnth']
df_test_age=df_test['clnt_age']
df_test_tenure=df_test['clnt_tenure_mnth']
df_test_num_accts=df_test['num_accts']
df_test_bal=df_test['bal']
df_test_calls=df_test['calls_6_mnth']
df_test_logons=df_test['logons_6_mnth']

In [71]:
#to display all rows mentioned in head()
pd.set_option('display.max_rows',500)

### Z-Test on Demographics

In [72]:
#Z-test for Age 
ztest ,pval = stests.ztest(df_control_age,df_test_age,value=0, alternative='two-sided')
print("Z-statistic for client age:", ztest)
print("p-value:", pval)

#Z-test for Tenure
ztest ,pval = stests.ztest(df_control_tenure,df_test_tenure,value=0, alternative='two-sided')
print("\nZ-statistic for client tenure:", ztest)
print("p-value:", pval)

#Z-test for Number of Accounts
ztest ,pval = stests.ztest(df_control_num_accts,df_test_num_accts,value=0, alternative='two-sided')
print("\nZ-statistic for number of accounts:", ztest)
print("p-value:", pval)

#Z-test for Balance
ztest ,pval = stests.ztest(df_control_bal,df_test_bal,value=0, alternative='two-sided')
print("\nZ-statistic for balance:", ztest)
print("p-value:", pval)

#Z-test for Number of calls
ztest ,pval = stests.ztest(df_control_calls,df_test_calls,value=0, alternative='two-sided')
print("\nZ-statistic for number of calls:", ztest)
print("p-value:", pval)

#Z-test for Number of logons
ztest ,pval = stests.ztest(df_control_logons,df_test_logons,value=0, alternative='two-sided')
print("\nZ-statistic for number of logons:", ztest)
print("p-value:", pval)

# Z-test for Gender
males_num=np.array([48323,59690])
total_num=np.array([143408,177787])
(test_stat, p_value)=proportions_ztest(males_num,total_num, alternative='two-sided')
print("\nZ-statistic for gender:", test_stat)
print("p-value:", p_value)

Z-statistic for client age: -7.926400435419628
p-value: 2.255893539888405e-15

Z-statistic for client tenure: -0.5054199559952703
p-value: 0.6132639053956633

Z-statistic for number of accounts: 10.397898754069145
p-value: 2.534606481125466e-25

Z-statistic for balance: -4.086643014804527
p-value: 4.376597446310934e-05

Z-statistic for number of calls: 9.045734571879295
p-value: 1.4866222621367004e-19

Z-statistic for number of logons: 8.533386925260755
p-value: 1.4212435967967977e-17

Z-statistic for gender: 0.7292530033983262
p-value: 0.4658469143504317


## Metrics Calculation - Steps

### Step process

In [73]:
#Creating boolean value for steps
df_interaction_client['process_start'] = np.where(df_interaction_client['process_step']=='start', 1, 0)
df_interaction_client['process_step1'] = np.where(df_interaction_client['process_step']=='step_1', 1, 0)
df_interaction_client['process_step2'] = np.where(df_interaction_client['process_step']=='step_2', 1, 0)
df_interaction_client['process_step3'] = np.where(df_interaction_client['process_step']=='step_3', 1, 0)
df_interaction_client['process_confirm'] = np.where(df_interaction_client['process_step']=='confirm', 1, 0)

In [74]:
df_interaction_client.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation_numeric,process_start,process_step1,process_step2,process_step3,process_confirm
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,2.0,63130.44,6.0,9.0,1,0,0,0,0,1
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,2.0,63130.44,6.0,9.0,1,0,0,0,0,1
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0,1,1,0,0,0,0
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0,1,0,1,0,0,0
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,2.0,26436.73,6.0,9.0,1,0,0,1,0,0


### Transition process

In [75]:
#Adding a column to display the process step in the next row, within the same visit_id
df_interaction_client['lead']=df_interaction_client.groupby('visit_id')['process_step'].shift(-1)

In [76]:
#Adding processs to check the step transitions
df_interaction_client['process_start-step1'] = np.where((df_interaction_client['process_step']=='start') & (df_interaction_client['lead']=='step_1'), 1, 0)
df_interaction_client['process_step1-step2'] = np.where((df_interaction_client['process_step']=='step_1') & (df_interaction_client['lead']=='step_2'), 1, 0)
df_interaction_client['process_step2-step3'] = np.where((df_interaction_client['process_step']=='step_2') & (df_interaction_client['lead']=='step_3'), 1, 0)
df_interaction_client['process_step3-confirm'] = np.where((df_interaction_client['process_step']=='step_3') & (df_interaction_client['lead']=='confirm'), 1, 0)
df_interaction_client.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,...,process_start,process_step1,process_step2,process_step3,process_confirm,lead,process_start-step1,process_step1-step2,process_step2-step3,process_step3-confirm
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,...,0,0,0,0,1,confirm,0,0,0,0
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,...,0,0,0,0,1,,0,0,0,0
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,...,1,0,0,0,0,step_1,1,0,0,0
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,...,0,1,0,0,0,step_2,0,1,0,0
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,...,0,0,1,0,0,step_1,0,0,0,0
236487,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,Test,7.0,88.0,23.5,M,...,0,1,0,0,0,step_1,0,0,0,0
236486,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,Test,7.0,88.0,23.5,M,...,0,1,0,0,0,start,0,0,0,0
236485,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:45,Test,7.0,88.0,23.5,M,...,1,0,0,0,0,start,0,0,0,0
236484,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:59,Test,7.0,88.0,23.5,M,...,1,0,0,0,0,step_1,1,0,0,0
236483,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:22:04,Test,7.0,88.0,23.5,M,...,0,1,0,0,0,step_2,0,1,0,0


### Drop-off process

In [77]:
#Adding processs to check the step drop-offs
df_interaction_client['process_start-dropoff'] = np.where((df_interaction_client['process_step']=='start') & (df_interaction_client['lead'].isnull()), 1, 0)
df_interaction_client['process_step1-dropoff'] = np.where((df_interaction_client['process_step']=='step_1') & (df_interaction_client['lead'].isnull()), 1, 0)
df_interaction_client['process_step2-dropoff'] = np.where((df_interaction_client['process_step']=='step_2') & (df_interaction_client['lead'].isnull()), 1, 0)
df_interaction_client['process_step3-dropoff'] = np.where((df_interaction_client['process_step']=='step_3') & (df_interaction_client['lead'].isnull()), 1, 0)
df_interaction_client.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,...,process_confirm,lead,process_start-step1,process_step1-step2,process_step2-step3,process_step3-confirm,process_start-dropoff,process_step1-dropoff,process_step2-dropoff,process_step3-dropoff
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,...,1,confirm,0,0,0,0,0,0,0,0
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,...,1,,0,0,0,0,0,0,0,0
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,...,0,step_1,1,0,0,0,0,0,0,0
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,...,0,step_2,0,1,0,0,0,0,0,0
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,...,0,step_1,0,0,0,0,0,0,0,0
236487,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,Test,7.0,88.0,23.5,M,...,0,step_1,0,0,0,0,0,0,0,0
236486,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,Test,7.0,88.0,23.5,M,...,0,start,0,0,0,0,0,0,0,0
236485,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:45,Test,7.0,88.0,23.5,M,...,0,start,0,0,0,0,0,0,0,0
236484,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:59,Test,7.0,88.0,23.5,M,...,0,step_1,1,0,0,0,0,0,0,0
236483,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:22:04,Test,7.0,88.0,23.5,M,...,0,step_2,0,1,0,0,0,0,0,0


### Transition time process

In [78]:
#Adding a column to display the timestamp of process step in the next row, within the same visit_id
df_interaction_client['leadtime']=df_interaction_client.groupby('visit_id')['date_time'].shift(-1)
display(df_interaction_client)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,...,lead,process_start-step1,process_step1-step2,process_step2-step3,process_step3-confirm,process_start-dropoff,process_step1-dropoff,process_step2-dropoff,process_step3-dropoff,leadtime
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,...,confirm,0,0,0,0,0,0,0,0,2017-04-26 13:23:09
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,...,,0,0,0,0,0,0,0,0,
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,...,step_1,1,0,0,0,0,0,0,0,2017-04-09 16:21:12
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,...,step_2,0,1,0,0,0,0,0,0,2017-04-09 16:21:21
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,...,step_1,0,0,0,0,0,0,0,0,2017-04-09 16:21:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95624,6627522,730634087_44272418812,999988789_76411676596_272843,start,2017-04-21 23:49:11,Test,18.0,227.0,58.0,F,...,step_1,1,0,0,0,0,0,0,0,2017-04-21 23:49:22
95623,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:49:22,Test,18.0,227.0,58.0,F,...,step_2,0,1,0,0,0,0,0,0,2017-04-21 23:50:16
95622,6627522,730634087_44272418812,999988789_76411676596_272843,step_2,2017-04-21 23:50:16,Test,18.0,227.0,58.0,F,...,step_1,0,0,0,0,0,0,0,0,2017-04-21 23:51:00
95621,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:51:00,Test,18.0,227.0,58.0,F,...,start,0,0,0,0,0,0,0,0,2017-04-21 23:51:09


In [79]:
#Create a leadtime for anaylze the time spent
df_interaction_client['leadtime'] = pd.to_datetime(df_interaction_client['leadtime'])
df_interaction_client['date_time'] = pd.to_datetime(df_interaction_client['date_time'])

# Calculate the differences in seconds (or another suitable unit)
df_interaction_client['start_time'] = np.where(
    df_interaction_client['process_start-step1'] == 1,
    (df_interaction_client['leadtime'] - df_interaction_client['date_time']).dt.total_seconds(),
    0
)

df_interaction_client['step1_time'] = np.where(
    df_interaction_client['process_step1-step2'] == 1,
    (df_interaction_client['leadtime'] - df_interaction_client['date_time']).dt.total_seconds(),
    0
)

df_interaction_client['step2_time'] = np.where(
    df_interaction_client['process_step2-step3'] == 1,
    (df_interaction_client['leadtime'] - df_interaction_client['date_time']).dt.total_seconds(),
    0
)

df_interaction_client['step3_time'] = np.where(
    df_interaction_client['process_step3-confirm'] == 1,
    (df_interaction_client['leadtime'] - df_interaction_client['date_time']).dt.total_seconds(),
    0
)

# Display the first 10 rows to verify the changes
df_interaction_client.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,...,process_step3-confirm,process_start-dropoff,process_step1-dropoff,process_step2-dropoff,process_step3-dropoff,leadtime,start_time,step1_time,step2_time,step3_time
106827,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test,4.0,56.0,59.5,U,...,0,0,0,0,0,2017-04-26 13:23:09,0.0,0.0,0.0,0.0
106826,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test,4.0,56.0,59.5,U,...,0,0,0,0,0,NaT,0.0,0.0,0.0,0.0
236490,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:12,16.0,0.0,0.0,0.0
236489,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:21,0.0,9.0,0.0,0.0
236488,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:35,0.0,0.0,0.0,0.0
236487,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:41,0.0,0.0,0.0,0.0
236486,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:45,0.0,0.0,0.0,0.0
236485,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:45,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:21:59,0.0,0.0,0.0,0.0
236484,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:59,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:22:04,5.0,0.0,0.0,0.0
236483,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:22:04,Test,7.0,88.0,23.5,M,...,0,0,0,0,0,2017-04-09 16:22:08,0.0,4.0,0.0,0.0


In [80]:
df_interaction_client.to_csv(f"{directory}Final_DF.csv", index=False)

In [81]:
#Splitting the data into control group and test group, after adding all the processs
df_control_new=df_interaction_client[df_interaction_client["Variation"]=='Control']
df_test_new=df_interaction_client[df_interaction_client["Variation"]=='Test']

### Confirmation Rate Analyze

In [82]:
#process-start counted once for one visit id
#Number of distinct start from each visit_id in control group
start_control=df_control_new.groupby('process_start')["visit_id"].nunique()
start_control=start_control.iloc[1]
start_control

30903

In [83]:
#Number of distinct confirm from each visit_id in control group
confirm_control=df_control_new.groupby('process_confirm')["visit_id"].nunique()
confirm_control=confirm_control.iloc[1]
confirm_control

16039

In [84]:
#Number of distinct start from each visit_id in test group
start_test=df_test_new.groupby('process_start')["visit_id"].nunique()
start_test=start_test.iloc[1]
start_test

33144

In [85]:
#Number of distinct confirm from each visit_id in test group
confirm_test=df_test_new.groupby('process_confirm')["visit_id"].nunique()
confirm_test=confirm_test.iloc[1]
confirm_test

21725

In [86]:
#Confirm rate of Control Group
print("Confirm rate of Control Group",confirm_control/start_control)
#Confirm rate of Test Group
print("Confirm rate of Test Group",confirm_test/start_test)

Confirm rate of Control Group 0.5190110992460278
Confirm rate of Test Group 0.6554730871349264


In [87]:
#Left-tailed Proportions Z-Test for Confirm rate
confirm_num=np.array([confirm_control,confirm_test])
start_num=np.array([start_control,start_test])
(test_stat, p_value)=proportions_ztest(confirm_num,start_num, alternative='smaller')
print("Z-statistic for confirm rate:", test_stat)
print("p-value:", p_value)

Z-statistic for confirm rate: -35.08224753617665
p-value: 6.287002370874248e-270


## Variation of Confirm Rates based on gender

In [88]:
#Splitting the control and test groups on the basis of male and female genders
male_control=df_control_new[df_control_new["gendr"]=='M']
female_control=df_control_new[df_control_new["gendr"]=='F']
male_test=df_test_new[df_test_new["gendr"]=='M']
female_test=df_test_new[df_test_new["gendr"]=='F']

In [89]:
#Control group confirm rate comparison between different genders
#start
start_num_male_control=male_control.groupby('process_start')["visit_id"].nunique().iloc[1]
start_num_female_control=female_control.groupby('process_start')["visit_id"].nunique().iloc[1]
start_num_male_test=male_test.groupby('process_start')["visit_id"].nunique().iloc[1]
start_num_female_test=female_test.groupby('process_start')["visit_id"].nunique().iloc[1]
#confirm
confirm_num_male_control=male_control.groupby('process_confirm')["visit_id"].nunique().iloc[1]
confirm_num_female_control=female_control.groupby('process_confirm')["visit_id"].nunique().iloc[1]
confirm_num_male_test=male_test.groupby('process_confirm')["visit_id"].nunique().iloc[1]
confirm_num_female_test=female_test.groupby('process_confirm')["visit_id"].nunique().iloc[1]

In [90]:
print("Confirm rate of Males in Control Group",confirm_num_male_control/start_num_male_control)
print("Confirm rate of Females in Control Group",confirm_num_female_control/start_num_female_control)
print("Confirm rate of Males in Test Group",confirm_num_male_test/start_num_male_test)
print("Confirm rate of Females in Test Group",confirm_num_female_test/start_num_female_test)

Confirm rate of Males in Control Group 0.5206844678450545
Confirm rate of Females in Control Group 0.5036057692307693
Confirm rate of Males in Test Group 0.6830606006114008
Confirm rate of Females in Test Group 0.6346686357762599


## Drop-Off Rate

### Drop-off rate for 'Start'

In [91]:
#Control group drop-off rate for 'start'
sstep1_control=df_control_new['process_start-step1'].sum()
stepdrop_control=df_control_new['process_start-dropoff'].sum()
stepdrop_rate_c=stepdrop_control/(stepdrop_control+sstep1_control)
print("Start Drop-off rate for control group:", stepdrop_rate_c)

#Test group drop-off rate for 'start'
sstep1_test=df_test_new['process_start-step1'].sum()
sdrop_test=df_test_new['process_start-dropoff'].sum()
sdrop_rate_test=sdrop_test/(sdrop_test+sstep1_test)
print("Start Drop-off rate for test group:", sdrop_rate_test)

#Control group drop-off rate for 'step1'
step1step2_control=df_control_new['process_step1-step2'].sum()
step1drop_control=df_control_new['process_step1-dropoff'].sum()
step1drop_rate_control=step1drop_control/(step1drop_control+step1step2_control)
print("\nStep1 Drop-off rate for control group:", step1drop_rate_control)

#Test group drop-off rate for 'step1'
step1step2_test=df_test_new['process_step1-step2'].sum()
step1drop_test=df_test_new['process_step1-dropoff'].sum()
s1drop_rate_test=step1drop_test/(step1drop_test+step1step2_test)
print("Step1 Drop-off rate for test group:", s1drop_rate_test)

#Control group drop-off rate for 'step2'
step2step3_control=df_control_new['process_step2-step3'].sum()
step2drop_control=df_control_new['process_step2-dropoff'].sum()
step2drop_rate_control=step2drop_control/(step2drop_control+step2step3_control)
print("\nStep2 Drop-off rate for control group:", step2drop_rate_control)

#Test group drop-off rate for 'step2'
step2step3_test=df_test_new['process_step2-step3'].sum()
step2drop_test=df_test_new['process_step2-dropoff'].sum()
step2drop_rate_test=step2drop_test/(step2drop_test+step2step3_test)
print("Step2 Drop-off rate for test group:", step2drop_rate_test)

#Control group drop-off rate for 'step3'
step3c_control=df_control_new['process_step3-confirm'].sum()
step3drop_control=df_control_new['process_step3-dropoff'].sum()
step3drop_rate_control=step3drop_control/(step3drop_control+step3c_control)
print("\nStep3 Drop-off rate for control group:", step3drop_rate_control)

#Test group drop-off rate for 'step3'
step3c_test=df_test_new['process_step3-confirm'].sum()
step3drop_test=df_test_new['process_step3-dropoff'].sum()
step3drop_rate_test=step3drop_test/(step3drop_test+step3c_test)
print("Step3 Drop-off rate for test group:", step3drop_rate_test)

Start Drop-off rate for control group: 0.2723240769513263
Start Drop-off rate for test group: 0.22069545709478408

Step1 Drop-off rate for control group: 0.13521366859091086
Step1 Drop-off rate for test group: 0.10238662705279311

Step2 Drop-off rate for control group: 0.06412852112676057
Step2 Drop-off rate for test group: 0.05174035747883349

Step3 Drop-off rate for control group: 0.1283660429027841
Step3 Drop-off rate for test group: 0.08847624249044238


### Statistical Test for Drop-Off Rates

In [92]:
#Z-Test for Start Drop-off rate Right
start_drops_offs=np.array([stepdrop_control,sdrop_test])
start_total=np.array([stepdrop_control+sstep1_control,sdrop_test+sstep1_test])
(test_stat, p_value)=proportions_ztest(start_drops_offs,start_total, alternative='larger')
print("Z-statistic for start drop-off rate:", test_stat)
print("p-value:", p_value)

#Z-Test for Step 1 Drop-off rate Right
step1_drops_offs=np.array([step1drop_control,step1drop_test])
step1_total=np.array([step1drop_control+step1step2_control,step1drop_test+step1step2_test])
(test_stat, p_value)=proportions_ztest(step1_drops_offs,step1_total, alternative='larger')
print("\nZ-statistic for step1 drop-off rate:", test_stat)
print("p-value:", p_value)

#Z-Test for Step 2 Drop-off rate Right
step2_drops_offs=np.array([step2drop_control,step2drop_test])
step2_total=np.array([step2drop_control+step2step3_control,step2drop_test+step2step3_test])
(test_stat, p_value)=proportions_ztest(step2_drops_offs,step2_total, alternative='larger')
print("\nZ-statistic for step2 drop-off rate:", test_stat)
print("p-value:", p_value)

#Z-Test for Step 3 Drop-off rate Right
step3_drops_offs=np.array([step3drop_control,step3drop_test])
step3_total=np.array([step3drop_control+step3c_control,step2drop_test+step3c_test])
(test_stat, p_value)=proportions_ztest(step3_drops_offs,step3_total, alternative='larger')
print("\nZ-statistic for step3 drop-off rate:", test_stat)
print("p-value:", p_value)

Z-statistic for start drop-off rate: 16.72982992926426
p-value: 3.973243414373153e-63

Z-statistic for step1 drop-off rate: 12.069593615078004
p-value: 7.644371736016304e-34

Z-statistic for step2 drop-off rate: 5.830049134236932
p-value: 2.770552895429687e-09

Z-statistic for step3 drop-off rate: 11.712552144347747
p-value: 5.496015019152371e-32


## Navigation Time

### Calculate Avg time for each step

In [93]:
# Start Step
step_avgtime_control=df_control_new['start_time'].sum()/df_control_new['process_start-step1'].sum()
print("Avg. Navigation Time for 'Start' for control group:", step_avgtime_control)
step_avgtime_test=df_test_new['start_time'].sum()/df_test_new['process_start-step1'].sum()
print("Avg. Navigation Time for 'Start' for test group:", step_avgtime_test)

# Step 1
step1_avgtime_control=df_control_new['step1_time'].sum()/df_control_new['process_step1-step2'].sum()
print("\nAvg. Navigation Time for 'Step1' for control group:", step1_avgtime_control)
step1_avgtime_test=df_test_new['step1_time'].sum()/df_test_new['process_step1-step2'].sum()
print("Avg. Navigation Time for 'Step1' for test group:", step1_avgtime_test)

# Step 2
step2_avgtime_control=df_control_new['step2_time'].sum()/df_control_new['process_step2-step3'].sum()
print("\nAvg. Navigation Time for 'Step2' for control group:", step2_avgtime_control)
step2_avgtime_time=df_test_new['step2_time'].sum()/df_test_new['process_step2-step3'].sum()
print("Avg. Navigation Time for 'Step2' for test group:", step2_avgtime_time)

# Step 3
step3_avgtime_control=df_control_new['step3_time'].sum()/df_control_new['process_step3-confirm'].sum()
print("\nAvg. Navigation Time for 'Step3' for control group:", step3_avgtime_control)
step3_avgtime_time=df_test_new['step3_time'].sum()/df_test_new['process_step3-confirm'].sum()
print("Avg. Navigation Time for 'Step3' for test group:", step3_avgtime_time)

Avg. Navigation Time for 'Start' for control group: 37.64330913467138
Avg. Navigation Time for 'Start' for test group: 31.032295789852466

Avg. Navigation Time for 'Step1' for control group: 33.82890551498589
Avg. Navigation Time for 'Step1' for test group: 37.098643291019535

Avg. Navigation Time for 'Step2' for control group: 87.01622536801015
Avg. Navigation Time for 'Step2' for test group: 86.46821263227513

Avg. Navigation Time for 'Step3' for control group: 127.12881267181568
Avg. Navigation Time for 'Step3' for test group: 104.84378234108611


### Z-Test for Navigation Time

In [94]:
#Extracting the time data into series for Z-test

#Control Group
control_seconds =df_control_new['start_time']
control_seconds=control_seconds[control_seconds!=0]
control_seconds_1=df_control_new['step1_time']
control_seconds_1=control_seconds_1[control_seconds_1!=0]
control_seconds_2=df_control_new['step2_time']
control_seconds_2=control_seconds_2[control_seconds_2!=0]
control_seconds_3=df_control_new['step3_time']
control_seconds_3=control_seconds_3[control_seconds_3!=0]

#Test Group
test_seconds=df_test_new['start_time']
test_seconds=test_seconds[test_seconds!=0]
test_seconds_1=df_test_new['step1_time']
test_seconds_1=test_seconds_1[test_seconds_1!=0]
test_seconds_2=df_test_new['step2_time']
test_seconds_2=test_seconds_2[test_seconds_2!=0]
test_seconds_3=df_test_new['step3_time']
test_seconds_3=test_seconds_3[test_seconds_3!=0]

In [95]:
#Z-test for Start Right
ztest ,pval = stests.ztest(control_seconds,test_seconds,value=0, alternative='larger')
print("Z-statistic for time spent on 'Start':", ztest)
print("p-value:", pval)

#Z-test for Start Left
ztest ,pval = stests.ztest(control_seconds,test_seconds,value=0, alternative='smaller')
print("\nZ-statistic for time spent on 'Start':", ztest)
print("p-value:", pval)

#Z-test for Step_1 Right
ztest ,pval = stests.ztest(control_seconds_1,test_seconds_1,value=0, alternative='larger')
print("\nZ-statistic for time spent on 'Step1':", ztest)
print("p-value:", pval)

#Z-test for Step_1 Left
ztest ,pval = stests.ztest(control_seconds_1,test_seconds_1,value=0, alternative='smaller')
print("\nZ-statistic for time spent on 'Step1':", ztest)
print("p-value:", pval)

#Z-test for Step_2 Right
ztest ,pval = stests.ztest(control_seconds_2,test_seconds_2,value=0, alternative='larger')
print("\nZ-statistic for time spent on 'Step2':", ztest)
print("p-value:", pval)

#Z-test for Step_2 Left
ztest ,pval = stests.ztest(control_seconds_2,test_seconds_2,value=0, alternative='smaller')
print("\nZ-statistic for time spent on 'Step2':", ztest)
print("p-value:", pval)

#Z-test for Step_3 right
ztest ,pval = stests.ztest(control_seconds_3,test_seconds_3,value=0, alternative='larger')
print("\nZ-statistic for time spent on 'Step3':", ztest)
print("p-value:", pval)

#Z-test for Step_3 Left
ztest ,pval = stests.ztest(control_seconds_3,test_seconds_3,value=0, alternative='smaller')
print("\nZ-statistic for time spent on 'Step3':", ztest)
print("p-value:", pval)

Z-statistic for time spent on 'Start': 9.65320239988468
p-value: 2.3822080839738875e-22

Z-statistic for time spent on 'Start': 9.65320239988468
p-value: 1.0

Z-statistic for time spent on 'Step1': -6.514861618340062
p-value: 0.9999999999636217

Z-statistic for time spent on 'Step1': -6.514861618340062
p-value: 3.637835044913524e-11

Z-statistic for time spent on 'Step2': 0.8222943847865741
p-value: 0.20545468588904298

Z-statistic for time spent on 'Step2': 0.8222943847865741
p-value: 0.7945453141109571

Z-statistic for time spent on 'Step3': 13.256024552164767
p-value: 2.081955440019988e-40

Z-statistic for time spent on 'Step3': 13.256024552164767
p-value: 1.0


## Step Recurrence

### Step Recurrence Metric for 'Start'

In [96]:
## Recurrence Metric start 
start_total_control=df_control_new['process_start'].sum()
rec_start_control=start_control/start_total_control
print("Step Recurrence Metric for 'Start' for control group:", rec_start_control)
start_total_test=df_test_new['process_start'].sum()
rec_start_test=start_test/start_total_test
print("Step Recurrence Metric for 'Start' for test group:", rec_start_test)

## Recurrence Metric step1
step1_total_control=df_control_new['process_step1'].sum()
step1_num_control=df_control_new.groupby('process_step1')["visit_id"].nunique().iloc[1]
rec_step1_control=step1_num_control/step1_total_control
print("\nStep Recurrence Metric for 'Step1 for control group':", rec_step1_control)
step1_total_test=df_test_new['process_step1'].sum()
step1_num_test=df_test_new.groupby('process_step1')["visit_id"].nunique().iloc[1]
rec_step1_test=step1_num_test/step1_total_test
print("Step Recurrence Metric for 'Step1' for test group:", rec_step1_test)

## Recurrence Metric step2
step2_total_control=df_control_new['process_step2'].sum()
step2_num_control=df_control_new.groupby('process_step2')["visit_id"].nunique().iloc[1]
rec_step2_control=step2_num_control/step2_total_control
print("\nStep Recurrence Metric for 'Step2 for control group':", rec_step2_control)
step2_total_test=df_test_new['process_step2'].sum()
step2_num_test=df_test_new.groupby('process_step2')["visit_id"].nunique().iloc[1]
rec_step2_test=step2_num_test/step2_total_test
print("Step Recurrence Metric for 'Step2 for test group':", rec_step2_test)

## Recurrence Metric step3
step3_total_control=df_control_new['process_step3'].sum()
step3_num_control=df_control_new.groupby('process_step3')["visit_id"].nunique().iloc[1]
rec_step3_control=step3_num_control/step3_total_control
print("\nStep Recurrence Metric for 'Step3 for control group':", rec_step3_control)
step3_total_test=df_test_new['process_step3'].sum()
step3_num_test=df_test_new.groupby('process_step3')["visit_id"].nunique().iloc[1]
rec_step3_test=step3_num_test/step3_total_test
print("Step Recurrence Metric for 'Step3 for test group':", rec_step3_test)

## Recurrence Metric confirm
confirm_total_control=df_control_new['process_confirm'].sum()
rec_confirm_control=confirm_control/confirm_total_control
print("\nStep Recurrence Metric for 'Confirm for control group':", rec_confirm_control)
confirm_total_test=df_test_new['process_confirm'].sum()
rec_confirm_test=confirm_test/confirm_total_test
print("Step Recurrence Metric for 'Confirm for test group':", rec_confirm_test)

Step Recurrence Metric for 'Start' for control group: 0.6493727542079052
Step Recurrence Metric for 'Start' for test group: 0.5871182597605045

Step Recurrence Metric for 'Step1 for control group': 0.793775499881984
Step Recurrence Metric for 'Step1' for test group: 0.7295951281191133

Step Recurrence Metric for 'Step2 for control group': 0.7779495304710747
Step Recurrence Metric for 'Step2 for test group': 0.7906578862418491

Step Recurrence Metric for 'Step3 for control group': 0.8027822881467503
Step Recurrence Metric for 'Step3 for test group': 0.8567676143386898

Step Recurrence Metric for 'Confirm for control group': 0.9166190421762487
Step Recurrence Metric for 'Confirm for test group': 0.844804790791725


### Z-Test for Reocurrence

In [97]:
#Z-test for Start Step Right
start_distincts=np.array([start_control,start_test])
start_repeats=np.array([start_total_control,start_total_test])
(test_stat, p_value)=proportions_ztest(start_distincts,start_repeats, alternative='larger')
print("Z-statistic for 'start' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step1 Step Right
step1_distincts=np.array([step1_num_control,step1_num_test])
step1_repeats=np.array([step1_total_control,step1_total_test])
(test_stat, p_value)=proportions_ztest(step1_distincts,step1_repeats, alternative='larger')
print("\nZ-statistic for 'step1' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step1 Step Left
step1_distincts=np.array([step1_num_control,step1_num_test])
step1_repeats=np.array([step1_total_control,step1_total_test])
(test_stat, p_value)=proportions_ztest(step1_distincts,step1_repeats, alternative='smaller')
print("\nZ-statistic for 'step1' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step2 Step Left
step2_distincts=np.array([step2_num_control,step2_num_test])
step2_repeats=np.array([step2_total_control,step2_total_test])
(test_stat, p_value)=proportions_ztest(step2_distincts,step2_repeats, alternative='smaller')
print("\nZ-statistic for 'step2' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step2 Step Right
step2_distincts=np.array([step2_num_control,step2_num_test])
step2_repeats=np.array([step2_total_control,step2_total_test])
(test_stat, p_value)=proportions_ztest(step2_distincts,step2_repeats, alternative='larger')
print("\nZ-statistic for 'step2' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step3 Step Right
step3_distincts=np.array([step3_num_control,step3_num_test])
step3_repeats=np.array([step3_total_control,step3_total_test])
(test_stat, p_value)=proportions_ztest(step3_distincts,step3_repeats, alternative='larger')
print("\nZ-statistic for 'step3' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for Step3 Step Left
step3_distincts=np.array([step3_num_control,step3_num_test])
step3_repeats=np.array([step3_total_control,step3_total_test])
(test_stat, p_value)=proportions_ztest(step3_distincts,step3_repeats, alternative='smaller')
print("\nZ-statistic for 'step3' step recurrence metric:", test_stat)
print("p-value:", p_value)

#Z-test for 'Confirm' Right
confirm_distincts=np.array([confirm_control,confirm_test])
confirm_repeats=np.array([confirm_total_control,confirm_total_test])
(test_stat, p_value)=proportions_ztest(confirm_distincts,confirm_repeats, alternative='larger')
print("\nZ-statistic for 'confirm' step recurrence metric:", test_stat)
print("p-value:", p_value)

Z-statistic for 'start' step recurrence metric: 20.564545160840638
p-value: 2.8516052568600364e-94

Z-statistic for 'step1' step recurrence metric: 19.40715688959821
p-value: 3.357042659237477e-84

Z-statistic for 'step1' step recurrence metric: 19.40715688959821
p-value: 1.0

Z-statistic for 'step2' step recurrence metric: -3.6723335084827697
p-value: 0.00012017286712718416

Z-statistic for 'step2' step recurrence metric: -3.6723335084827697
p-value: 0.9998798271328728

Z-statistic for 'step3' step recurrence metric: -15.877384241372063
p-value: 1.0

Z-statistic for 'step3' step recurrence metric: -15.877384241372063
p-value: 4.544522426549254e-57

Z-statistic for 'confirm' step recurrence metric: 22.074025526098033
p-value: 2.8080957599531705e-108


### BOUNCE RATE

In [98]:
starting_sessions = df_interaction_client[df_interaction_client['process_step'] == 'start']

# Counting the total number of starting sessions for each group
total_starting_sessions_control = starting_sessions[starting_sessions['Variation'] == 'Control']['visit_id'].nunique()
total_starting_sessions_test = starting_sessions[starting_sessions['Variation'] == 'Test']['visit_id'].nunique()

# Identifying sessions that only have the starting step and no other steps
# This might require checking if these session_ids appear only once or only in the starting step
unique_sessions = df_interaction_client.groupby('visit_id').nunique()
single_step_sessions = unique_sessions[unique_sessions['process_step'] == 1]
bouncing_sessions = df_interaction_client[df_interaction_client['visit_id'].isin(single_step_sessions.index)]

# Count bouncing sessions for each group
bouncing_sessions_control = bouncing_sessions[bouncing_sessions['Variation'] == 'Control']
bouncing_sessions_test = bouncing_sessions[bouncing_sessions['Variation'] == 'Test']

# Calculating bounce rates
p1 = bouncing_sessions_control['visit_id'].nunique() / total_starting_sessions_control
p2 = bouncing_sessions_test['visit_id'].nunique() / total_starting_sessions_test

# Now you have the correct values for p1 and p2, as well as n1 and n2 (total starting sessions as n1 for Control and n2 for Test)
n1 = total_starting_sessions_control
n2 = total_starting_sessions_test

count = np.array([bouncing_sessions_control['visit_id'].nunique(), bouncing_sessions_test['visit_id'].nunique()])
nobs = np.array([total_starting_sessions_control, total_starting_sessions_test])

# Calculate the Z-statistic and p-value
z_stat, p_value = proportions_ztest(count, nobs)

print(f"Z-statistic for bounce rate: {z_stat}")
print(f"P-value: {p_value}")

Z-statistic for bounce rate: 3.955387366863193
P-value: 7.641074149677507e-05


## ERROR RATE

In [99]:
# Map process steps to numeric values for easy comparison
step_mapping = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}
df_interaction_client['step_value'] = df_interaction_client['process_step'].map(step_mapping)

# Sort by visitor and datetime to ensure steps are in the actual order they occurred
df_sorted = df_interaction_client.sort_values(by=['visit_id', 'date_time'])

# Calculate step changes for each user
df_sorted['step_change'] = df_sorted.groupby('visit_id')['step_value'].diff()

# Identify backward movements (where step_change is negative)
df_sorted['is_error'] = df_sorted['step_change'] < 0

# Calculate error rates for each group
error_rates = df_sorted.groupby('Variation')['is_error'].mean()

print(error_rates)

Variation
Control    0.067841
Test       0.091975
Name: is_error, dtype: float64


In [100]:
# Calculate total steps for each user or session
total_steps_by_user = df_sorted.groupby(['Variation', 'visitor_id']).size()

# Calculate the total number of backward steps as before
backward_steps_by_user = df_sorted[df_sorted['is_error']].groupby(['Variation', 'visitor_id']).size()

# Calculate the total error rate according to the formula
error_rate = backward_steps_by_user.sum() / total_steps_by_user.sum()

print("Error Rate:", error_rate)

Error Rate: 0.08119989414530114


In [101]:
# Calculate total steps by group
total_steps_by_gender = df_sorted.groupby(['Variation', 'gendr']).size()

# Calculate backward steps by group
backward_steps_by_gender = df_sorted[df_sorted['is_error']].groupby(['Variation', 'gendr']).size()

# Calculate error rates
error_rates_by_gender = backward_steps_by_gender / total_steps_by_gender

print(error_rates_by_gender)

Variation  gendr
Control    F        0.067644
           M        0.063345
           U        0.072427
Test       F        0.099121
           M        0.089194
           U        0.087729
           X        0.125000
dtype: float64


In [102]:
age_bins = [0, 18, 30, 40, 60, 100]
age_labels = ['1-18', '18-30', '31-40', '41-60', '61-100']

# Categorize ages into the defined bins
df_sorted['age_group'] = pd.cut(df_sorted['clnt_age'], bins=age_bins, labels=age_labels, right=False)

# Calculate error rates by variation and age group
error_rates_by_age_group = df_sorted.groupby(['Variation', 'age_group'])['is_error'].mean()

  error_rates_by_age_group = df_sorted.groupby(['Variation', 'age_group'])['is_error'].mean()


In [103]:
# Calculate total steps by group
total_steps_by_age = df_sorted.groupby(['Variation', 'age_group']).size()

# Calculate backward steps by group
backward_steps_by_age = df_sorted[df_sorted['is_error']].groupby(['Variation', 'age_group']).size()

# Calculate error rates
error_rates_by_age = backward_steps_by_age / total_steps_by_age

print(error_rates_by_age)

Variation  age_group
Control    1-18         0.041885
           18-30        0.066304
           31-40        0.058636
           41-60        0.067518
           61-100       0.075373
Test       1-18         0.062500
           18-30        0.072715
           31-40        0.070172
           41-60        0.093294
           61-100       0.114093
dtype: float64


  total_steps_by_age = df_sorted.groupby(['Variation', 'age_group']).size()
  backward_steps_by_age = df_sorted[df_sorted['is_error']].groupby(['Variation', 'age_group']).size()


In [104]:
# Calculate the number of errors for each group
errors_control = df_sorted[(df_sorted['Variation'] == 'Control') & (df_sorted['is_error'])]['process_step'].count()
errors_test = df_sorted[(df_sorted['Variation'] == 'Test') & (df_sorted['is_error'])]['process_step'].count()

# Calculate the total number of actions or steps for each group
total_actions_control = df_sorted[df_sorted['Variation'] == 'Control']['process_step'].count()
total_actions_test = df_sorted[df_sorted['Variation'] == 'Test']['process_step'].count()

# Calculate error rates as proportions
error_rate_control = errors_control / total_actions_control
error_rate_test = errors_test / total_actions_test

print(error_rate_control)
print(error_rate_test)

0.06784140354791922
0.09197522878500677


In [105]:
count = np.array([errors_control, errors_test])
nobs = np.array([total_actions_control, total_actions_test])

z_stat, p_value = proportions_ztest(count, nobs)

print(f"Z-statistic: {z_stat}")
print(f"P-value: {p_value}")

Z-statistic: -24.893743825049114
P-value: 8.696336324148776e-137
