In [2]:
import pandas as pd

In [4]:
# Reload the dataset
df = pd.read_csv('../data/clean/df_cleaned.csv')

In [6]:
# Split data into control and test groups
control_group = df[df['Variation'] == 'Control'].copy()
test_group = df[df['Variation'] == 'Test'].copy()

# Sort data for processing
control_group = control_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])
test_group = test_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

In [8]:
# Calculate Time Differences for Each Step
def calculate_time_diff(data):
    data['date_time'] = pd.to_datetime(data['date_time'])
    data['time_diff'] = data.groupby(['client_id', 'visit_id'])['date_time'].diff().dt.total_seconds()
    return data

control_group = calculate_time_diff(control_group)
test_group = calculate_time_diff(test_group)

In [10]:
# Flag errors (backward steps)
def flag_errors(data):
    data['step_num'] = data['process_step'].str.extract(r'(\d+)').astype(float)
    data['error'] = (data['step_num'].diff() < 0).astype(int)
    return data

control_group = flag_errors(control_group)
test_group = flag_errors(test_group)

In [14]:
# Calculate Completion Rate
def compute_completion_rate(data):
    total_users = data['client_id'].nunique()
    confirm_users = data[data['process_step'] == 'confirm']['client_id'].nunique()
    return (confirm_users / total_users) * 100 if total_users > 0 else 0

control_completion_rate = compute_completion_rate(control_group)
test_completion_rate = compute_completion_rate(test_group)


In [16]:
# Calculate Average Time Spent on Each Step
def compute_avg_time(data):
    return data.groupby('process_step')['time_diff'].mean().reset_index(name='Avg Time (seconds)')

control_avg_time = compute_avg_time(control_group)
test_avg_time = compute_avg_time(test_group)


In [18]:
# Calculate Error Rate
def compute_error_rate(data):
    total_steps = len(data)
    backward_steps = data['error'].sum()
    return (backward_steps / total_steps) * 100 if total_steps > 0 else 0

control_error_rate = compute_error_rate(control_group)
test_error_rate = compute_error_rate(test_group)

In [20]:
# Calculate Engagement KPI (based on logons_6_mnth)
def compute_engagement_kpi(data):
    return data['logons_6_mnth'].mean()

control_engagement_kpi = compute_engagement_kpi(control_group)
test_engagement_kpi = compute_engagement_kpi(test_group)

In [22]:
# Compile results into a summary table
kpi_summary = pd.DataFrame({
    'KPI': ['Completion Rate (%)', 'Avg Time (seconds)', 'Error Rate (%)', 'Engagement KPI (Logons)'],
    'Control Group': [control_completion_rate, control_avg_time['Avg Time (seconds)'].mean(), control_error_rate, control_engagement_kpi],
    'Test Group': [test_completion_rate, test_avg_time['Avg Time (seconds)'].mean(), test_error_rate, test_engagement_kpi]
})

display(kpi_summary)

Unnamed: 0,KPI,Control Group,Test Group
0,Completion Rate (%),65.578509,69.297823
1,Avg Time (seconds),34.114698,57.018242
2,Error Rate (%),0.045325,0.034312
3,Engagement KPI (Logons),6.311621,6.245569
