## KPI - SUMMARY
### The percentage of all participants getting to the confirm stage is 68%, for the control group, 66% and 69% for the test group
### On average, the control group spent more time on each stage, especially on the 'start' stage.
### The test group experienced about 2% more errors than the control group 

In [11]:
import pandas as pd

In [38]:
# Reload the dataset
df = pd.read_csv('../data/clean/df_cleaned.csv')
kpi_df = pd.read_csv('../data/clean/df_cleaned.csv')

In [14]:
# Split data into control and test groups
control_group = df[df['Variation'] == 'Control'].copy()
test_group = df[df['Variation'] == 'Test'].copy()

# Sort data for processing
control_group = control_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])
test_group = test_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

In [16]:
# Calculate Time Differences for Each Step
def calculate_time_diff(data):
    data['date_time'] = pd.to_datetime(data['date_time'])
    data['time_diff'] = data.groupby(['client_id', 'visit_id'])['date_time'].diff().dt.total_seconds()
    return data

control_group = calculate_time_diff(control_group)
test_group = calculate_time_diff(test_group)

In [19]:
# Flag errors (backward steps)
def flag_errors(data):
    data['step_num'] = data['process_step'].str.extract(r'(\d+)').astype(float)
    data['error'] = (data['step_num'].diff() < 0).astype(int)
    return data

control_group = flag_errors(control_group)
test_group = flag_errors(test_group)

In [21]:
# Calculate Completion Rate
def compute_completion_rate(data):
    total_users = data['client_id'].nunique()
    confirm_users = data[data['process_step'] == 'confirm']['client_id'].nunique()
    return (confirm_users / total_users) * 100 if total_users > 0 else 0

control_completion_rate = compute_completion_rate(control_group)
test_completion_rate = compute_completion_rate(test_group)


In [23]:
# Calculate Average Time Spent on Each Step
def compute_avg_time(data):
    return data.groupby('process_step')['time_diff'].mean().reset_index(name='Avg Time (seconds)')

control_avg_time = compute_avg_time(control_group)
test_avg_time = compute_avg_time(test_group)


In [57]:
# Calculate Error Rate
def compute_error_rate(data):
    total_steps = len(data)
    backward_steps = data['error'].sum()
    return (backward_steps / total_steps) * 100 if total_steps > 0 else 0

control_error_rate = compute_error_rate(control_group)
test_error_rate = compute_error_rate(test_group)

In [27]:
# Calculate Engagement KPI (based on logons_6_mnth)
def compute_engagement_kpi(data):
    return data['logons_6_mnth'].mean()

control_engagement_kpi = compute_engagement_kpi(control_group)
test_engagement_kpi = compute_engagement_kpi(test_group)

In [29]:
# Compile results into a summary table
kpi_summary = pd.DataFrame({
    'KPI': ['Completion Rate (%)', 'Avg Time (seconds)', 'Error Rate (%)', 'Engagement KPI (Logons)'],
    'Control Group': [control_completion_rate, control_avg_time['Avg Time (seconds)'].mean(), control_error_rate, control_engagement_kpi],
    'Test Group': [test_completion_rate, test_avg_time['Avg Time (seconds)'].mean(), test_error_rate, test_engagement_kpi]
})

display(kpi_summary)

Unnamed: 0,KPI,Control Group,Test Group
0,Completion Rate (%),65.578509,69.297823
1,Avg Time (seconds),34.114698,57.018242
2,Error Rate (%),0.045325,0.034312
3,Engagement KPI (Logons),6.311621,6.245569


In [40]:
# Convert 'date_time' to a datetime object
kpi_df['date_time'] = pd.to_datetime(kpi_df['date_time'])

# Sort data by 'visit_id' and 'date_time'
kpi_df = kpi_df.sort_values(by=['visit_id', 'date_time'])

# Calculate time differences for consecutive steps
kpi_df['time_diff'] = kpi_df.groupby('visit_id')['date_time'].diff().dt.total_seconds()

# Ensure only the last 'confirm' step is considered
kpi_df = kpi_df.iloc[::-1].drop_duplicates(subset=['visit_id', 'process_step'], keep='first').iloc[::-1]

# Aggregate average time spent per step
average_time_per_step = kpi_df.groupby('process_step')['time_diff'].mean().round().reindex(['start', 'step_1', 'step_2', 'step_3', 'confirm']).astype(int)

# Create a DataFrame for display
average_time_per_step_df = average_time_per_step.reset_index(name='average_time_spent')

# Display the DataFrame to the user
display(average_time_per_step_df)


Unnamed: 0,process_step,average_time_spent
0,start,148
1,step_1,39
2,step_2,40
3,step_3,92
4,confirm,122


In [42]:
# Aggregate average time spent per step for each group (Control and Test)
average_time_by_group = kpi_df.groupby(['Variation', 'process_step'])['time_diff'].mean().unstack(level=0)

# Reindex to ensure the order of steps is consistent
average_time_by_group = average_time_by_group.round().reindex(['start', 'step_1', 'step_2', 'step_3', 'confirm']).astype(int)

# Convert to a DataFrame for display
average_time_by_group_df = average_time_by_group.reset_index()

# Display the DataFrame to the user
(average_time_by_group_df)

Variation,process_step,Control,Test
0,start,140,154
1,step_1,41,36
2,step_2,37,42
3,step_3,90,93
4,confirm,127,118
