Marketing A/B Testing - William Jennette

In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
import sys

file_path = 'marketing_AB.csv'

# Load the dataset and confirm the 'test group' column is present.
# NOTE: every failure below is reported and then swallowed, so on a fresh
# kernel `df` stays undefined and any later cell that uses it will fail.
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")

    # Indexing 'test group' raises KeyError when the column is missing;
    # that case is reported by Group 2 below.
    print(df['test group'].value_counts())

# Group 1: errors with the file's existence/accessibility
except (FileNotFoundError, PermissionError) as e:
    print(f"Error: Cannot access file at '{file_path}'.")
    print(f"Reason: {e}")

# Group 2: errors with the file's content or format
except (pd.errors.EmptyDataError, pd.errors.ParserError, KeyError) as e:
    if isinstance(e, KeyError):
        print(f"Error: File is missing an expected column: {e}")
    else:
        print("Error: File is empty, corrupted, or not a valid CSV.")
        print(f"Details: {e}")

# Group 3: all other unexpected errors
except Exception as e:
    print("An unexpected application error occurred.")
    print(f"DEBUG DETAILS: {type(e).__name__}: {e}")

# Runs only when both the load and the column check succeeded.
else:
    print("\n--- Analysis complete. ---")

Dataset loaded successfully.
test group
ad     564577
psa     23524
Name: count, dtype: int64

--- Analysis complete. ---


In [3]:
# Data cleaning and preparation
# Report the number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Count the distinct test groups each user id appears in; a count above one
# means the same user shows up in more than one group
groups_per_user = df.groupby('user id')['test group'].nunique()
duplicate_users = groups_per_user.loc[groups_per_user.gt(1)].index
print(f"Found {len(duplicate_users)} users in multiple groups.")

# Keep only the rows whose user id is not ambiguous
in_multiple_groups = df['user id'].isin(duplicate_users)
df_cleaned = df.loc[~in_multiple_groups]

# Split the cleaned data into control ('psa') and treatment ('ad') rows
control_group = df_cleaned.loc[df_cleaned['test group'].eq('psa')]
treatment_group = df_cleaned.loc[df_cleaned['test group'].eq('ad')]

print(f"Control group size: {len(control_group)}")
print(f"Treatment group size: {len(treatment_group)}")

Unnamed: 0       0
user id          0
test group       0
converted        0
total ads        0
most ads day     0
most ads hour    0
dtype: int64
Found 0 users in multiple groups.
Control group size: 23524
Treatment group size: 564577


### Defined Hypotheses

* **Null Hypothesis ($H_0$):** There is no difference in conversion rate between the 'psa' (control) and 'ad' (treatment) groups.
* **Alternative Hypothesis ($H_A$):** There is a difference in conversion rate between the two groups (two-sided test).

In [4]:
# Control arm: number of users, number of conversions, conversion rate
n_control = len(control_group)
control_conversions = control_group['converted'].sum()
control_rate = control_conversions / n_control

# Treatment arm: number of users, number of conversions, conversion rate
n_treatment = len(treatment_group)
treatment_conversions = treatment_group['converted'].sum()
treatment_rate = treatment_conversions / n_treatment

# Report both rates rounded to four decimal places
print(f"Control Conversion Rate: {control_rate:.4f}")
print(f"Treatment Conversion Rate: {treatment_rate:.4f}")

Control Conversion Rate: 0.0179
Treatment Conversion Rate: 0.0255


In [6]:
# Create arrays for the z-test function. Treatment is listed first and
# control second, so the statistic is based on (treatment rate - control rate).
successes = np.array([treatment_conversions, control_conversions])
observations = np.array([n_treatment, n_control])

# Perform a two-sided two-sample z-test for proportions
z_stat, p_value = proportions_ztest(successes, observations, alternative='two-sided')

print(f"Z-statistic: {z_stat:.4f}")
# NOTE: .4f rounds to four decimals, so any p-value below 0.00005 is
# displayed as 0.0000 even though it is not exactly zero.
print(f"P-value: {p_value:.4f}")

# Interpret results
alpha = 0.05   # standard significance level
if p_value < alpha:
    print("Result is statistically significant (Reject $H_0$).")
    print("The new 'ad' version has a different conversion rate than the 'psa' version.")
else:
    print("Result is not statistically significant (Fail to reject $H_0$).")
    print("We cannot conclude there is a difference in conversion rates.")

Z-statistic: 7.3701
P-value: 0.0000
Result is statistically significant (Reject $H_0$).
The new 'ad' version has a different conversion rate than the 'psa' version.
