In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from
# a path under /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import statsmodels.stats.api as sms
from scipy.stats import shapiro, levene, mannwhitneyu

In [None]:
#### Load dataset
# Read the A/B test export from the mounted Drive and preview the first rows.
csv_path = '/content/drive/MyDrive/ab_data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
#### Check format
# Print the DataFrame.info() summary of the raw frame.
data.info()

In [None]:
#### Check values
# For every column, print its label followed by its underlying value array.
for column_name in data.columns:
    column_values = data[column_name].values
    print(column_name)
    print(column_values)

In [None]:
#### Check unique values
# Number of distinct values per column.
data.apply(pd.Series.nunique)

In [None]:
#### Check null
# Count the missing entries in each column.
null_mask = data.isnull()
null_mask.sum()

In [None]:
#### Remove duplicates
# keep=False discards every row whose user_id occurs more than once, i.e. no
# copy of a repeated user is retained.
print(data.shape)
# .copy() makes df an independent frame: later cells add/overwrite columns on
# df, which would otherwise raise SettingWithCopyWarning because
# drop_duplicates() returns a filtered selection of `data`.
df = data.drop_duplicates(subset='user_id', keep=False).copy()
print(df.shape)

In [None]:
# Row count for each (group, landing_page) combination.
pair_columns = ['group', 'landing_page']
df[pair_columns].value_counts()

In [None]:
#### Check Mismatch - group & landing page
# A row is mismatched when treatment users got the old page or control users
# got the new page.
treatment_on_old = (df["group"] == "treatment") & (df["landing_page"] == "old_page")
control_on_new = (df["group"] == "control") & (df["landing_page"] == "new_page")
df_mismatch = df[treatment_on_old | control_on_new]
n_mismatch = len(df_mismatch)
n_rows = df.shape[0]
print(f"The number of mismatched rows:{n_mismatch} rows" )
print("Percent of mismatched rows:%.2f%%" % (n_mismatch / n_rows * 100))

In [None]:
import pandas as pd

# Function to convert MM:SS.S (or HH:MM:SS.S) to total seconds
def convert_to_seconds(timestamp):
    """Convert a colon-separated time string to a number of seconds.

    Parameters
    ----------
    timestamp : str or missing value
        Text such as ``"MM:SS.S"`` or ``"HH:MM:SS.S"``. Each field is parsed
        with ``float``.

    Returns
    -------
    float or None
        Total seconds, or ``None`` when ``timestamp`` is null according to
        ``pd.isnull``.

    Raises
    ------
    ValueError
        If the text does not have two or three colon-separated fields, or a
        field cannot be parsed as a number.
    """
    if pd.isnull(timestamp):  # Handle missing values
        return None
    fields = timestamp.split(':')
    if len(fields) not in (2, 3):
        raise ValueError(
            f"expected 'MM:SS' or 'HH:MM:SS' time text, got {timestamp!r}"
        )
    # Fold the fields left to right in base 60; for two fields this evaluates
    # to minutes * 60 + seconds.
    total = 0.0
    for field in fields:
        total = total * 60 + float(field)
    return total

# Build a new frame with the derived column instead of assigning onto a
# filtered frame in place (avoids SettingWithCopyWarning and keeps the cell
# safe to re-run).
df = df.assign(total_seconds=df['timestamp'].apply(convert_to_seconds))

# Preview only the first rows rather than printing the whole frame.
df.head()

In [None]:
# Missing value
# Replace missing total_seconds with 0 via assign(), which returns a new frame
# rather than writing into a possibly filtered one.
# NOTE(review): imputing 0 seconds pulls the group means (and the later
# t-test) toward zero if any timestamps are missing - confirm this is intended.
df = df.assign(total_seconds=df['total_seconds'].fillna(0))


In [None]:
# Show the column labels currently on df.
df.columns

In [None]:
# Display the DataFrame.describe() summary of df.
df.describe()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of total_seconds over all rows of df
seconds = df['total_seconds']
plt.figure(figsize=(8, 6))
plt.hist(seconds, bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Total Seconds')
plt.ylabel('Frequency')
plt.title('Distribution of Total Seconds')
plt.ylim(bottom=0)  # pin the y-axis to start at zero
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import pandas as pd
# Share of rows that falls into each experiment group.
group_counts = df['group'].value_counts()
total_rows = group_counts.sum()
group_ratios = group_counts / total_rows
print(group_ratios)

In [None]:
# Tabulate landing_page occurrences inside each (group, landing_page) bucket.
# NOTE(review): the lambda returns value_counts() of a column that is also a
# grouping key, so each bucket presumably yields a single count - verify the
# result matches a plain per-bucket row count.
df.groupby(['group','landing_page']).agg({'landing_page': lambda x: x.value_counts()})

In [None]:
import seaborn as sns

# Mean of `converted` for every (landing_page, group) pair, flattened to columns
mean_by_page_group = df.groupby(['landing_page', 'group'])['converted'].mean()
page_conversion = mean_by_page_group.reset_index()

# Grouped bar chart: one bar per landing page, coloured by group
sns.barplot(
    data=page_conversion,
    x='landing_page',
    y='converted',
    hue='group',
    palette='coolwarm',
)
plt.xlabel('Landing Page')
plt.ylabel('Conversion Rate')
plt.title('Conversion Rate by Landing Page')
plt.legend(title='Group')
plt.show()

In [None]:
# Mean of `converted` per (group, landing_page), shown as a one-column table.
df.groupby(['group', 'landing_page'])[['converted']].mean()

In [None]:
# Per-group mean of `converted`, reshaped into a two-column table with
# presentation-friendly headers.
mean_by_group = df.groupby('group')['converted'].mean()
conversion_summary = mean_by_group.reset_index()
conversion_summary.columns = ['Group', 'Conversion Rate']

print(conversion_summary)

In [None]:
import seaborn as sns

# Bar chart of the per-group conversion rate.
# NOTE(review): palette is passed without hue; some seaborn releases warn
# about this - confirm against the installed version.
sns.barplot(
    data=conversion_summary,
    x='Group',
    y='Conversion Rate',
    palette='viridis',
)
plt.xlabel('Group')
plt.ylabel('Conversion Rate')
plt.title('Conversion Rate by Group')
plt.show()

In [None]:
# Mean of `converted` within each group.
converted_by_group = df.groupby('group')['converted']
conversion_rates = converted_by_group.mean()
print(conversion_rates)

In [None]:
# Mean of total_seconds within each group.
seconds_by_group = df.groupby('group')['total_seconds']
time_metrics = seconds_by_group.mean()
print(time_metrics)

In [None]:
from statsmodels.stats.proportion import proportions_ztest

# Split the `converted` flag by experiment group
is_control = df['group'] == 'control'
is_treatment = df['group'] == 'treatment'
control = df.loc[is_control, 'converted']
treatment = df.loc[is_treatment, 'converted']

# Successes and sample sizes per group
control_converted, treatment_converted = control.sum(), treatment.sum()
n_control, n_treatment = len(control), len(treatment)

# Two-sample z-test on the conversion proportions (control first, treatment second)
successes = [control_converted, treatment_converted]
sample_sizes = [n_control, n_treatment]
stat, p_value = proportions_ztest(successes, sample_sizes)
print(f"Z-test Statistic: {stat}, p-value: {p_value:.6f}")

In [None]:
from scipy.stats import ttest_ind

# total_seconds for each experiment group
control_time = df.loc[df['group'] == 'control', 'total_seconds']
treatment_time = df.loc[df['group'] == 'treatment', 'total_seconds']

# Independent two-sample t-test on time, called with scipy's default settings.
# NOTE(review): confirm the default variance assumption suits this data.
# This also rebinds `p_value`, replacing the value from the z-test cell.
t_stat, p_value = ttest_ind(control_time, treatment_time)
print(f"T-test Statistic: {t_stat}, p-value: {p_value:.6f}")