# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Use seaborn's "whitegrid" theme for the figures drawn below.
sns.set_style("whitegrid")
# Do not truncate long cell contents when frames are displayed.
pd.set_option("display.max_colwidth", None)
# Show floats with two decimal places in displayed/printed frames.
pd.set_option("display.float_format", lambda x: "%.2f" % x)

import statsmodels.stats.api as sm
from scipy.stats import shapiro, levene, mannwhitneyu

from IPython.display import Markdown

def bold(string):
    """Render ``string`` as Markdown in the notebook output.

    Args:
        string: Markdown source text, e.g. ``"**title**"``.
    """
    # Imported explicitly so the helper does not rely on the interactive
    # shell having made ``display`` available as a global name.
    from IPython.display import Markdown, display

    display(Markdown(string))

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the pandas/scipy calls
# further down; consider a narrower filter.
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Read the CSV into df_ and work on a copy (df) from here on.
# NOTE(review): the absolute path is machine-specific; adjust it before
# running the notebook elsewhere.
df_ = pd.read_csv("/mnt/hdd/Datasets/grocerywebsiteabtestdata.csv")
df = df_.copy()
# Preview the first rows.
df.head()

In [None]:
def df_stats(data):
    """Print a quick overview report of a DataFrame.

    Sections, in order: shape, column dtypes, missing values per column,
    number of duplicated rows, memory usage, and the transposed
    ``describe()`` table. Every section banner is rendered through
    ``bold`` so all of them appear as Markdown.

    Args:
        data: pandas DataFrame to summarise.
    """

    def banner(title):
        # All banners share the same 50-character, '#'-padded layout.
        bold("**" + f" {title} ".center(50, "#") + "**")

    # Rendered with bold() like the other banners; plain print() would show
    # the raw "**" markup instead of a bold heading.
    banner("SHAPE")
    print(f"ROWS: {data.shape[0]}")
    print(f"COLUMNS: {data.shape[1]}")

    banner("TYPES")
    print(data.dtypes)

    banner("MISSING VALUES")
    print(data.isnull().sum())

    banner("DUPLICATED VALUES")
    print(f"NUMBER OF DUPLICATED VALUES: {data.duplicated().sum()}")

    banner("MEMORY USAGE")
    buf = io.StringIO()
    # Request the memory line explicitly so it is present regardless of the
    # pandas "display.memory_usage" option.
    data.info(buf=buf, memory_usage=True)
    # Find the line by its prefix rather than by its position in the output.
    memory_line = next(
        (
            line
            for line in reversed(buf.getvalue().splitlines())
            if line.startswith("memory usage")
        ),
        "",
    )
    info = memory_line.partition(":")[2].strip()
    print(f"Memory Usage: {info}")

    banner("DESCRIBE")
    print(data.describe().T)

In [None]:
# Print the overview report (shape, dtypes, missing values, duplicates,
# memory usage, describe) for the loaded data.
df_stats(df)

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Number of rows for each LoggedInFlag value.
df["LoggedInFlag"].value_counts()

In [None]:
# Number of rows for each ServerID value.
df["ServerID"].value_counts()

In [None]:
# Number of rows for each VisitPageFlag value.
df["VisitPageFlag"].value_counts()

In [None]:
# Pie chart of the LoggedInFlag distribution; slices are labelled with
# their percentage to two decimals.
df["LoggedInFlag"].value_counts().plot(kind="pie", autopct="%.2f%%")

# Data Preprocessing

In [None]:
# Remove the RecordID column from the working frame (modified in place).
df.drop(columns=["RecordID"], inplace=True)

In [None]:
# Keep only the rows whose LoggedInFlag equals 0, then preview the result.
flag_is_zero = df["LoggedInFlag"] == 0
df = df.loc[flag_is_zero]
df.head()

In [None]:
# Collapse the frame to one row per (IP Address, ServerID) pair, summing
# VisitPageFlag within each pair; the keys stay as regular columns.
group_keys = ["IP Address", "ServerID"]
grouped_flags = df.groupby(group_keys, as_index=False)["VisitPageFlag"]
df = grouped_flags.sum()
df.head()

In [None]:
# Smallest summed VisitPageFlag across the grouped rows.
df["VisitPageFlag"].min()

In [None]:
# Largest summed VisitPageFlag across the grouped rows.
df["VisitPageFlag"].max()

In [None]:
# Binary indicator: 1 when the summed VisitPageFlag is non-zero, otherwise 0.
# A vectorised comparison replaces the per-row Python lambda.
df["isVisited"] = (df["VisitPageFlag"] != 0).astype(int)

In [None]:
# Preview the frame with the new isVisited column.
df.head()

In [None]:
# Relabel the numeric ServerID values with group names: 1 becomes
# "Treatment", 2 and 3 become "Control".
group_by_server = {1: "Treatment", 2: "Control", 3: "Control"}
df["ServerID"] = df["ServerID"].map(group_by_server)

In [None]:
# Bar chart of the number of rows per ServerID label, with the count
# written on top of each bar.
fig, ax = plt.subplots()
sns.countplot(data=df, x="ServerID", ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title("Control - Treatment")
plt.show()

In [None]:
# Split the frame into the two experiment groups by their ServerID label.
group_label = df["ServerID"]
treatment = df.loc[group_label == "Treatment"]
control = df.loc[group_label == "Control"]

In [None]:
# Summary statistics of the treatment group, one row per column.
treatment.describe().T

In [None]:
# Summary statistics of the control group, one row per column.
control.describe().T

# Hypothesis

- H0: M1 = M2
- H1: M1 != M2

In [None]:
def normality_test(control_group, treatment_group):
    """Run ``scipy.stats.shapiro`` on each group and print the results.

    The control group is tested and reported first, then the treatment
    group. For each one the statistic and p-value are printed with four
    decimals. Nothing is returned.

    Args:
        control_group: Sample values of the control group.
        treatment_group: Sample values of the treatment group.
    """
    labelled_samples = (
        ("Control Group:", control_group),
        ("\nTreatment Group:", treatment_group),
    )
    for heading, sample in labelled_samples:
        statistic, p_value = shapiro(sample)
        print(heading)
        print(f" -T Stat: {statistic: .4f}")
        print(f" -P Value: {p_value: .4f}")

In [None]:
# Normality check on the isVisited indicator of each group.
# NOTE(review): isVisited only takes the values 0 and 1, so a test on
# proportions may fit better — confirm the intended methodology.
normality_test(control["isVisited"], treatment["isVisited"])

In [None]:
def levene_test(control_group, treatment_group):
    """Run ``scipy.stats.levene`` on the two groups and print the result.

    Prints the test statistic and the p-value, each with four decimals.
    Nothing is returned.

    Args:
        control_group: Sample values of the control group.
        treatment_group: Sample values of the treatment group.
    """
    statistic, p_value = levene(control_group, treatment_group)
    report_lines = [
        f"T Stat: {statistic: .4f}",
        f"P Value: {p_value: .4f}",
    ]
    print("\n".join(report_lines))

In [None]:
# Levene test on the isVisited indicator of the two groups.
levene_test(control["isVisited"], treatment["isVisited"])

In [None]:
def mannwhitney_u_test(control_group, treatment_group):
    """Run ``scipy.stats.mannwhitneyu`` on the two groups and print the result.

    Prints the U statistic and the p-value, each with four decimals.
    Nothing is returned.

    Args:
        control_group: Sample values of the control group.
        treatment_group: Sample values of the treatment group.
    """
    statistic, p_value = mannwhitneyu(control_group, treatment_group)
    for label, value in (("U Stat", statistic), ("P Value", p_value)):
        print(f"{label}: {value: .4f}")

In [None]:
# Mann-Whitney U test on the isVisited indicator of the two groups.
mannwhitney_u_test(control["isVisited"], treatment["isVisited"])

In [None]:
# Number of rows for every (ServerID, isVisited) combination; the count is
# stored in the VisitPageFlag column and the keys stay as regular columns.
counts = df.groupby(by=["ServerID", "isVisited"], as_index=False)["VisitPageFlag"].count()
counts

In [None]:
# Pivot the per-combination counts into a ServerID x isVisited table;
# margins=True adds the "All" row and column totals.
# The aggregation is given by name: passing the NumPy callable np.sum as
# aggfunc is deprecated in recent pandas releases.
cross_counts = pd.crosstab(
    index=counts["ServerID"],
    columns=counts["isVisited"],
    values=counts["VisitPageFlag"],
    aggfunc="sum",
    margins=True,
)
cross_counts

In [None]:
# Add, for each isVisited value (0 and 1), its percentage share of the
# row total found in the "All" column.
for visited_value in (0, 1):
    share = cross_counts[visited_value] / cross_counts["All"]
    cross_counts[f"{visited_value}_Percent%"] = 100 * share
cross_counts