In [10]:
import numpy as np
import pandas as pd
from scipy import stats

Let's assume we use 0.05 as the significance level.

## Get the data ready

In [4]:
# Load the experiment data; later cells read its `advertisement_id` and
# `action` columns.
data = pd.read_csv(filepath_or_buffer='advertisement_clicks.csv')

In [6]:
# Split the `action` column into one Series per advertisement variant,
# selecting rows and column in a single .loc lookup.
a = data.loc[data['advertisement_id'] == 'A', 'action']
b = data.loc[data['advertisement_id'] == 'B', 'action']

### Built-in T-test

In [27]:
# Student's two-sample t-test: equal_var=True (the default) pools the two
# sample variances, i.e. it assumes both groups share one population variance.
t, p = stats.ttest_ind(a, b, equal_var=True)

In [28]:
# Report the test statistic and the two-sided p-value, one per line.
print(f"t: {t}", f"p: {p}", sep="\n")

t: -3.2211732138019786
p: 0.0012971905467125246


Since the p-value is below the 0.05 significance level, we can reject the null hypothesis that advertisements A and B have the same mean `action`.

Since we don't know whether the population variances are equal, let's also try Welch's t-test, which does not assume equal variances.

### Welch's T-test

In [29]:
# Welch's t-test: the two groups are not assumed to share a common variance.
t, p = stats.ttest_ind(
    a,
    b,
    equal_var=False,
)

In [30]:
# Report the test statistic and the two-sided p-value, one per line.
print(f"t: {t}", f"p: {p}", sep="\n")

t: -3.2211732138019786
p: 0.0012972410374001632


### Manual T-test

In [31]:
# Student's two-sample t-test computed by hand, to compare against
# stats.ttest_ind(a, b) with its default equal_var=True.
n_a, n_b = len(a), len(b)

# Unbiased (ddof=1) sample variance of each group.
a_var = a.var(ddof=1)
b_var = b.var(ddof=1)

# Pooled standard deviation: each variance is weighted by its own degrees of
# freedom, so the formula stays valid when the groups differ in size.
# When n_a == n_b it reduces to sqrt((a_var + b_var) / 2).
df = n_a + n_b - 2  # degrees of freedom
s = np.sqrt(((n_a - 1) * a_var + (n_b - 1) * b_var) / df)

# Standard error of the difference in means is s * sqrt(1/n_a + 1/n_b).
t = (a.mean() - b.mean()) / (s * np.sqrt(1.0 / n_a + 1.0 / n_b))

# Two-sided p-value. The survival function sf(x) = 1 - cdf(x) is evaluated
# directly, avoiding the loss of precision of `1 - cdf(...)` for large |t|.
p = 2 * stats.t.sf(np.abs(t), df=df)

print(f"t: {t}")
print(f"p: {p}")

t: -3.221173213801978
p: 0.0012971905467125122
