# A/B Testing

# Loading Libraries

In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Reading the data

In [None]:
# Location of the raw marketing dataset.
# NOTE(review): this is an absolute path at the filesystem root; confirm it
# matches where the CSV actually lives in the execution environment.
DATA_PATH = "/marketing data.csv"

# Load the dataset into the DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

# Initial inspection

In [None]:
# Preview the first rows of the dataset to get a feel for the columns.
df.head()

# Checking the datatypes

We have 6 different features

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

# Number of rows and columns

Half a million entries in our dataset

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count missing values per column.
# `isnull` is an alias of `isna` in pandas, so both reports are expected to
# show identical counts.
na_counts = df.isna().sum()
null_counts = df.isnull().sum()

print("Is NA?")
print(na_counts)
print("Is NULL?")
print(null_counts)


# Preprocessing

Convert the boolean True/False values, to int

In [None]:
# Encode the conversion flag as an integer: 1 where the value equals True,
# 0 otherwise.
is_converted = df["converted"] == True
df["converted"] = is_converted.astype(int)

In [None]:
# Remove the leftover index column written by the CSV export.
# Rebinding instead of `inplace=True`, together with `errors='ignore'`, keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Exploratory Data Analysis

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
df.describe(include='all')

In [None]:
# Number of users in each test group, printed and drawn as a pie chart.
group_sizes = df["test group"].value_counts()
print(group_sizes)

ax = group_sizes.plot(kind="pie")
ax.set_title('Views of Different Marketing campaings piechart')
plt.show()

It seems that a small portion (2.52%) of the users who viewed the different marketing campaigns converted and decided to follow the new marketing product.

In [None]:
# Share of converted vs. not-converted users across the whole dataset.
# The counts are indexed by the 0/1 values of the `converted` column.
converted_counts = df["converted"].value_counts()
total_users = converted_counts.sum()

print("% Not converted", 100 * converted_counts[0] / total_users)
print("% Converted", 100 * converted_counts[1] / total_users)

ax = converted_counts.plot(kind="pie")
ax.set_title('Converted piechart')
plt.show()

# Calculate conversion rates

In [None]:
# Conversion rate per test group as a percentage: the mean of the 0/1
# `converted` flag is the fraction of converted users.
conversion_rate_pct = df.groupby("test group")["converted"].mean() * 100
conversion_rate_pct

From these conversion rates, it can be concluded that the ad group achieved a higher conversion rate (2.5547%) compared to the PSA group (1.7854%). This indicates that the advertisements were successful in generating conversions and the overall ad campaign was effective.

Let's see how many of the customers who took part in the different marketing campaigns were affected by them and converted to the new marketing products.

In [None]:
# Cross-tabulate test group (rows) against conversion flag (columns), then
# show the same table as a stacked bar chart.
group_vs_converted = (
    df.groupby("converted")["test group"]
    .value_counts()
    .unstack('converted')
)
print(group_vs_converted)

group_vs_converted.plot(kind="bar", stacked=True)


In [None]:
# Converted vs. not-converted counts restricted to the PSA group.
psa_converted_counts = (
    df.loc[df["test group"] == "psa"]
    .groupby("converted")["test group"]
    .value_counts()
)
print(psa_converted_counts)
psa_converted_counts.plot(kind="bar", stacked=True)


In [None]:
# Converted vs. not-converted counts restricted to the ad group.
ad_converted_counts = (
    df.loc[df["test group"] == "ad"]
    .groupby("converted")["test group"]
    .value_counts()
)
print(ad_converted_counts)
ad_converted_counts.plot(kind="bar", stacked=True)

It seems Monday is the best day to air advertisements, and Tuesday is the best day for public service announcement campaigns.

In [None]:
# Total number of conversions per day (both test groups combined).
print(df.groupby("most ads day")["converted"].sum())

# Conversions per day, split by test group, as a stacked bar chart.
# The `converted` column is selected *before* aggregating so only that column
# is summed; summing the whole frame first does unnecessary work and breaks
# if a non-numeric column is present.
conversions_by_day = (
    df.groupby(by=['most ads day', 'test group'])['converted']
    .sum()
    .unstack('test group')
)
ax = conversions_by_day.plot(kind='bar', figsize=(9, 6), grid=True, stacked=True)
ax.set_ylabel('converted')
ax.set_title('Most ad days and converted')
plt.show()

### For advertisement campaigns, which day had the greatest conversion?

In [None]:
# Number of conversions per day within the ad group.
ad_conversions_by_day = (
    df.loc[df["test group"] == "ad"]
    .groupby("most ads day")["converted"]
    .sum()
)
print(ad_conversions_by_day)
ad_conversions_by_day.plot(kind="bar")

### For public service announcement campaigns, which day had the greatest conversion?

In [None]:
# Number of conversions per day within the PSA group.
psa_conversions_by_day = (
    df.loc[df["test group"] == "psa"]
    .groupby("most ads day")["converted"]
    .sum()
)
print(psa_conversions_by_day)
psa_conversions_by_day.plot(kind="bar")

## Most profitable hours with new marketing campaigns
The most profitable hour across both campaigns is the 15th hour; we need to inspect each of the 2 campaigns separately.

In [None]:
# Total number of conversions per hour of the day (both test groups combined).
print(df.groupby("most ads hour")["converted"].sum())

# Conversions per hour, split by test group, as a stacked bar chart.
conversions_by_hour = (
    df.groupby(by=['most ads hour', 'test group'])['converted']
    .sum()
    .unstack('test group')
)
ax = conversions_by_hour.plot(kind='bar', figsize=(14, 13), grid=True, stacked=True)
ax.set_ylabel('converted')
# This chart is grouped by hour; the previous title was copied from the
# per-day chart and said "days".
ax.set_title('Most ad hours and converted')
plt.show()

### For advertisement campaigns, which hour had the greatest conversion?

During the 15th hour

In [None]:
# Number of conversions per hour within the ad group.
ad_conversions_by_hour = (
    df.loc[df["test group"] == "ad"]
    .groupby("most ads hour")["converted"]
    .sum()
)
print(ad_conversions_by_hour)
ad_conversions_by_hour.plot(kind="bar")

### For public service campaigns, which hour had the greatest conversion?

During the 15th hour

In [None]:
# Number of conversions per hour within the PSA group.
psa_conversions_by_hour = (
    df.loc[df["test group"] == "psa"]
    .groupby("most ads hour")["converted"]
    .sum()
)
print(psa_conversions_by_hour)
psa_conversions_by_hour.plot(kind="bar")

# A/B Testing

In [None]:
# Split the data into the two experiment arms, each with a fresh 0..n-1 index.
in_ad_group = df["test group"].eq('ad')
in_psa_group = df["test group"].eq('psa')

ad_dist = df.loc[in_ad_group].reset_index(drop=True)
psa_dist = df.loc[in_psa_group].reset_index(drop=True)

In [None]:
# (rows, columns) of the ad group and of the PSA group, to compare group sizes.
ad_dist.shape, psa_dist.shape

## test of Normality

In [None]:
from scipy.stats import shapiro, levene, mannwhitneyu, ttest_ind

In [None]:
# Shapiro-Wilk normality test on the conversion flag of each group; only the
# p-value (second element of the result) is kept.
# NOTE(review): SciPy documents that the p-value may not be accurate for
# N > 5000 -- confirm the group sizes before relying on these numbers.
pvalue_ad = shapiro(ad_dist.converted)[1]
pvalue_psa = shapiro(psa_dist.converted)[1]
print(pvalue_ad, pvalue_psa)

## Test of homogeneity of variance

In [None]:
# Variance homogeneity check: Levene's test, whose null hypothesis is that
# both groups have equal variances (a small p-value rejects equal variances).
levene(psa_dist.converted, ad_dist.converted)

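Both p-values are below 0.05, so the null hypotheses of these tests (normally distributed data for Shapiro–Wilk, equal variances for Levene) are rejected: strictly speaking, the normality and equal-variance assumptions are **not** met.

## A/B Testing, independent T-test

Although the Shapiro–Wilk and Levene tests reject their null hypotheses, both groups are very large, so the sample means are approximately normally distributed (central limit theorem) and a t-test on the conversion rates is still a reasonable choice. Because the variances differ, Welch's variant (`equal_var=False`) is the safer option.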

In [None]:
# Test whether the mean conversion rate differs between the two groups.
# Welch's t-test (equal_var=False) is used instead of Student's pooled-variance
# test: it does not assume equal variances, which matters when Levene's test
# rejects variance homogeneity and the groups differ in size. It remains valid
# when the variances happen to be equal.
ttest_ind(psa_dist.converted, ad_dist.converted, equal_var=False)

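The results are statistically significant!
- the p-value strongly rejects the null hypothesis of equal conversion rates
- the difference in conversions is attributable to the ads, provided users were randomly assigned to the two groups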

In [None]:
# Conversion rates (in %) for the PSA group and the ad group.
# Users are counted per value of the 0/1 `converted` flag in each group.
converts_psa = psa_dist.groupby('converted')['user id'].count()
converts_ad = ad_dist.groupby('converted')['user id'].count()

# Conversion rate = converted users / ALL users in the group. Dividing by the
# non-converted count instead gives the odds, which overstates the rate.
(100 * converts_psa[1] / converts_psa.sum()), (100 * converts_ad[1] / converts_ad.sum())

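The ad group achieved a higher conversion rate than the PSA group: about 2.55% versus about 1.79% of all users in each group, matching the per-group conversion rates computed earlier. (Dividing converted users by non-converted users instead gives the odds, which are slightly larger at roughly 2.6% and 1.8%.) This indicates that the ad campaign was more effective in driving conversions than the PSA.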

The t-test p-value of 0.0000 indicates that the difference in conversion rates between the two groups is statistically significant, meaning that it is highly unlikely to have occurred by chance alone. This provides strong evidence for the observed difference between the two groups.