In [25]:

import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.api as sms
from scipy.stats import ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu, \
    pearsonr, spearmanr, kendalltau, f_oneway, kruskal
from statsmodels.stats.proportion import proportions_ztest

# Notebook-wide pandas display settings: show every column, truncate long
# frames to 10 printed rows, and render floats with five decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10
pd.options.display.float_format = lambda value: '%.5f' % value

In [26]:
# Load the control-group sheet of the A/B testing workbook.
df_control = pd.read_excel(
    "ab_testing.xlsx",
    sheet_name="Control Group",
)

In [27]:
# Preview the first five control-group rows.
df_control.head(n=5)

Unnamed: 0,Impression,Click,Purchase,Earning
0,82529.45927,6090.07732,665.21125,2311.27714
1,98050.45193,3382.86179,315.08489,1742.80686
2,82696.02355,4167.96575,458.08374,1797.82745
3,109914.4004,4910.88224,487.09077,1696.22918
4,108457.76263,5987.65581,441.03405,1543.72018


In [28]:
# Print column dtypes, non-null counts and memory usage of the control group.
df_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Impression  40 non-null     float64
 1   Click       40 non-null     float64
 2   Purchase    40 non-null     float64
 3   Earning     40 non-null     float64
dtypes: float64(4)
memory usage: 1.4 KB


In [29]:
# Summary statistics of the control group, one row per metric.
df_control.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Impression,40.0,101711.44907,20302.15786,45475.94296,85726.69035,99790.70108,115212.81654,147539.33633
Click,40.0,5100.65737,1329.9855,2189.75316,4124.30413,5001.2206,5923.8036,7959.12507
Purchase,40.0,550.89406,134.1082,267.02894,470.09553,531.20631,637.95709,801.79502
Earning,40.0,1908.5683,302.91778,1253.98952,1685.8472,1975.16052,2119.80278,2497.29522


In [30]:
# Count missing values per column in the control group.
df_control.isna().sum()

Impression    0
Click         0
Purchase      0
Earning       0
dtype: int64

In [31]:
# Load the test-group sheet of the A/B testing workbook.
df_test = pd.read_excel(
    "ab_testing.xlsx",
    sheet_name="Test Group",
)

In [32]:
# Preview the first five test-group rows.
df_test.head(n=5)


Unnamed: 0,Impression,Click,Purchase,Earning
0,120103.5038,3216.54796,702.16035,1939.61124
1,134775.94336,3635.08242,834.05429,2929.40582
2,107806.62079,3057.14356,422.93426,2526.24488
3,116445.27553,4650.47391,429.03353,2281.42857
4,145082.51684,5201.38772,749.86044,2781.69752


In [33]:
# Count missing values per column in the test group.
df_test.isna().sum()

Impression    0
Click         0
Purchase      0
Earning       0
dtype: int64

In [34]:
# Print column dtypes, non-null counts and memory usage of the test group.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Impression  40 non-null     float64
 1   Click       40 non-null     float64
 2   Purchase    40 non-null     float64
 3   Earning     40 non-null     float64
dtypes: float64(4)
memory usage: 1.4 KB


In [35]:
# Summary statistics of the test group, one row per metric.
df_test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Impression,40.0,120512.41176,18807.44871,79033.83492,112691.97077,119291.30077,132050.57893,158605.92048
Click,40.0,3967.54976,923.09507,1836.62986,3376.81902,3931.3598,4660.49791,6019.69508
Purchase,40.0,582.1061,161.15251,311.62952,444.62683,551.35573,699.86236,889.91046
Earning,40.0,2514.89073,282.73085,1939.61124,2280.53743,2544.66611,2761.5454,3171.48971


 Adım 3: Analiz işleminden sonra concat metodunu kullanarak kontrol ve test grubu verilerini birleştiriniz.

In [36]:
# Tag every test-group row with a constant label so the rows stay identifiable
# after concatenation. Assigning a scalar broadcasts to the frame's actual
# length, instead of relying on a hard-coded row count of 40 that would raise
# a ValueError as soon as the sheet has a different number of rows.
df_test["Test_Group_Ok"] = "Test_Group_Ok"

In [37]:
# Display the test group including the newly added label column
# (output is truncated by the display.max_rows setting).
df_test

Unnamed: 0,Impression,Click,Purchase,Earning,Test_Group_Ok
0,120103.50380,3216.54796,702.16035,1939.61124,Test_Group_Ok
1,134775.94336,3635.08242,834.05429,2929.40582,Test_Group_Ok
2,107806.62079,3057.14356,422.93426,2526.24488,Test_Group_Ok
3,116445.27553,4650.47391,429.03353,2281.42857,Test_Group_Ok
4,145082.51684,5201.38772,749.86044,2781.69752,Test_Group_Ok
...,...,...,...,...,...
35,79234.91193,6002.21358,382.04712,2277.86398,Test_Group_Ok
36,130702.23941,3626.32007,449.82459,2530.84133,Test_Group_Ok
37,116481.87337,4702.78247,472.45373,2597.91763,Test_Group_Ok
38,79033.83492,4495.42818,425.35910,2595.85788,Test_Group_Ok


In [38]:
# Tag every control-group row with a constant label so the rows stay
# identifiable after concatenation. Assigning a scalar broadcasts to the
# frame's actual length, instead of relying on a hard-coded row count of 40
# that would raise a ValueError as soon as the sheet has a different number
# of rows.
df_control["Test_Control_Ok"] = "Test_Control_Ok"

In [39]:
# Stack the control rows on top of the test rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both groups keep their own
# 0..39 labels, so every index label would occur twice and label-based lookups
# would return two rows.
# NOTE: the two frames carry differently named label columns, so each label
# column is NaN for the rows of the other group.
df_Concat_List = pd.concat([df_control, df_test], axis=0, ignore_index=True)

In [40]:
# Display the combined control + test frame
# (output is truncated by the display.max_rows setting).
df_Concat_List

Unnamed: 0,Impression,Click,Purchase,Earning,Test_Control_Ok,Test_Group_Ok
0,82529.45927,6090.07732,665.21125,2311.27714,Test_Control_Ok,
1,98050.45193,3382.86179,315.08489,1742.80686,Test_Control_Ok,
2,82696.02355,4167.96575,458.08374,1797.82745,Test_Control_Ok,
3,109914.40040,4910.88224,487.09077,1696.22918,Test_Control_Ok,
4,108457.76263,5987.65581,441.03405,1543.72018,Test_Control_Ok,
...,...,...,...,...,...,...
35,79234.91193,6002.21358,382.04712,2277.86398,,Test_Group_Ok
36,130702.23941,3626.32007,449.82459,2530.84133,,Test_Group_Ok
37,116481.87337,4702.78247,472.45373,2597.91763,,Test_Group_Ok
38,79033.83492,4495.42818,425.35910,2595.85788,,Test_Group_Ok




#####################################################
# Görev 2:  A/B Testinin Hipotezinin Tanımlanması
#####################################################



# Adım 1: Hipotezi tanımlayınız.

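# H0: M1 = M2 (Kontrol ve test gruplarının Purchase ortalamaları arasında istatistiksel olarak anlamlı bir fark yoktur.)
# H1: M1 != M2 (Kontrol ve test gruplarının Purchase ortalamaları arasında istatistiksel olarak anlamlı bir fark vardır.)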





# Adım 2: Kontrol ve test grubu için Purchase (satın alma) ortalamalarını analiz ediniz


In [41]:
# Mean Purchase of the test group.
df_test.loc[:, "Purchase"].mean()

582.1060966484677

In [42]:
# Mean Purchase of the control group.
df_control.loc[:, "Purchase"].mean()

550.8940587702316

Test grubunun Purchase ortalaması (582.11), kontrol grubunun ortalamasından (550.89) daha yüksek görünüyor. Ancak bu farkın şans eseri ortaya çıkıp çıkmadığını anlamak için farkı istatistiksel olarak da test etmemiz gerekiyor.


#####################################################
# GÖREV 3: Hipotez Testinin Gerçekleştirilmesi
#####################################################

######################################################
# AB Testing (Bağımsız İki Örneklem T Testi)
######################################################



# Adım 1: Hipotez testi yapılmadan önce varsayım kontrollerini yapınız. Bunlar Normallik Varsayımı ve Varyans Homojenliğidir.

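Normallik Varsayımı (Shapiro-Wilk testi) aşağıdadır.
H0: Normal dağılım varsayımı sağlanmaktadır.
H1: Normal dağılım varsayımı sağlanmamaktadır.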

In [43]:
# Shapiro-Wilk normality test on the test group's Purchase values.
# H0: the sample is normally distributed.
# Decision rule: reject H0 only if p-value < 0.05. In the output of this cell
# the p-value is above 0.05, so H0 cannot be rejected and the normality
# assumption is treated as satisfied for the test group.
normality_test_group = shapiro(df_test["Purchase"])
test_stat, pvalue = normality_test_group
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.9589, p-value = 0.1541


In [44]:
# Shapiro-Wilk normality test on the control group's Purchase values.
# H0: the sample is normally distributed.
# Decision rule: reject H0 only if p-value < 0.05. In the output of this cell
# the p-value is above 0.05, so H0 cannot be rejected for the control group.
normality_control_group = shapiro(df_control["Purchase"])
test_stat, pvalue = normality_control_group
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.9773, p-value = 0.5891


# Varyans homojenliği
# H0: Varyanslar Homojendir
# H1: Varyanslar Homojen Değildir

In [45]:

# Levene's test for homogeneity of variances between the two groups' Purchase
# values (H0: variances are homogeneous; H1: they are not).
# Missing values are dropped from each sample before testing.
# Decision rule: reject H0 only if p-value < 0.05. In the output of this cell
# the p-value is above 0.05, so H0 cannot be rejected and the variances are
# treated as homogeneous.
test_group_purchase = df_test["Purchase"].dropna()
control_group_purchase = df_control["Purchase"].dropna()
test_stat, pvalue = levene(test_group_purchase, control_group_purchase)

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 2.6393, p-value = 0.1083




# Adım 2: Normallik Varsayımı ve Varyans Homojenliği sonuçlarına göre uygun testi seçiniz

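Her iki grupta da normallik varsayımı reddedilemediği (Shapiro-Wilk p > 0.05) ve varyanslar homojen olduğu (Levene p > 0.05) için parametrik test olan bağımsız iki örneklem t testini (equal_var=True) seçtik.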


In [46]:

# Independent two-sample t-test on Purchase (test group vs. control group),
# using the pooled-variance form (equal_var=True).
# H0: the two group means are equal; H1: they differ.
# Decision rule: reject H0 only if p-value < 0.05. In the output of this cell
# the p-value is above 0.05, so H0 cannot be rejected: there is no
# statistically significant difference between the groups' mean Purchase.
purchase_samples = (df_test["Purchase"], df_control["Purchase"])
test_stat, pvalue = ttest_ind(*purchase_samples, equal_var=True)

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))


Test Stat = 0.9416, p-value = 0.3493
