In [1]:
import pandas as pd
import numpy as np
from scipy import stats



In [2]:

# Load the dataset
df = pd.read_csv('online_shoppers_intention.csv')


In [3]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())


First few rows of the dataset:
   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0   Feb                 1   
1         0.00       0.10  

In [4]:

# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())



Missing values in the dataset:
Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64


In [5]:
# Basic statistics of the dataset
print("\nBasic statistics of the dataset:")
print(df.describe())



Basic statistics of the dataset:
       Administrative  Administrative_Duration  Informational  \
count    12330.000000             12330.000000   12330.000000   
mean         2.315166                80.818611       0.503569   
std          3.321784               176.779107       1.270156   
min          0.000000                 0.000000       0.000000   
25%          0.000000                 0.000000       0.000000   
50%          1.000000                 7.500000       0.000000   
75%          4.000000                93.256250       0.000000   
max         27.000000              3398.750000      24.000000   

       Informational_Duration  ProductRelated  ProductRelated_Duration  \
count            12330.000000    12330.000000             12330.000000   
mean                34.472398       31.731468              1194.746220   
std                140.749294       44.475503              1913.669288   
min                  0.000000        0.000000                 0.000000   
25%       

In [6]:
# Creating an A/B test scenario:
# Let's assume 'SpecialDay' is the intervention we're testing (e.g., a special promotion)
# We will treat SpecialDay > 0 as the treatment group and SpecialDay == 0 as the control group

df['group'] = np.where(df['SpecialDay'] > 0, 'treatment', 'control')


In [7]:
# Converting 'Revenue' to binary (0 or 1) indicating conversion
df['converted'] = df['Revenue'].astype(int)


In [8]:
# Group by control and treatment
control_group = df[df['group'] == 'control']
treatment_group = df[df['group'] == 'treatment']


In [9]:
# Calculate conversion rates
control_conversion_rate = control_group['converted'].mean()
treatment_conversion_rate = treatment_group['converted'].mean()


In [10]:
# Number of users in each group
n_control = control_group.shape[0]
n_treatment = treatment_group.shape[0]


In [11]:
# Number of conversions in each group
conversions_control = control_group['converted'].sum()
conversions_treatment = treatment_group['converted'].sum()


In [12]:
# Pooled conversion rate
conversion_rate_pooled = (conversions_control + conversions_treatment) / (n_control + n_treatment)


In [13]:
# Standard error
se = np.sqrt(conversion_rate_pooled * (1 - conversion_rate_pooled) * (1/n_control + 1/n_treatment))


In [14]:
# Z-score
z = (treatment_conversion_rate - control_conversion_rate) / se


In [15]:
# P-value
p_value = stats.norm.sf(abs(z)) * 2  # two-tailed test


In [16]:
# Output results
print(f"Control Group Conversion Rate: {control_conversion_rate:.4f}")
print(f"Treatment Group Conversion Rate: {treatment_conversion_rate:.4f}")
print(f"Pooled Conversion Rate: {conversion_rate_pooled:.4f}")
print(f"Standard Error: {se:.4f}")
print(f"Z-score: {z:.4f}")
print(f"P-value: {p_value:.4f}")


Control Group Conversion Rate: 0.1653
Treatment Group Conversion Rate: 0.0616
Pooled Conversion Rate: 0.1547
Standard Error: 0.0108
Z-score: -9.6149
P-value: 0.0000


In [17]:
# Conclusion
if p_value < 0.05:
    print("Reject the null hypothesis: The special promotion significantly increases the conversion rate.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in conversion rate.")

Reject the null hypothesis: The special promotion significantly increases the conversion rate.
