In [4]:
#importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import scipy.stats as scs
import random
import math
import random

In [5]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
warnings.simplefilter('ignore')

In [3]:
# Load the raw A/B test responses; the path is relative to the notebook location.
df = pd.read_csv(filepath_or_buffer='../data/AdSmartABdata.csv')

In [9]:
def _bernoulli_series(engagement, success, rng=np.random):
    '''
    Expand per-group counts into one flat array of 0/1 outcomes.

    For every (engagement, success) pair a block of `engagement` values is
    built that contains exactly `success` ones; the position of the ones is
    shuffled inside the block. Blocks are concatenated in input order.

    Example
    -------
    engagement = np.array([5, 3, 3]) and success = np.array([2, 0, 3]) may
    give [1 0 0 1 0 0 0 0 1 1 1]: 2 of the first 5 values are ones, 0 of the
    next 3, and all of the last 3.

    Parameters
    ----------
    engagement : iterable of int, number of observations in each group
    success    : iterable of int, number of ones in each group
    rng        : object exposing `permutation` (np.random or a RandomState)

    Returns
    -------
    1-D integer numpy array of length sum(engagement); empty if there are
    no groups.
    '''
    blocks = []
    for total, ones in zip(engagement, success):
        block = np.zeros(int(total), dtype=int)
        block[:int(ones)] = 1
        # permutation returns a shuffled copy, so only the positions of the
        # ones are random -- their count stays exact.
        blocks.append(rng.permutation(block))

    if not blocks:
        # np.concatenate refuses an empty list
        return np.array([], dtype=int)
    return np.concatenate(blocks)


def transform_data(df, seed=None):
    '''
    Segment the responses into exposed and control Bernoulli series.

    Steps
    -----
    1. Drop users that did not interact (neither yes nor no equals 1).
    2. Split the remaining rows into the 'exposed' and 'control' groups.
    3. The experiment runs hourly, so rows are bucketed by a date+hour key
       formatted as '%Y-%m-%d:%H'.
    4. For every hourly bucket, engagement (yes + no) and success (yes) are
       summed and expanded into a 0/1 block that contains exactly `success`
       ones at random positions (see _bernoulli_series). Buckets are laid
       out in chronological order.

    Parameters
    ----------
    df   : DataFrame with the columns 'experiment', 'date', 'hour', 'yes'
           and 'no'. It is not modified.
    seed : optional int. When given, the shuffling is reproducible; when
           None the global numpy random state is used.

    Returns
    -------
    (exposed, control) : two 1-D integer numpy arrays of 1 (yes) and 0 (no),
    in that order. A group without any response yields an empty array.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Filter out users that were not interacting
    responded = df[(df['yes'] == 1) | (df['no'] == 1)]

    def hourly_series(group_name):
        group = responded[responded['experiment'] == group_name]

        # Build the date+hour bucket key; the zero-padded format sorts
        # chronologically, which groupby relies on below.
        date_hour = (
            pd.to_datetime(group['date'])
            + pd.to_timedelta(group['hour'].astype(int), unit='h')
        ).dt.strftime('%Y-%m-%d:%H')

        # assign() returns a new frame, so neither df nor the filtered
        # slice is written to (no SettingWithCopyWarning).
        hourly = (
            group.assign(
                date_hour=date_hour,
                engagement=group['yes'] + group['no'],
                success=group['yes'],
            )
            .groupby('date_hour')[['engagement', 'success']]
            .sum()
        )

        return _bernoulli_series(
            hourly['engagement'].to_numpy(),
            hourly['success'].to_numpy(),
            rng,
        )

    exposed = hourly_series('exposed')
    control = hourly_series('control')

    return exposed, control

In [10]:
# transform_data returns (exposed, control) in that order; unpacking in the
# same order keeps each name bound to its own group.
exposed, control = transform_data(df)
print(len(exposed), len(control))

657 586


In [11]:
# List the columns by dtype: object columns are reported as categorical,
# every other dtype as numerical.
numerical_column = [column for column in df.select_dtypes(exclude="object")]
categorical_column = [column for column in df.select_dtypes(include="object")]
print("Numerical Columns:", numerical_column)
print("Categorical Columns:", categorical_column)

Numerical Columns: ['hour', 'platform_os', 'yes', 'no']
Categorical Columns: ['auction_id', 'experiment', 'date', 'device_make', 'browser']


In [12]:
# Group by the plain column name rather than a one-element list: with a
# length-1 list, recent pandas versions expect a 1-tuple key in get_group,
# while a scalar grouping key works with a scalar lookup on every version.
user_group = df.groupby('experiment')
control = user_group.get_group('control')
exposed = user_group.get_group('exposed')

In [13]:
class ConditionalSPRT:
     
    def __init__(self, x, y, odd_ratio, alpha=0.05, beta=0.10, stop=None):
        self.x = x
        self.y = y
        self.odd_ratio = odd_ratio
        self.alpha = alpha
        self.beta = beta
        self.stop = stop
   
    def run(self):
        '''
        Call conditionalSPRT with the stored inputs and return its result
        unchanged.

        NOTE(review): conditionalSPRT is defined outside this class; the
        other methods unpack its result as an 11-element sequence.
        '''
        return conditionalSPRT(
            self.x,
            self.y,
            self.odd_ratio,
            self.alpha,
            self.beta,
            self.stop,
        )
    def resJson(self, res):
        outcome,n, k,l,u,truncated,truncate_decision,x1,r,stats,limits = res
        jsonRes = {
            "name": "Sequential AB testing",
            "outcome": outcome,
            "decsionMadeIndex": k,
            "numberOfObservation": len(n),
            "truncated": truncated,
            "truncateDecision": truncate_decision,        
      
        }
        return jsonRes
    
    def plotExperiment(self, res):
        '''
        Plot the test statistic together with its lower and upper bounds.

        res must unpack into exactly 11 items. The second item (n) is used
        as the x axis, the eighth (x1) as the plotted curve and the last
        (limits) as a two-column array-like: column 0 is drawn as the lower
        bound, column 1 as the upper bound.

        The figure is shown with plt.show(); nothing is returned.
        '''
        outcome, n, k, l, u, truncated, truncate_decision, x1, r, stats, limits = res

        # Accept any two-column array-like (e.g. a list of pairs), not only
        # a numpy array, before slicing by column.
        limits = np.asarray(limits)
        lower = limits[:, 0]
        upper = limits[:, 1]

        fig, ax = plt.subplots(figsize=(12, 7))

        ax.plot(n, x1, label='Cumulative value of yes+no')
        ax.plot(n, lower, label='Lower Bound', linestyle='--')
        ax.plot(n, upper, label='Upper Bound', linestyle='--')

        # Label the figure so it can be read on its own.
        ax.set_xlabel('Observation')
        ax.set_ylabel('Cumulative value')
        ax.set_title('Sequential AB testing')
        ax.legend()

        plt.show()