# Simulation Data Generator
To build a demo that illustrates how our Dynamic A/B Testing works, we used [a Kaggle dataset](https://www.kaggle.com/datasets/podsyp/how-to-do-product-analytics) to generate simulated data for each timestep under each strategy (stored in two folders, one per strategy). The demo then steps through the timesteps and, at each one, chooses from the two folders to simulate the decision-making process.

In [0]:
from pyspark.sql import functions as F
# URL processing
import urllib
import datetime

# Setup. Each strategy holds one fraction per banner, in banner order:
# [Banner 0, Banner 1]. E.g. strategy_0 is 80% Banner 0 / 20% Banner 1,
# and strategy_1 is the mirror image.
strategy_0, strategy_1 = [0.8, 0.2], [0.2, 0.8]

In [0]:
# Load the uploaded sample CSV (first row is the header, column types are
# inferred by Spark) and preview the first few rows.
my_upload_path = "dbfs:/FileStore/shared_uploads/WAN00316@umn.edu/sample_product.csv"
csv_reader = spark.read.options(header=True, inferSchema=True)
raw = csv_reader.csv(my_upload_path)
raw.show(n=5)

+------+--------+-------------------+------+
|banner| product|               time|target|
+------+--------+-------------------+------+
|     0| clothes|2019-02-01 00:00:39|   0.0|
|     1|sneakers|2019-02-01 00:00:45|   0.0|
|     1|sneakers|2019-02-01 00:01:06|   0.0|
|     0| clothes|2019-02-01 00:01:56|   0.0|
|     0| clothes|2019-02-01 00:02:22|   0.0|
+------+--------+-------------------+------+
only showing top 5 rows



In [0]:
# Generate one CSV per (strategy, timestep):
#   <data_path>/strategy_<k>/timestep_<t>.csv
# Each timestep covers one `time_step_interval`-long window of `raw`, starting
# at `start_time`; the window's rows are upsampled and then sampled per banner
# according to the strategy's fractions.
data_path = "dbfs:/FileStore/simulated_data"
tmp_path = data_path + '/tmp'  # staging folder for Spark's part-file output
timesteps = 120
start_time = datetime.datetime(2019,2,1)
time_step_interval = datetime.timedelta(hours=12)

# Values of the `banner` column, in the same order as the entries of each
# strategy list (strategy[i] is the sampling fraction for banner_ids[i]).
banner_ids = [0, 1]
strategies = [strategy_0, strategy_1]

for timestep in range(timesteps):
    end_time = start_time + time_step_interval
    # Half-open window [start_time, end_time): a row that falls exactly on a
    # boundary belongs to one timestep only, not to two consecutive ones.
    period = raw[(raw['time'] >= start_time) & (raw['time'] < end_time)]
    # Upsample by 10X since data is a bit sparse: explode a 10-element array
    # to duplicate every row, then drop the helper column.
    period_upsampled = period.withColumn(
        "dummy", F.explode(F.array([F.lit(x) for x in range(10)]))
    ).drop('dummy')
    for strategy_num, strategy in enumerate(strategies):
        # Stratify on the banner column: the strategy fractions are per banner.
        fractions = dict(zip(banner_ids, strategy))
        sampled = period_upsampled.sampleBy('banner', fractions)
        # Spark writes a directory of part files; coalesce to a single part,
        # write it to the staging folder, then move that one CSV to its final
        # name in the right strategy folder.
        sampled.coalesce(1).write.mode("overwrite").option("header", "true").csv(tmp_path)
        csv_file = [file.path for file in dbutils.fs.ls(tmp_path) if file.path.endswith('.csv')][0]
        dbutils.fs.mv(csv_file, '{}/strategy_{}/timestep_{}.csv'.format(data_path, strategy_num, timestep))
    start_time = end_time

# Remove the staging folder (and Spark's leftover marker files) so that only
# the strategy folders remain under data_path.
dbutils.fs.rm(tmp_path, True)