# MADGICX's standard adset performance evalution

__Diagram of the ad set__ [win lose evaluation flow](https://app.diagrams.net/#G1GMSCJ_EQPm-NUYhxQVNe7bFD-vVQk46P#%7B%22pageId%22%3A%22VbIS5GhQtAIJzmuHBgNp%22%7D) 

### Check statistical significance

* Volumne improving over last 7 days for:
   * impressions: number ad view adset
   * clicks : number of ad clicks
   * conversions: number of subsequent click post ad click
   * CTR : Click Through Rate = number of clicks / number of impressions
* plot time series of daily mean of performance metric indicators
* plot time series of daily adsets below and above performance metric average
* setup to run plots for any objectives

In [1]:
'''
    WARNING CONTROL to display or ignore all warnings
'''
import warnings; warnings.simplefilter('ignore')     #switch betweeb 'default' and 'ignore'
import traceback

''' Set debug flag to view extended error messages; else set it to False to turn off debugging mode '''
debug = True


## Instantiate Classes

In [2]:
import os
import sys
import numpy as np
from pyspark.sql import functions as F
# import tensorflow_probability as tfp
# import tensorflow as tf

proj_dir = os.path.abspath(os.pardir)
sys.path.insert(1,proj_dir.split('mining/')[0])
# from rezaware.modules.etl.loader import sparkRDBM as db
from rezaware.modules.ml.timeseries import rollingstats as roll
from mining.modules.budget.optimization import rwAdsDataFile as file
from rezaware.modules.lib.spark import execSession

''' restart initiate classes '''
if debug:
    import importlib
    roll = importlib.reload(roll)
    file=importlib.reload(file)
    # attr=importlib.reload(attr)

__desc__ = "read and write dataset for MCMC sampling"

clsFile = file.dataWorkLoads(
    desc = "optimizing action_type budgets for an ad",
    f_store_mode='local-fs',
    f_store_root=proj_dir.split('mining/')[0],
    jar_dir=None,
)
clsStats = roll.mlWorkLoads(desc=__desc__)
clsSpark = execSession.Spawn()
print("\n%s class initialization and load complete!" % __desc__)

All functional __PROPATTR__-libraries in TIMESERIES-package of ML-module imported successfully!
All packages in rezaware ml timeseries RollingStats imported successfully!
All functional __PROPATTR__-libraries in OPTIMIZATION-package of BUDGET-module imported successfully!
All functional RWADSDATA-libraries in OPTIMIZATION-package of BUDGET-module imported successfully!
All packages in rezaware ml timeseries RollingStats imported successfully!
All functional RWADSDATA-libraries in OPTIMIZATION-package of BUDGET-module imported successfully!
All functional APP-libraries in REZAWARE-package of REZAWARE-module imported successfully!
__propAttr__ Class initialization complete
All functional __PROPATTR__-libraries in LOADER-package of ETL-module imported successfully!
All functional SPARKFILE-libraries in LOADER-package of ETL-module imported successfully!
__propAttr__ Class initialization complete
sparkFile Class initialization complete
rwAdsData Class initialization complete
All functional

In [21]:
_grp_by_date = sdf.groupBy(F.col('adset_id'), F.col('updated_time'))\
                .agg(F.sum(_num_col).alias(f"sum_{_num_col}"))\
                .select('updated_time', 'adset_id', f"sum_{_num_col}")\
                .orderBy('adset_id','updated_time')
_grp_by_date.show()



+------------+-------------+-----------+
|updated_time|     adset_id|sum_to_cart|
+------------+-------------+-----------+
|  2025-02-06|6287207374958|       21.0|
|  2025-02-02|6290983114158|       null|
|  2025-02-06|6290983118558|      143.0|
|  2025-01-25|6333951807635|       null|
|  2025-01-25|6333951807835|       null|
|  2025-01-25|6333951809035|       null|
|  2025-01-25|6333951810435|       null|
|  2025-01-25|6333960414435|       null|
|  2025-01-25|6333971194235|       null|
|  2025-01-25|6335138014835|       null|
|  2025-01-23|6336296098761|       39.4|
|  2025-01-23|6336627585761|     583.18|
|  2025-01-25|6340897185635|       null|
|  2025-01-25|6340897185835|       null|
|  2025-01-25|6340897186035|       null|
|  2025-01-25|6340897186235|       null|
|  2025-01-25|6340897212635|       null|
|  2025-01-25|6340897213035|       null|
|  2025-01-25|6340897213235|       null|
|  2025-01-25|6340897213635|       null|
+------------+-------------+-----------+
only showing top

                                                                                

In [118]:
from datetime import date, timedelta, datetime
from pyspark.sql.types import StringType, DateType
from pyspark.sql import Window

''' consecutive list of dates '''
_max_date = _grp_by_date.select(F.max('updated_time')).first()[0]
_min_date = _grp_by_date.select(F.min('updated_time')).first()[0]
_dates_lst=[_min_date+timedelta(days=d) 
            for d in range(0, (_max_date-_min_date).days, 1)]
print("Created %d days between min date: %s and max date: %s" 
      % (len(_dates_lst), str(_min_date),str(_max_date)))
_adset_ids_lst= [x['adset_id'] for x in _grp_by_date.select(F.col('adset_id')).distinct().collect()]
print("Number of unique adset ids:", len(_adset_ids_lst))

''' create cross joined complete date and adset ids sdf '''
adset_ids_sdf = clsSpark.session.createDataFrame(
    _adset_ids_lst,StringType())\
    .withColumnRenamed('value','adset_id')

dates_sdf = clsSpark.session.createDataFrame(
    _dates_lst,DateType())\
    .withColumnRenamed('value','updated_time')
all_dates_ids_sdf = adset_ids_sdf.crossJoin(dates_sdf)

print("Crossjoined rows =",all_dates_ids_sdf.count())

execSession Class initialization complete


                                                                                

Created 27 days between min date: 2025-01-22 and max date: 2025-02-18


                                                                                

Number of unique adset ids: 3996
Crossjoined rows = 107892


In [119]:

w = Window.partitionBy("adset_id").orderBy("updated_time")

_imputed_sdf = all_dates_ids_sdf\
                    .join(_grp_by_date, ["adset_id", "updated_time"], "left")\
                    .select("adset_id","updated_time",
                            *[F.last(F.col(c), ignorenulls=True).over(w).alias(c)
                              for c in _grp_by_date.columns 
                                  if c not in ("adset_id", "updated_time")])\
                    .dropna()

print("Imputed sdf rows = ",_imputed_sdf.count())

                                                                                

Imputed sdf rows =  7200


In [120]:
_imputed_sdf.printSchema()

root
 |-- adset_id: string (nullable = true)
 |-- updated_time: date (nullable = true)
 |-- sum_to_cart: double (nullable = true)



In [121]:
_num_col = "sum_to_cart"
kwargs = {
    "RESULTCOL" : f"sma_{_num_col}"
}
_moving_sdf = clsStats.simple_moving_stats(
    num_col =_num_col,  # numeric column name to apply the rolling computation
    date_col='updated_time',  # datetime column name to use as the time stamp
    part_col='adset_id',  # partition column name to apply rolling stats to windows
    win_len =7,   # window length in days, hous, min
    win_unit='DAY', # window length unit of measure by days, hours, minutes
    stat_op ="mean", # stat operation sum, mean or standard deviation
    data = _imputed_sdf,   # data set; that can be converted to pyspark DataFrame
    **kwargs,    # key/value pairs to set other parameters
    )

                                                                                

In [113]:
print("Simple moving average results = ",_moving_sdf.count())

                                                                                

5743

In [117]:
_moving_sdf.filter(F.col('adset_id')==120207252077210665)\
    .select('adset_id', 'updated_time', 'to_cart', 'sma_to_cart').distinct().count()

                                                                                

26

In [111]:
", ".join([x['adset_id'] for x in _moving_sdf.select('adset_id').distinct().collect()])

                                                                                

'120203730642500265, 120205561409390341, 120207252077210665, 120207252077360665, 120207284214170036, 120209700380740741, 120210838557760640, 120210902135400093, 120211073424860341, 120211474905670233, 120212516385180167, 120214458651670383, 120214603481340347, 120214603779340347, 120214775799110121, 120214790928760525, 120214807877110190, 120214824975810036, 120214825030090036, 120214938246210218, 120215122519670081, 120215217528090719, 120215312683850513, 120215424357560510, 120215457546220647, 120215467439550383, 120215546238570767, 120215583293450668, 120215669666630479, 120215749346680165, 120216134870020064, 120216404344810011, 120216406583180011, 120216442107970676, 120216568560120526, 120216958789620341, 120217390781250360, 120217840667850729, 120218147182050735, 120218147182080735, 120218161599550735, 120218161599660735, 120218161599670735, 120218183358530428, 120218187602110735, 120218187602330735, 120218187602340735, 120218187602360735, 120218187602390735, 120222407741940059,

## Load data
Load from file

In [4]:
_fpath = "mining/data/budget/"
_fname = "FullDataset.csv"

__def_date_attr__="updated_time"
__def_realm__='OBJECTIVE'
__def_obj__ = 'OUTCOME_LEADS'

kwargs = {
    "REALMFILTATTR" : 'objective',
    "REALMFILTLIST" : [__def_obj__],
    "UNIXTIMESTAMP" : __def_date_attr__,
}
sdf = clsFile.read_realm(
    realm = __def_realm__,
    to_date = None,
    from_date=None,
    fname = _fname,
    fpath = _fpath,
    **kwargs,
)
print("Loaded %s %d rows" % (_fname, sdf.count()))# sdf.printSchema()




Loaded FullDataset.csv 58280 rows


                                                                                

In [None]:
__def_obj_col__ = 'objective'
__def_opt_goal__= 'checkout'  # action group
__def_dt_cols__ = [f"unix_{__def_date_attr__}", __def_date_attr__]
__def_ids_cols__ = ['adset_id']
__def_ad_met_cols__ = ['spend','impressions', 'clicks', 'reach', 
                       'frequency', 'CTR', 'CPP', 'CPC', 'CPM']
__def_roas_cols__ = ['purchase_value', 'purchase_roas']

_goal_act_cols = [x for x in sdf.columns
                  if x.find(__def_opt_goal__)==0
                  and x not in [*__def_ids_cols__, *__def_ad_met_cols__, 
                                *__def_dt_cols__, *__def_roas_cols__]]

''' Remove goal action cols not greater than zero '''
count_dict= {}
for act  in _goal_act_cols:
    count_dict[act]=sdf.select(act).dropna().count()

_act_met_cols_lst = [k for k,v in count_dict.items() if v>0]
_act_met_cols_lst

''' select data for relevant columns '''
sdf=sdf.select(*__def_ids_cols__, *__def_dt_cols__, __def_obj_col__,
                         *__def_roas_cols__, *__def_ad_met_cols__, *_act_met_cols_lst)\
                .orderBy(*__def_dt_cols__)

sdf.printSchema()

### Retrieve and rename conversion columns

In [9]:
# Run cell to list objectives and metrics
conv_col_lst = [c for c in sdf.columns if c.find(__def_opt_goal__)==0]
new_cols_lst = [c.replace('__',' ').replace('_',' ') for c in conv_col_lst]
new_cols_lst = ["_".join(c.split()[-2:]) for c in new_cols_lst]

for _old_col, _new_col in zip(conv_col_lst, new_cols_lst):
    sdf=sdf.withColumnRenamed(_old_col, _new_col)

print("Converted %d columns" % len(new_cols_lst))

Converted 3 columns


## Compute the 7-day moving average of adset