## Analyze the data to find any trends

In [1]:
# Warning control: decide whether Python warnings are displayed or suppressed.
import traceback
import warnings

# Switch the filter between 'default' (show warnings) and 'ignore' (hide them all).
warnings.simplefilter('ignore')

# Debug flag: set to True to view extended error messages;
# set to False to turn off debugging mode.
debug = True

In [2]:
import os
import sys
import numpy as np
# import findspark
# findspark.init()
from pyspark.sql import functions as F
# import tensorflow_probability as tfp
# import tensorflow as tf

# Absolute path of the parent of the current working directory.
proj_dir = os.path.abspath(os.pardir)
# Portion of that path that precedes 'mining/'; it is used both as an import
# root and as the storage root handed to the loader class below.
_root_dir = proj_dir.split('mining/')[0]
sys.path.insert(1, _root_dir)

# NOTE(review): presumably the rezaware package is resolved through the path
# inserted above, so this import has to stay after the sys.path change.
from rezaware.modules.etl.loader import sparkFile as file
# Other loader modules, currently disabled:
# from rezaware.modules.etl.loader import sparkRDBM as db
# from rezaware.modules.etl.loader import __propAttr__ as attr

# In debug mode, re-import the loader module before instantiating its class.
if debug:
    import importlib
    file = importlib.reload(file)
    # db = importlib.reload(db)
    # attr=importlib.reload(attr)

__desc__ = "read and write BigQuery dataset for hypothese testing"

# Instantiate the file workload class against the local file system,
# rooted at the same path prefix that was added to sys.path.
clsFile = file.dataWorkLoads(
    desc="optimizing action_type budgets for an ad",
    store_mode='local-fs',
    store_root=_root_dir,
    jar_dir=None,
)

print("\n%s class initialization and load complete!" % __desc__)

All functional __PROPATTR__-libraries in LOADER-package of ETL-module imported successfully!
All functional SPARKFILE-libraries in LOADER-package of ETL-module imported successfully!
All functional SPARKFILE-libraries in LOADER-package of ETL-module imported successfully!
All functional APP-libraries in REZAWARE-package of REZAWARE-module imported successfully!
__propAttr__ Class initialization complete
sparkFile Class initialization complete

read and write BigQuery dataset for hypothese testing class initialization and load complete!


## Load Data

#### Spark session options

In [3]:
# Reader options forwarded as keyword arguments to the file-loading call:
# CSV with a header row, comma-delimited, schema inferred, and a recursive
# lookup restricted to files matching '*.csv'.
options = dict(
    inferSchema=True,
    header=True,
    delimiter=",",
    pathGlobFilter='*.csv',
    recursiveFileLookup=True,
)

#### Full Dataset

In [4]:
# Load the full dataset into a Spark DataFrame and add an integer `date` column.
_fname = "FullDataset.csv"
_fpath = "mining/data/budget"
sdf = clsFile.read_files_to_dtype(
    as_type="SPARK",      # optional - define the data type to return
    folder_path=_fpath,   # optional - relative path, w.r.t. self.storeRoot
    file_name=_fname,     # optional - name of the file to read (complete-60-accounts.csv)
    file_type=None,       # optional - read all the files of same type
    **options,
)

# Count once and reuse the value: every count() is a separate Spark job over
# the whole file, and adding a column below cannot change the number of rows.
_n_rows = sdf.count()
print("Loaded %s %d rows" % (_fname, _n_rows))

# Convert `updated_time` to an integer Unix timestamp stored in `date`.
sdf = sdf.withColumn('date', F.unix_timestamp('updated_time'))
print("timestamp converted unix_timestamp for %d rows" % _n_rows)

25/02/26 10:56:47 WARN Utils: Your hostname, FarmRaider2 resolves to a loopback address: 127.0.1.1; using 192.168.2.85 instead (on interface enp3s0)
25/02/26 10:56:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/02/26 10:56:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

Loaded FullDataset.csv 1777681 rows




timestamp converted unix_timestamp for 1777681 rows


                                                                                

### Filter by objective
Also remove every column that contains only null values for the selected objective.

In [6]:
# Keep only the rows of one campaign objective, then drop every column that is
# null in all of the remaining rows.
obj_name = "LINK_CLICKS"
obj_sdf = sdf.filter(F.col('objective').isin(obj_name))

# Row count of the filtered frame, computed a single time. Evaluating
# obj_sdf.count() inside the comprehension below would start one Spark job
# per column, and dropping columns afterwards does not change the row count.
_obj_rows = obj_sdf.count()

# One aggregation pass: number of null values per column, as {column: count}.
null_counts = (
    obj_sdf
    .select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in obj_sdf.columns])
    .collect()[0]
    .asDict()
)

# A column is dropped when all of its values are null. With zero matching rows
# nothing is dropped, so an empty filter result keeps its schema instead of
# losing every column.
to_drop = [k for k, v in null_counts.items() if _obj_rows and v >= _obj_rows]
obj_sdf = obj_sdf.drop(*to_drop)
print("Filtered %d rows and reduced to %d of %d columns" 
      % (_obj_rows, len(obj_sdf.columns), len(sdf.columns)))

25/02/26 11:00:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Filtered 71717 rows and reduced to 51 of 131 columns


                                                                                

### Define predictors and labels

In [25]:
# Attribution-window and rate/cost metrics; passed to setup() as numeric features.
_num_cols_lst = [
    'd1_view', 'd7_view', 'd28_view',
    'd1_click', 'd7_click', 'd28_click',
    'CTR', 'CPM', 'CPC', 'CPP',
]

# Per-action columns, grouped by their name prefix; also passed to setup()
# as numeric features.
_act_cols_lst = [
    # app interactions
    'app_interactions_app_custom_event__fb_mobile_add_to_wishlist',
    'app_interactions_app_custom_event__fb_mobile_spent_credits',
    'app_interactions_app_custom_event__fb_mobile_content_view',
    # checkout
    'checkout_offsite_conversion__fb_pixel_add_to_cart',
    'checkout_app_custom_event__fb_mobile_initiated_checkout',
    'checkout_offsite_conversion__fb_pixel_initiate_checkout',
    'checkout_app_custom_event__fb_mobile_add_to_cart',
    'checkout_app_custom_event__fb_mobile_add_payment_info',
    'checkout_offsite_conversion__fb_pixel_add_payment_info',
    # custom conversions
    'custom_conversions_app_custom_event__other',
    'custom_conversions_offsite_conversion__fb_pixel_custom',
    'custom_conversions_omni_custom',
    # lead generation
    'lead_generation_onsite_conversion__lead_grouped',
    'lead_generation_offsite_conversion__fb_pixel_lead',
    'lead_generation_lead',
    # purchase
    'purchase_onsite_conversion__purchase',
    'purchase_offsite_conversion__fb_pixel_purchase',
    'purchase_app_custom_event__fb_mobile_purchase',
    'purchase_omni_purchase',
    # registration
    'registration_omni_complete_registration',
    'registration_offsite_conversion__fb_pixel_complete_registration',
    # search
    'search_offsite_conversion__fb_pixel_search',
]

# Label column: its first (only) entry is used as the regression target.
_target = ['purchase_roas']

# Delivery and spend columns; checked for nulls together with the target and
# passed to setup() as numeric features.
_target_cols_lst = [
    'impressions', 'clicks', 'reach', 'frequency', 'spend',
    'purchase_value',
]

# Columns handed to setup() as ignore_features.
# Disabled entries: 'CTR', 'CPM', 'CPC', 'CPP'
_ignore_cols_lst = [
    'date', 'timezone_name', 'business_country_code', 'business_city',
    'business_state', 'campaign_id', 'adset_id', 'ad_id', 'account_id',
]

### Remove target null values

In [30]:
# Build the pandas modelling frame from the rows that have a target value.
#
# Rows with a null target are removed in Spark before the conversion, so only
# the rows that are actually kept get collected to the driver. The pandas
# dropna below removes any row whose target is still missing after conversion,
# which makes the final frame the same set of rows as filtering in pandas alone.
data = obj_sdf.dropna(how="any", subset=_target).toPandas()
data = data.dropna(axis=0, subset=_target)
print("After Null col remove: %d of %d active columns" 
      % (len(data.columns), len(obj_sdf.columns)))
print("Cleanup reduced rows %d of %d" % (data.shape[0], obj_sdf.count()))

                                                                                

After Null col remove: 51 of 51 active columns




Cleanup reduced rows 10697 of 71717


                                                                                

### Setup pycaret

In [31]:
from pycaret.regression import *

# Every predictor group is declared numeric explicitly; the identifier and
# location columns are excluded through ignore_features.
_numeric_features = _num_cols_lst + _act_cols_lst + _target_cols_lst
_ignored_features = list(_ignore_cols_lst)

# Initialise the PyCaret regression experiment with a fixed session id.
s = setup(
    data,
    target=_target[0],
    numeric_features=_numeric_features,
    ignore_features=_ignored_features,
    session_id=1969,
)

Unnamed: 0,Description,Value
0,Session id,1969
1,Target,purchase_roas
2,Target type,Regression
3,Original data shape,"(10697, 51)"
4,Transformed data shape,"(10697, 64)"
5,Transformed train set shape,"(7487, 64)"
6,Transformed test set shape,"(3210, 64)"
7,Ignore features,9
8,Numeric features,38
9,Categorical features,3


### Use the best model found (Extra Trees Regressor)

In [35]:
# Model selection, evaluation, prediction and persistence.
# The section heading names the Extra Trees Regressor as the model found here.
best = compare_models()
# Alternative: create that model directly instead of comparing all of them.
# best = create_model('et')

evaluate_model(best)
# Without a `data` argument predict_model() is called on the experiment's own data.
pre_holdout = predict_model(best)

# Predict on the feature columns only. `_target` is already a list of column
# names, so it is passed as-is; wrapping it in another list makes pandas look
# for a single tuple-like label and raise KeyError. drop() returns a new frame,
# so `data` itself is left untouched.
new_df = data.drop(columns=_target)
predictions = predict_model(best, data=new_df)
save_model(best, 'best_pipeline')


ValueError: Estimator xgboost not available. Please see docstring for list of available estimators.

## Load Model

In [20]:
# NOTE(review): the import is repeated here, presumably so this section can be
# run without executing the training cells first - confirm.
from pycaret.regression import *

# Load the pipeline that was saved under the name 'best_pipeline'.
best = load_model('best_pipeline')

Transformation Pipeline and Model Successfully Loaded


In [25]:
# Take one sample row (positional index 1) without the ignored columns.
# The names in _ignore_cols_lst are column labels, so they must be dropped via
# `columns=`; a bare drop(labels) targets the row index and raises KeyError.
new_df = data.drop(columns=_ignore_cols_lst).iloc[[1]]

# NOTE(review): 'spend' is split off as the label here, while the model above
# was set up with 'purchase_roas' as its target - confirm which one is intended.
y_test_df = new_df['spend']
X_test_df = new_df.drop(columns='spend')
y_test_df, X_test_df
# predictions = predict_model(best, data=X_test_df)


In [22]:
# Display the prediction frame. `predictions` is assigned by the
# predict_model(best, data=new_df) call in the model-training cell, so that
# cell must have run in the current kernel before this one.
predictions

Unnamed: 0,account_id,ad_id,adset_id,campaign_id,updated_time,impressions,frequency,reach,CTR,CPP,...,lead_generation_lead,purchase_onsite_conversion__purchase,purchase_offsite_conversion__fb_pixel_purchase,purchase_app_custom_event__fb_mobile_purchase,purchase_omni_purchase,registration_omni_complete_registration,registration_offsite_conversion__fb_pixel_complete_registration,search_offsite_conversion__fb_pixel_search,date,prediction_label
1,10202284950244559,23854099209720543,23854083590040543,23854083127830543,2025-01-31,42163,1.213429,34747,5.602068,1.442462,...,,,154.899994,,154.899994,,,,1738252800,50.121243
