# Rolling Statistics Example

In [1]:
'''
    WARNING CONTROL: choose whether warnings are displayed or ignored.
    Switch the filter action between 'default' (display) and 'ignore' (silence).
'''
import warnings

warnings.simplefilter('default')

''' Debug flag: True shows extended error messages; False turns debugging mode off '''
debug = True


## Instantiate packages

In [6]:
import os
import sys

# KMP_DUPLICATE_LIB_OK is an OpenMP runtime switch; presumably set here to avoid
# an abort when more than one OpenMP runtime is loaded -- TODO confirm it is
# still required in your environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Root folder of the rezaware project, added to sys.path so the `utils`
# package can be imported. Set the REZAWARE_HOME environment variable to point
# at your own checkout; when it is not set, the original path is used.
_rezaware_home = os.environ.get("REZAWARE_HOME", "/home/nuwan/workspace/rezaware/")
sys.path.insert(1, _rezaware_home)
from utils.modules.ml.timeseries import rollingstats as rs

# In debug mode, reload the module so that edits made to rollingstats since
# the last import are picked up without restarting the kernel.
if debug:
    import importlib
    rs = importlib.reload(rs)

__desc__ = "process rolling stats for sample dataset"

# Optional properties - if not specified the class will use its default values.
# NOTE(review): prop_kwargs is defined but never passed to RollingStats below;
# confirm whether the constructor accepts these key/value pairs before wiring
# them in.
prop_kwargs = {"WRITE_TO_TMP_FILE":False,   # necessary to emulate the etl dag
              }
clsRS = rs.RollingStats(desc=__desc__)
print("\nClass initialization and load complete!")

All packages in utils ml timeseries RollingStats imported successfully!
execSession Class initialization complete

Class initialization and load complete!


  logger.handlers.clear()
  logger.handlers.clear()


## Load sample data
1. The data is stored in ```utils/data/ml/timeseries/sampledata/rollingstats.csv``` file
1. Import the data into a pandas ```dataframe```; the ```rollingstats``` class will convert it to a ```pyspark.sql.DataFrame```
1. Print the first 3 rows to visualize the data

__Note__: remember to replace the value of ```_rezaware_home``` with the absolute path to your _rezaware_ folder.

In [3]:
import pandas as pd

# Root folder of the rezaware project. Set the REZAWARE_HOME environment
# variable to point at your own checkout; when it is not set, the original
# path is used.
_rezaware_home = os.environ.get("REZAWARE_HOME", '/home/nuwan/workspace/rezaware/')

# Full path to the sample CSV shipped under the project's utils/data folder.
data_file_path = os.path.join(_rezaware_home,
                              'utils/data/ml/timeseries/sampledata',
                              'rollingstats.csv'
                             )

# Load the sample data into a pandas DataFrame.
# NOTE(review): the preview shows an `Unnamed: 0` column, which suggests the
# CSV was saved with its index; consider `index_col=0` if that column is not
# needed downstream -- confirm before changing.
data_df = pd.read_csv(data_file_path)

# Preview the first 3 rows.
data_df.head(3)

Unnamed: 0,mcap_past_pk,uuid,data_source,asset_name,asset_symbol,alt_asset_id,currency,price_date,price_value,price_log_ror,...,volume_date,volume_size,volume_change,created_dt,created_by,created_proc,modified_dt,modified_by,modified_proc,deactivate_dt
0,46400,64538e084a6e0a5e3a6f7acb,coingecko,gamezone,gzone,gamezone,usd,2023-03-02 00:00:00,0.038417,-0.06832,...,2023-03-02 00:00:00,113719.7,,2023-05-05 07:37:38.326173,farmraider,wrangler_assets_etl_CryptoMarket function <nos...,2023-05-17 11:10:34.581678,farmraider,utils_etl_loader_sparkDBwls_sparkDBwls functio...,
1,41977,6453d10d4a6e0a5e3a805a02,coingecko,the_virtua_kolect,tvk,the-virtua-kolect,usd,2023-03-02 00:00:00,0.050673,-0.064649,...,2023-03-02 00:00:00,26092770.0,,2023-05-05 07:35:47.200183,farmraider,wrangler_assets_etl_CryptoMarket function <nos...,2023-05-17 11:10:34.581678,farmraider,utils_etl_loader_sparkDBwls_sparkDBwls functio...,
2,40238,6453638f4a6e0a5e3a646bb1,coingecko,alchemy_pay,ach,alchemy-pay,usd,2023-03-02 00:00:00,0.036582,-0.063371,...,2023-03-02 00:00:00,88788820.0,,2023-05-05 07:35:05.269605,farmraider,wrangler_assets_etl_CryptoMarket function <nos...,2023-05-17 11:10:34.581678,farmraider,utils_etl_loader_sparkDBwls_sparkDBwls functio...,


## Execute rollingstats functions
1. Set the input parameters for the ```simple_moving_stats``` function
   * ```num_col```, numeric column name to apply the rolling computation on
   * ```date_col```, datetime column name to use as the time stamp
   * ```part_col```, partition column name to apply rolling stats to windows
   * ```win_len```, window length, given as a number of ```win_unit``` units (e.g. ```7```)
   * ```win_unit```, unit of the window length: days, hours or minutes (e.g. ```DAY```)
   * ```stat_op```, stat operation sum, mean or standard deviation
   * ```data```, data set; that can be converted to pyspark DataFrame
   * ```**kwargs```, key/value pairs to set other parameters
1. Execute the function; it returns a pyspark DataFrame, assigned to ```_roll_stat_df```, with a new column containing the rolling statistics
1. Print the first 10 rows of the returned dataset, sorted by asset name and date

__Note__: the logs are available in the file: ```utils/logs/ml/timeseries/app.log```

In [None]:
from pyspark.sql import functions as F

# Prefix selecting which family of columns the rolling statistic is applied to.
__num_col_prefix__='price' # change between 'price' & 'mcap'

# Input parameters for simple_moving_stats; the numeric and date column names
# are both derived from the prefix above.
__num_col__ = "_".join([__num_col_prefix__,'log_ror'])
__date_col__= "_".join([__num_col_prefix__,'date'])
__part_col__= "asset_name"
__win_len__ = 7
__win_unit__= "DAY"
__stat_op__ = "mean"

# Name of the result column. It is built from the same prefix so the label
# always matches the column the statistic was computed on (e.g.
# 'roll_price_mean' or 'roll_mcap_mean').
kwargs={
    "RESULTCOL":"_".join(['roll',__num_col_prefix__,__stat_op__])
}

_roll_stat_df = clsRS.simple_moving_stats(
    num_col=__num_col__,
    date_col=__date_col__,
    part_col=__part_col__,
    win_len=__win_len__,
    win_unit=__win_unit__,
    stat_op=__stat_op__,
    data=data_df,
    **kwargs,
)

# Show the partition, date, input and rolling-stat columns for the first 10
# rows, ordered by partition and date. The columns are referenced through the
# parameters above so the preview stays correct when the prefix is changed.
(
    _roll_stat_df
    .select(
        F.col(__part_col__),
        F.col(__date_col__),
        F.col(__num_col__),
        F.col(kwargs['RESULTCOL']),
    )
    .sort(F.col(__part_col__), F.col(__date_col__))
    .show(n=10)
)