# Read and clean market capitalization (MCap) data with PySpark

In [1]:
'''
    WARNING CONTROL to display or ignore all warnings
'''
import traceback
import warnings

# Suppress every warning for this session; switch between 'default' and
# 'ignore' to show or hide them.
warnings.simplefilter('ignore')

''' Set debug flag to view extended error messages; else set it to False to turn off debugging mode '''
debug = True

In [30]:
import os
import sys
from datetime import datetime, date, timedelta

sys.path.insert(1,"/home/nuwan/workspace/rezaware/")
import rezaware as reza
from utils.modules.etl.load import sparkDBwls as sdb
from utils.modules.etl.transform import sparkCleanNRich as scne
from mining.modules.assets.etp import logReturns as log

''' restart initiate classes '''
# In debug mode, re-import the project modules so that edits made to their
# source are picked up without restarting the notebook kernel.
if debug:
    import importlib
    reza = importlib.reload(reza)
    log = importlib.reload(log)
    sdb = importlib.reload(sdb)
    scne = importlib.reload(scne)
    
# Description string handed to each class constructor below.
__desc__ = "analyze crypto market capitalization time series data"
# clsSDB = sdb.SQLWorkLoads(desc=__desc__)
# clsSCNR is used further down for unpivot_table() and count_column_nulls().
clsSCNR = scne.Transformer(desc=__desc__)
# clsROR is used further down for read_n_clean_mcap().
clsROR = log.RatioOfReturns(desc=__desc__)
''' optional - if not specified class will use the default values '''
# prop_kwargs = {"WRITE_TO_TMP":True,   # necessary to emulate the etl dag
#               }
print("\nClass initialization and load complete!")

All functional APP-libraries in REZAWARE-package of REZAWARE-module imported successfully!
All functional LOGRETURNS-libraries in ETP-package of ASSETS-module imported successfully!
All functional SPARKDBWLS-libraries in LOAD-package of ETL-module imported successfully!
All functional SPARKCLEANNRICH-libraries in TRANSFORM-package of ETL-module imported successfully!
logReturns Class initialization complete

Class initialization and load complete!


## Read data from mcap_past
The query selects rows from `warehouse.mcap_past` within the chosen date range whose `mcap_value` exceeds 1.0 million. Any missing values are imputed with the mean value.

In [3]:
# Date window for the query (SQL BETWEEN is inclusive on both ends) and the
# market-cap threshold a row must exceed to be selected.
_from_date = '2022-01-01'
_to_date = '2022-01-31'
_min_mcap = 1_000_000

# _query = "select * from warehouse.mcap_past "+\
#         f"where mcap_date >= '{_from_date}' and "+\
#         f"mcap_date <= '{_to_date}'"
# Adjacent string literals are concatenated by the parser, so no "+\" line
# continuations are needed; the resulting SQL text is unchanged.
_query = (
    "select * from warehouse.mcap_past "
    f"where mcap_date between '{_from_date}' and '{_to_date}' "
    f"and mcap_value > {_min_mcap}"
)

# Options forwarded as keyword arguments to read_n_clean_mcap() here and to
# unpivot_table() in a later cell; their exact meaning is defined by those
# methods (not visible in this notebook).
_kwargs = {
    "TABLENAME": 'warehouse.mcap_past',
    "COLUMN": 'mcap_date',
    "FROMDATETIME": _from_date,
    "TODATETIME": _to_date,
    "PARTITIONS": 2,
    "AGGREGATE": 'avg',
    "PIVCOLUMNS": ['cofix', 'paypolitan-token', 'raven-protocol',
                   'nft-index', 'beldex', 'mt-pelerin-shares'],
}

# print(clsSpark.dbSchema)
mcap_sdf = clsROR.read_n_clean_mcap(query=_query, **_kwargs)
# mcap_sdf = clsROR.read_n_clean_mcap(**_kwargs)

# count() triggers a Spark job; len(columns) only inspects the schema.
print(f"Loaded {mcap_sdf.count()} rows and {len(mcap_sdf.columns)} columns")

Wait a moment, retrieving data ...
23/01/12 12:08:19 WARN Utils: Your hostname, FarmRaiderTester resolves to a loopback address: 127.0.1.1; using 192.168.124.15 instead (on interface enp2s0)
23/01/12 12:08:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/01/12 12:08:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/12 12:08:21 WARN FileSystem: Cannot load filesystem: java.util.ServiceConfigurationError: org.apache.hadoop.fs.FileSystem: com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem Unable to get public no-arg constructor
23/01/12 12:08:21 WARN FileSystem: java.lang.NoClassDefFoundError: com/google/api/client/auth/oauth2/Credential
23/01/12 12:08:21 WARN FileSystem: java.lang.ClassNotFoundException: com.google.api.client.auth.oauth2.Credential


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

23/01/12 12:10:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 17:>                                                         (0 + 1) / 1]

23/01/12 12:10:53 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB


[Stage 24:>                                                         (0 + 0) / 1]

23/01/12 12:11:23 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB


[Stage 29:>                                                         (0 + 1) / 1]

Loaded 31 rows and 1742 columns


                                                                                

## Unpivot cleaned mcap data

In [31]:
# Every column of the cleaned table except the date index is to be unpivoted.
# .remove() raises ValueError if 'mcap_date' is missing, which surfaces a
# malformed input early.
_col_subset = mcap_sdf.columns
_col_subset.remove('mcap_date')

# Unpivot the wide table (one column per asset) around the date index; the
# two value_columns presumably name the resulting key/value columns
# (TODO confirm against Transformer.unpivot_table).
_unpivot_sdf = clsSCNR.unpivot_table(
    table=mcap_sdf,
    unpivot_columns=_col_subset,
    index_column='mcap_date',
    value_columns=['asset_name', 'mcap_value'],
    where_cols='mcap_value',
    **_kwargs,
)

In [39]:
# clsSCNR = scne.Transformer(desc=__desc__)
# Run the null count over the unpivoted table. `_col_subset` is not rebuilt
# here: the column_subset argument below is commented out, and the name is
# already defined by the cell that created `_unpivot_sdf`.
_nan_counts_sdf = clsSCNR.count_column_nulls(
    data=_unpivot_sdf,
#     column_subset=_col_subset
)

# NOTE(review): the recorded output of show() for this cell is a frame with
# no visible columns; verify that count_column_nulls() returns the expected
# schema when no column_subset is given.
# print(_nan_counts_sdf.count(),len(_nan_counts_sdf.columns))
_nan_counts_sdf.show()

[Stage 55:>                                                         (0 + 1) / 1]

23/01/12 18:23:17 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB


[Stage 65:>                                                         (0 + 1) / 1]

23/01/12 18:24:50 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB


[Stage 68:>                                                         (0 + 1) / 1]

++
||
++
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
||
++
only showing top 20 rows



                                                                                

In [40]:
# Number of rows in the null-count frame; count() launches a Spark job.
print(_nan_counts_sdf.count())

[Stage 71:>                                                         (0 + 1) / 1]

23/01/12 18:27:03 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB


[Stage 74:>                                                         (0 + 1) / 1]

53971


                                                                                