In [1]:

import pandas as pd
import io
import requests
import sys
import json
import plotly.express as px

sys.path.append("../modules")
from data_manager import DataManager
#from feature_manager import FeatureManger

import toniq

# NOTE(review): `tsu` is not referenced again in this notebook — presumably created for
# its side effects (Spark setup); confirm, or drop the assignment.
tsu = toniq.SparkUtils()

## Setup Config

In [2]:
# Notebook-wide settings. The per-split table specs under "data" are generated
# for the "train" and "test" splits.
config = dict(
    provider="gcp",
    verbose=False,
    gt_column="income",
    data=dict(
        # Tables read by the loading cell, one spec per split.
        load={
            mode: dict(name=f"income_raw_data_{mode}", store="data", partition=None)
            for mode in ("train", "test")
        },
        # Tables written by the saving cell; `partition` is set to the split name.
        save={
            mode: dict(name=f"income_transformed_data_{mode}", store="feature", partition=mode)
            for mode in ("train", "test")
        },
    ),
)


## Initialize DataManager

In [3]:
# Create the data manager for the provider named in `config`; the cells below use it
# to load and write tables.
dm = DataManager(provider=config["provider"])

s3_endpoint is 10.2.3.167:9000


## Load Raw Data from the Toniq Data Store

In [4]:

# Load one table per entry in config["data"]["load"], keyed by its split name ("train"/"test").
dfs = {}
for mode, load_data_args in config["data"]["load"].items():
    # Each entry's name/store/partition values are forwarded as keyword arguments.
    dfs[mode] = dm.load_table(**load_data_args)

In [5]:
# Preview a random ~20% sample of one split as a pandas frame.
# The split is named explicitly instead of reusing the loop variable `mode` left over
# from the loading cell (which is "test" after a sequential run), so this cell does not
# depend on hidden kernel state or on which cell last rebound `mode`.
PREVIEW_SPLIT = "test"
dfs[PREVIEW_SPLIT].sample(fraction=0.2).toPandas()

Unnamed: 0,index,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,13,58,?,299831,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,35,United-States,lt 50k
1,21,34,Private,238588,Some-college,10,Never-married,Other-service,Own-child,Black,Female,0,0,35,United-States,lt 50k
2,24,25,Private,205947,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,lt 50k
3,31,56,Self-emp-not-inc,186651,11th,7,Widowed,Other-service,Unmarried,White,Female,0,0,50,United-States,lt 50k
4,34,26,Private,43311,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,lt 50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3266,8111,41,?,45186,Some-college,10,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,lt 50k
3267,8114,17,Private,194946,11th,7,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,lt 50k
3268,8127,18,Private,137363,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,20,United-States,lt 50k
3269,8134,30,Private,143078,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,lt 50k


### Find a Map between the Columns and the Column Dtypes and Vice Versa

- We want to find a map between each column and its data type such that
    - strings can be converted to categorical variables
    - bigints/floats can be converted into continuous variables given high enough cardinality (i.e., more unique values)

In [6]:
# Schema inspection is done on the training split.
df = dfs["train"]

# `df.dtypes` yields (column, dtype) pairs; keep them as a forward lookup.
col2dtype = dict(df.dtypes)

# Reverse lookup: dtype -> list of columns, in the order they appear in `col2dtype`.
dtype2col = {}
for col, dtype in col2dtype.items():
    dtype2col.setdefault(dtype, []).append(col)

# Show both maps as indented JSON.
print(
    "Column to Type Map \n\n\n",
    json.dumps(col2dtype, indent=4),
    "\n\n\n Types to Column(s) Map \n\n",
    json.dumps(dtype2col, indent=4),
)

Column to Type Map 


 {
    "index": "bigint",
    "age": "bigint",
    "workclass": "string",
    "fnlwgt": "bigint",
    "education": "string",
    "education_num": "bigint",
    "marital_status": "string",
    "occupation": "string",
    "relationship": "string",
    "race": "string",
    "sex": "string",
    "capital_gain": "bigint",
    "capital_loss": "bigint",
    "hours_per_week": "bigint",
    "native_country": "string",
    "income": "string"
} 


 Types to Column(s) Map 

 {
    "bigint": [
        "index",
        "age",
        "fnlwgt",
        "education_num",
        "capital_gain",
        "capital_loss",
        "hours_per_week"
    ],
    "string": [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
        "income"
    ]
}


## Understanding the Data

- Visualizing the Distribution of Each Column from the Training Set Only
- This lets us see which transformations the data needs

**The Goal**
- Transform Continuous Features into Standard Normal Distributions
- Transform Categorical Features into One-Hot Encodings (Or Categorical if you want to get fancy)

### Feature Histograms

In [7]:
# Optional visual check, enabled by config["verbose"]: one histogram per column.
if config["verbose"]:
    # Sample and collect once, outside the loop, so the Spark job runs a single time
    # and every histogram is drawn from the same rows.
    hist_sample = df.sample(fraction=0.1).toPandas()
    for column in df.columns:
        fig = px.histogram(hist_sample, x=column, title=column)
        fig.show()


## Building the Transformation Pipeline

- Categorical Variables:
    - StringIndexer and OneHotEncoder
- Continuous Variables:
    - Standardize with StandardScaler
    
**Assumption/Limitation/Warning**
   - For Simplicity, all bigint/float columns are treated as continuous
   - All strings are categorical


In [8]:
from pyspark.ml.feature import OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Design note: the user supplies N blocks of a transformation; the state of this
# could be saved in S3.

# Column roles.
# The ground-truth column is read from the notebook config instead of a second
# hard-coded copy, so the two cannot drift apart.
gt_column = config["gt_column"]
# Row identifier: passed through to the output by the final select(), not a feature.
id_column = "index"

pipeline_list = []
categorical_input_list = []

# --- Categorical variables: string-index, then one-hot encode ---------------------
for col in dtype2col["string"]:
    if col == gt_column:
        # The ground truth only needs its class index, emitted as "label".
        pipeline_list.append(StringIndexer(inputCol=col, outputCol="label", handleInvalid="keep"))
        continue

    # NOTE(review): these feature indexers use StringIndexer's default handleInvalid
    # ("error"), unlike the label indexer above; a category seen only in the test split
    # would fail at transform time — confirm this is intended.
    pipeline_list.append(StringIndexer(inputCol=col, outputCol=f"{col}_index_tmp"))
    pipeline_list.append(OneHotEncoder(inputCol=f"{col}_index_tmp", outputCol=f"{col}_ohe_tmp"))
    categorical_input_list.append(f"{col}_ohe_tmp")

pipeline_list.append(VectorAssembler(inputCols=categorical_input_list, outputCol="cat_features_tmp"))

# --- Continuous variables: assemble, then standardize ------------------------------
# For simplicity every bigint column is treated as continuous, except the row
# identifier: it is an id that is selected alongside the features below, and feeding
# it into the feature vector would add a meaningless (row-order) signal.
continuous_input_list = [name for name in dtype2col["bigint"] if name != id_column]
pipeline_list.append(VectorAssembler(inputCols=continuous_input_list, outputCol="cont_features_tmp"))
# NOTE(review): StandardScaler defaults to withMean=False, so values are scaled to unit
# standard deviation but not centered — confirm whether centering is wanted.
pipeline_list.append(StandardScaler(inputCol="cont_features_tmp", outputCol="cont_features_standardized_tmp"))

# --- Combine categorical and continuous parts into one feature vector -------------
pipeline_list.append(
    VectorAssembler(
        inputCols=["cat_features_tmp", "cont_features_standardized_tmp"],
        outputCol="features",
    )
)

# Fit on TRAIN only so no statistics from the test split enter the transformation.
# The unfitted pipeline gets its own name; `transform_pipeline` is the fitted model.
preprocessing_pipeline = Pipeline(stages=pipeline_list)
transform_pipeline = preprocessing_pipeline.fit(dfs["train"])

# Apply the fitted model to every split, keeping only id, features and label.
transformed_dfs = {
    split: transform_pipeline.transform(split_df).select(id_column, "features", "label")
    for split, split_df in dfs.items()
}



## Save Processed Transformations to Feature Store

In [9]:


# Write each transformed split using its matching spec from config["data"]["save"].
save_specs = config["data"]["save"]
for mode in save_specs:
    dm.write_table(df=transformed_dfs[mode], **save_specs[mode])



Saved Toniq Table (income_transformed_data_train)
Saved Toniq Table (income_transformed_data_test)


## Stop Spark

In [10]:
# Final step: stop Spark through the data manager's store, after all tables are written.
dm.store.stop_spark()

s3_endpoint is 10.2.3.167:9000
