In [30]:

import io
import json
import sys
from functools import partial

# Extend the module search path first so every import below (third-party or
# project-local) is resolved with "../modules" available, as before.
sys.path.append("../modules")

import hyperopt
import pandas as pd
import plotly.express as px
import pyspark.ml
import pyspark.sql.functions as f
import pyspark.sql.types as t
import requests
import toniq
from hyperopt import fmin, hp, tpe
from pyspark.ml.tuning import CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics

from data_manager import DataManager


In [31]:
 def calculate_metrics(df):
     """
     Compute per-label classification metrics for label 1.0.

     :params:

     df: dataframe containing a `prediction` column and a `label` column

     :returns:

     dict with the keys tpr, fpr, precision, recall and fMeasure, each value
     rounded to 3 decimal places
     """
     # MulticlassMetrics is fed an RDD of (prediction, label) tuples; the label
     # column is cast to float before the rows are converted.
     label_as_float = f.col('label').cast(t.FloatType())
     pairs_rdd = df.select('prediction', label_as_float).rdd.map(tuple)
     metrics = MulticlassMetrics(pairs_rdd)

     # (result key, bound metric method) in the order they are evaluated.
     scorers = (
         ("tpr", metrics.truePositiveRate),
         ("fpr", metrics.falsePositiveRate),
         ("precision", metrics.precision),
         ("recall", metrics.recall),
         ("fMeasure", metrics.fMeasure),
     )

     return {key: round(score(label=1.0), 3) for key, score in scorers}

# Setup Config

In [65]:
# Notebook-wide settings, grouped by the component that consumes them.
config = dict(
    verbose=True,
    gt_column="income",
    # Hyper-parameter search: `metric` names the key of the metrics dict that is
    # optimised, `max_evals` is the number of trials handed to fmin.
    hyperopt=dict(
        metric="fMeasure",
        max_evals=10,
    ),
    # Experiment tracking: name of the MLflow experiment and its tags.
    mlflow=dict(
        experiment_name="DEMO",
        tags={"version": "0.1.0"},
    ),
)

## Initialize DataManager

In [4]:
# Project-local data access helper (imported from ../modules), created with
# the "gcp" provider; it is used to load the feature tables.
dm = DataManager(provider="gcp")

s3_endpoint is 10.2.3.167:9000


## Load Features from DataManager

In [5]:

# Feature tables keyed by split name ("train" / "test").
dfs = {}

# Each split is read from the "feature" store as income_transformed_data_<split>,
# using the split name as the partition as well.
# NOTE: `mode` stays bound to "test" after the loop, because loop variables
# leak into the notebook namespace.
for mode in ["train", "test"]:
    dfs[mode] = dm.load_table(name=f"income_transformed_data_{mode}", store="feature", partition=mode)

In [6]:
# Show the schema of the test split. The key is spelled out instead of reusing
# a loop variable leaked from another cell, so this cell does not depend on
# hidden kernel state.
dfs["test"]

DataFrame[features: vector, label: double]

In [67]:

# Shared tracking client; train_and_eval uses it to look up/create the
# experiment, create runs, and log metrics and params.
mlflow_client = toniq.MlflowClient()
      

def train_and_eval(model_config, args):
    """Fit one candidate model, log it to MLflow and return the loss to minimise.

    :param model_config: dict with the keys ``type`` (name of a ``pyspark.ml``
        submodule), ``name`` (estimator class inside that submodule) and
        ``params`` (keyword arguments for the estimator constructor).
    :param args: accepted for call compatibility; not used by this function.
    :returns: the negated test-split value of the metric named by
        ``config["hyperopt"]["metric"]`` (negated because the caller minimises).

    Side effects: stores ``model_config`` under ``config["model"]``, creates the
    MLflow experiment when it does not exist yet, and creates one run to which
    the train/test metrics and the model params are logged.
    """
    # Record the candidate currently being evaluated in the shared config.
    config["model"] = model_config

    # Resolve e.g. pyspark.ml.<type>.<name> and instantiate it with the sampled params.
    estimator_module = getattr(pyspark.ml, model_config["type"])
    estimator_cls = getattr(estimator_module, model_config["name"])
    estimator = estimator_cls(**model_config["params"])

    # Reuse the experiment when it already exists (lookup returns None otherwise).
    experiment_name = config["mlflow"]["experiment_name"]
    experiment = mlflow_client.get_experiment_by_name(experiment_name)
    if experiment:
        experiment_id = experiment.experiment_id
    else:
        # The value returned by create_experiment is used as the id directly,
        # so no second lookup by name is needed.
        experiment_id = mlflow_client.create_experiment(experiment_name)

    # One run per evaluated candidate.
    # NOTE(review): the run is never explicitly terminated here and
    # config["mlflow"]["tags"] is never attached to it — confirm whether the
    # client wrapper takes care of either.
    run_id = mlflow_client.create_run(experiment_id).info.run_id

    # Fit on the training split, then score every available split.
    fitted_model = estimator.fit(dfs["train"])
    pred_dfs = {mode: fitted_model.transform(df) for mode, df in dfs.items()}
    metric_results = {mode: calculate_metrics(pred_df) for mode, pred_df in pred_dfs.items()}

    # Metrics are logged as "<split>-<metric>", e.g. "test-fMeasure".
    for mode, mode_metrics in metric_results.items():
        for metric_key, metric_val in mode_metrics.items():
            mlflow_client.log_metric(run_id, f"{mode}-{metric_key}", metric_val)

    for param_name, param_val in model_config["params"].items():
        mlflow_client.log_param(run_id, param_name, param_val)

    # Report the metric to tune; negated so that a higher score is a lower loss.
    return -metric_results["test"][config["hyperopt"]["metric"]]



    


    

In [68]:
# define an objective function
def objective(args):
    """Toy objective over a ``(case, value)`` pair.

    Returns the value unchanged for ``'case 1'`` and its square for any other
    case label.

    NOTE(review): looks like a leftover from the hyperopt tutorial — confirm
    whether anything still calls it.
    """
    label, value = args
    return value if label == 'case 1' else value ** 2

# define a search space


# NOTE(review): this dict literal is a bare expression statement — it is not
# assigned to a name, so evaluating it has no effect. It looks like an example
# of a single fixed model configuration (type / name / params); confirm and
# either assign it to a name or delete it.
{
    'type': 'classification',
    'name': 'RandomForestClassifier',    
    'params': dict(maxDepth=10, maxBins=49, minInstancesPerNode=2, numTrees= 10)
    }

# Choosing the model: on every trial hp.choice picks one of the candidate
# configurations below, each with its own hyper-parameter ranges.
# Keep this note a comment — a string literal placed directly in front of the
# list is parsed as `"..."[...]` (indexing the string) and raises a TypeError
# before the search space is built.
space = hp.choice('model',
    [
        {
            'type': 'classification',
            'name': 'RandomForestClassifier',
            # quniform(label, low, high, q) samples multiples of q; the sampled
            # values are floats (e.g. 58.0).
            'params': {
                       "maxDepth": hp.quniform("maxDepth", 5, 10, 1),
                       "maxBins": hp.quniform("maxBins", 45, 60, 1),
                       "minInstancesPerNode": hp.quniform("minInstancesPerNode", 40, 60, 1),
                       "numTrees": hp.quniform("numTrees", 40, 60, 1)
                      }
        },

        {
            'type': 'classification',
            'name': 'GBTClassifier',
            # Labels carry a _GBT suffix so they stay distinct from the
            # labels of the RandomForestClassifier entry above.
            'params': {
                       "maxDepth": hp.quniform("maxDepth_GBT", 5, 10, 1),
                       "maxBins": hp.quniform("maxBins_GBT", 45, 60, 1),
                       "minInstancesPerNode": hp.quniform("minInstancesPerNode_GBT", 40, 60, 1),
                      }
        }
    ])

# minimize the objective over the space
# `fmin` and `tpe` come from the import cell at the top of the notebook.
# train_and_eval receives the sampled model configuration as its first
# argument; `args` is bound only to satisfy its signature.
best = fmin(
    partial(train_and_eval, args=config),
    space,
    algo=tpe.suggest,
    max_evals=config["hyperopt"]["max_evals"],
)

# Raw result: one entry per hyperopt label; for the 'model' choice the value
# is the index of the selected candidate.
print(best)
# space_eval maps those raw values back onto the search space, giving the
# concrete model configuration (type / name / params).
print("Best Hypr", hyperopt.space_eval(space, best))

100%|██████████| 10/10 [02:29<00:00, 14.91s/trial, best loss: -0.685]
{'maxBins_2': 58.0, 'maxDepth_2': 5.0, 'minInstancesPerNode_2': 56.0, 'model': 1}
{'name': 'GBTClassifier', 'params': {'maxBins': 58.0, 'maxDepth': 5.0, 'minInstancesPerNode': 56.0}, 'type': 'classification'}
