In [39]:
import warnings
warnings.simplefilter("ignore")

import os
import pandas as pd
import io
import requests
import sys
import json
import plotly.express as px
sys.path.append("../../modules")
from data_manager import DataManager


import pyspark.ml
import pyspark.sql.functions as f
import pyspark.sql.types as t

from pyspark.ml.tuning import CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics

import toniq
import hyperopt
from hyperopt import hp

from functools import partial

import tempfile
import shutil
from inspect import signature
from glob import glob

## Setup Config

In [40]:
# Notebook settings: the experiment to inspect and the metric used to rank its runs.
config = dict(
    experiment_name="EXP2",  # name of the experiment
    sort_runs_metric="test-fMeasure",  # sort runs by the metric name
)

## Initialize DataManager

In [41]:
# Create the project's DataManager (imported from ../../modules) for the "gcp" provider.
# NOTE(review): `dm` is not referenced by the cells visible here — confirm it is needed downstream.
dm = DataManager(provider="gcp")

s3_endpoint is 10.2.3.167:9000
s3_endpoint is 10.2.3.167:9000


## Initialize MLflow

In [42]:
# Client used by the cells below to list experiments and search their runs.
# NOTE(review): `toniq.MlflowClient` is assumed to mirror MLflow's `MlflowClient` API — confirm.
mlflow_client = toniq.MlflowClient()

## Get the Experiment Object from config['experiment_name']

In [50]:
# Fetch every experiment known to the client and index them by name,
# then display the mapping so the available experiment names can be checked.
experiment_list = mlflow_client.list_experiments()
name2experiment = {
    experiment_entry.name: experiment_entry for experiment_entry in experiment_list
}
name2experiment

{'EXP1': <Experiment: artifact_location='file:///home/toniq/work/datascience_template/src/experiments/mlruns/1', experiment_id='1', lifecycle_stage='active', name='EXP1', tags={}>,
 'EXP2': <Experiment: artifact_location='file:///home/toniq/work/datascience_template/src/experiments/mlruns/2', experiment_id='2', lifecycle_stage='active', name='EXP2', tags={}>,
 'Default': <Experiment: artifact_location='file:///home/toniq/work/datascience_template/src/experiments/mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>}

In [14]:
# Look up the experiment named in config["experiment_name"].
experiment = mlflow_client.get_experiment_by_name(name=config["experiment_name"])

# MLflow's `get_experiment_by_name` returns None for an unknown name; fail here with a
# clear message instead of an AttributeError on `experiment.experiment_id` further down.
if experiment is None:
    raise ValueError(
        f"No MLflow experiment named {config['experiment_name']!r} was found"
    )


## Get the Runs from the Chosen Experiment

In [15]:
# Fetch the runs recorded for the chosen experiment.
# MLflow documents `experiment_ids` as a list of IDs, so a list is passed explicitly.
run_list = mlflow_client.search_runs(experiment_ids=[experiment.experiment_id])

# Keep only the runs that actually logged the ranking metric. Filtering on
# "has any metric" is not enough: a run with other metrics but without this one
# would raise a KeyError in the sort below.
sort_metric = config["sort_runs_metric"]
run_list = [r for r in run_list if sort_metric in (r.data.metrics or {})]

# Without at least one candidate there is no best run to pick; say so explicitly
# rather than failing with an IndexError on `run_list[0]`.
if not run_list:
    raise ValueError(
        f"No run of experiment {config['experiment_name']!r} logged the metric {sort_metric!r}"
    )

# Sort runs by the specified metric (config["sort_runs_metric"]), highest value first
run_list = sorted(run_list, key=lambda r: r.data.metrics[sort_metric], reverse=True)

# get the best run from the run_list
best_run = run_list[0]

In [16]:
# Show the artifact folders of the selected run.
# The directory is derived from `best_run` rather than a hardcoded absolute path, so the
# listing always matches the run chosen above and works on any machine or checkout.
sorted(os.listdir(best_run.info.artifact_uri.replace("file://", "")))

[0m[01;34mconfig[0m/  [01;34mmodel[0m/  [01;34mpredictions[0m/


## Retrieve the Artifacts from the Best Run

In [38]:

# Collect every entry found one level below each artifact folder of the best run
artifact_paths = glob(f'{best_run.info.artifact_uri.replace("file://", "")}/*/*')

# Map each artifact folder name (the parent directory of the entry) to the entry's path
artifact2path = dict((found.split('/')[-2], found) for found in artifact_paths)


{'model': '/home/toniq/work/datascience_template/src/experiments/mlruns/2/248a944993a84cd590d57f4cbd4f61e9/artifacts/model/tmph0p8rdy6', 'config': '/home/toniq/work/datascience_template/src/experiments/mlruns/2/248a944993a84cd590d57f4cbd4f61e9/artifacts/config/tmp2ws2jq0x.json', 'predictions': '/home/toniq/work/datascience_template/src/experiments/mlruns/2/248a944993a84cd590d57f4cbd4f61e9/artifacts/predictions/tmpaf1oapgy.json'}


### Get the Run Configuration

In [26]:

# Load the JSON configuration stored under the "config" artifact of the best run.
# The file handle is deliberately not named `f`: in this notebook `f` is the alias of
# `pyspark.sql.functions`, and rebinding it here would break any later `f.<function>` call.
with open(artifact2path["config"], "r") as config_file:
    run_config = json.load(config_file)

{'name': 'GBTClassifier',
 'params': {'maxBins': 55.0, 'maxDepth': 5.0, 'minInstancesPerNode': 58.0},
 'type': 'classification'}

### Get the Run Predictions

In [32]:
# Read the JSON file stored under the "predictions" artifact into a pandas DataFrame.
run_predictions = pd.read_json(artifact2path["predictions"])

Unnamed: 0,label,probability,prediction
0,0,"[0.952287674, 0.0477123335]",0
1,0,"[0.9599270225000001, 0.0400729962]",0
2,1,"[0.6862932444000001, 0.3137067556]",0
3,0,"[0.7930009961000001, 0.20699898900000002]",0
4,1,"[0.5173786879, 0.48262131210000003]",0
...,...,...,...
16276,0,"[0.7340903878, 0.265909642]",0
16277,0,"[0.9448387623000001, 0.055161234]",0
16278,0,"[0.2734720409, 0.7265279889]",1
16279,0,"[0.9550402164, 0.044959787300000005]",0


## Get the Model from the Experiment

In [37]:
# TODO(fix): saved models are empty, so loading the persisted model stays disabled below.

# get the model_config from the "model" field in run_config
# NOTE(review): the run-config output rendered earlier in this notebook shows
# "name"/"params"/"type" at the top level — confirm that a "model" key really exists.
model_config = run_config["model"]

# resolve the model class: pyspark.ml.<type>.<name>
model_module = getattr(pyspark.ml, model_config["type"])
model_class = getattr(model_module, model_config["name"])

# initialize the object with the hyperparams ("params" field) from the model_config
model = model_class(**model_config["params"])

# load the model from the artifact store in mlflow
# NOTE(review): in PySpark `load` is a classmethod that returns a new object, so its result
# must be assigned when this line is re-enabled — confirm which class matches what was saved.
#model.load(artifact2path["model"])