```bash
## Convert the notebook to a Python script so it can be run from the terminal
$ jupyter nbconvert --to script x11_classifier_best_pipeline.ipynb
```

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

### Load libraries

In [3]:
# Custom scikit-learn like pipeline with additional functionalities
import pipeline as pl

In [4]:
# How the pipeline should be run for this project
from model.AdData import *
from model.AdFeatures import *
from model.AdRegress import TestPerformance

In [5]:
# Utilities
from model import hyperparams
from model import plotlib

### Initialize

In [6]:
# Label of this run; it is reused as the prefix of the results file name.
runName = 'x11_regressor_pl'
outputCsv = "%s_results.csv" % (runName,)

# Plot appearance: hand the shared font folder to plotlib, then apply the
# style sheet that sits next to this notebook.
plotlib.load_fonts("../../../common/fonts/")
plt.style.use("matplotlib.mplstyle")

In [7]:
# Save outputs to log files
# pl.set_stderr("%s.errlog.txt" %runName, fout="%s.log.txt" %runName)

In [8]:
# Path of the imputed dataset, relative to this notebook.
inputCsv = "../Data/imputed_data.mice.csv"

# Column names to leave out of the features. In this notebook they are only
# referenced by the commented-out AllValidFeatures step of the grid.
ignoreXCols = [
    'imp', 'id', 'quality',
    'lobe', 'full', 'other',
    'coatingId',
]

# Initial hyperparameters of the best models

In [9]:
# The best models
# Template holding every keyword of the tuned XGBoost settings, in the order
# the final dictionaries keep. The four entries set to None are placeholders
# that each model overrides below.
_xgbTemplate = dict(
    base_score=0.5,
    booster='gbtree',
    callbacks=None,
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    early_stopping_rounds=None,
    enable_categorical=False,
    eval_metric=None,
    gamma=0.0,
    gpu_id=-1,
    grow_policy='depthwise',
    importance_type=None,
    interaction_constraints='',
    learning_rate=None,   # per-model value
    max_bin=256,
    max_cat_to_onehot=4,
    max_delta_step=0,
    max_depth=None,       # per-model value
    max_leaves=0,
    min_child_weight=1,
    monotone_constraints='()',
    n_estimators=None,    # per-model value
    n_jobs=0,
    num_parallel_tree=1,
    objective='reg:squarederror',
    predictor='auto',
    random_state=0,
    reg_alpha=None,       # per-model value
)


def _xgbParams(learning_rate, max_depth, n_estimators, reg_alpha):
    """Return a copy of the template with the four model-specific values set."""
    params = dict(_xgbTemplate)
    # Updating existing keys keeps their original position in the dict.
    params.update(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        reg_alpha=reg_alpha,
    )
    return params


# The three parameter sets differ only in these four values.
xgbest1 = _xgbParams(learning_rate=0.5, max_depth=8, n_estimators=3, reg_alpha=0.4)
xgbest2 = _xgbParams(learning_rate=0.1, max_depth=5, n_estimators=8, reg_alpha=0.4)
xgbest3 = _xgbParams(learning_rate=1, max_depth=7, n_estimators=3, reg_alpha=0.1)

# Feature columns handed to the grid through pl.Set(xCols=...).
bestXcols = [
    'lsfw1', 'lsfw2', 'lsfw3',
    'lspk2', 'lspk3',
    'teosVolPct', 'teosVolume',
    'tsfw1', 'tsfw2', 'tsfw3',
    'tspk1', 'tspk2', 'tspk3',
]

# Grid pipeline

In [20]:
# Build the data-loading step on the input CSV with resampling switched off.
loader = LoadData(csv=inputCsv, resample=False)
# Run the step once on its own (None is passed as its argument); the log
# printed below reports the test IDs/classes and that a train/test split
# was performed.
loader.Execute(None)
# Last expression of the cell: display the test DataFrame held by the loader.
loader.testdf

 -- LoadData() ... 
	 Test IDs: [28, 30, 26, 19, 14, 7]
	 Test classes: ['full', 'other', 'full', 'lobe', 'full', 'full']
	 Performed train/test split.



Unnamed: 0_level_0,imp,id,tspk1,tsfw1,lspk1,lsfw1,tspk2,tsfw2,lspk2,lsfw2,...,lspk3,lsfw3,teosVolume,teosVolPct,quality,lobe,full,other,coating,coatingId
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
trial_08,0,7,517,48.076381,717,88.568606,512.0,37.261999,720.0,92.623238,...,717.0,96.284753,125.0,20,4,0.0,0.744681,0.255319,full,0
trial_27,0,14,516,35.753805,796,101.53635,513.0,36.940846,811.0,94.240396,...,795.0,101.025205,125.0,20,7,0.3,0.233333,0.466667,other,1
trial_32,0,19,512,42.619045,809,99.312124,516.0,46.944391,822.0,87.107829,...,802.0,105.357169,125.0,20,8,0.0,0.848485,0.151515,full,0
Rowe2018_S2b,0,26,511,29.314961,798,100.062783,515.0,35.240813,829.0,89.313601,...,808.0,96.412917,50.0,5,8,1.0,0.0,0.0,lobe,2
trial_36,0,28,515,39.8732,789,109.4959,517.0,40.4525,806.0,114.4428,...,788.0,133.6731,50.0,20,4,0.0,1.0,0.0,full,0
trial_38,0,30,516,39.5148,784,112.6574,517.0,40.5124,804.0,115.3234,...,777.0,112.8985,200.0,5,7,0.0,1.0,0.0,full,0


In [61]:
# Step list handed to GridLine. Entries are listed in the order they appear in
# the execution log.
# NOTE(review): a nested list looks like a set of alternatives, one pipeline
# per entry — the three SetYCol choices match the "Initialized 3 pipelines."
# message printed when the grid is built. Confirm in the pipeline module.
grid = [
    # [loader for i in range(20)],
    # Reuse the loader instance created (and already executed) above.
    loader,
    ImputedData(),
    # One alternative per target column.
    [
        pl.SetYCol('full'),
        pl.SetYCol('lobe'),
        pl.SetYCol('other'),
    ],
    pl.Set(scoring='r2'), #scoring used by sklearn
    pl.DropCol('coating'),
    # pl.AllValidFeatures(ignoreCols=ignoreXCols),
    # Fixed feature list instead of the automatic selection disabled above.
    pl.Set(xCols = bestXcols),
    pl.AugmentByQuality(F=1.5, scale=0.3, qcol='quality'),
    AggregateFeatures(show=False),
    pl.SplitValidation(split_fraction=0.1),
    pl.ScaleX(allColumns=True),
    # pl.SetModel(DecisionTreeRegressor()),
    # pl.SelectFeaturesRFE(show=True),
    # Single algorithm choice: XGBRegressor started from the xgbest1 settings.
    [
        pl.SetAlgorithm(XGBRegressor, **xgbest1),
    ],
    pl.SearchHyperParams(hyperparams.space),
    # Performance is reported twice: once with use_validation, once with use_test.
    TestPerformance(show=False, use_validation=True),
    TestPerformance(show=False, use_test=True),
]

In [62]:
# Build the grid runner from the step list. The module is imported as
# `import pipeline as pl`, so GridLine has to be reached through the `pl`
# alias; that import does not bind the bare name `pipeline`, which made this
# cell depend on leftover kernel state or on a star import leaking the name.
pipe = pl.GridLine(grid)
pipe.Shuffle()  # shuffle the list, so we immediately have data for analysis
pipe.Save(outputCsv)  # save the results after each pipeline run

Initialized 3 pipelines.


In [63]:
# Run the pipelines of the grid, passing the input CSV path to Execute; the
# step-by-step log of each pipeline is printed below.
pipe.Execute(inputCsv)

Pipeline 01 of 03:
 -- LoadData() ... ok
 -- ImputedData() ... ok
 -- SetYCol: full ... 'full' ok
 -- Set: scoring ... 
	 {'scoring': 'r2'}

 -- DropCol: coating ... ok
 -- Set: xCols ... 
	 {'xCols': ['lsfw1', 'lsfw2', 'lsfw3', 'lspk2', 'lspk3', 'teosVolPct', 'teosVolume', 'tsfw1', 'tsfw2', 'tsfw3', 'tspk1', 'tspk2', 'tspk3']}

 -- AugmentByQuality: F=1 scale=0.30 ... 
	 Old shape: (120, 21), New shape: (350, 21)

 -- AggregateFeatures() ... ok
 -- SplitValidation: 0.10 ... 
	 Training shape: (282, 69), Validation shape: (35, 69)

 -- ScaleX: StandardScaler All: True ... 
	 Scaled all columns.

 -- SetAlgorithm: XGBRegressor ... ok
 -- SearchHyperParams() ... running randomized search ... 
	 5-fold CV HyperParam search for XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
	              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
	              early_stopping_rounds=None, enable_categorical=False,
	              eval_metric=None, gamma=0.0, gpu_id=-1, gro

In [64]:
# Collect the results of the executed pipelines into one table.
df = pipe.Summarize()
# Turn empty-string cells into NaN; this modifies `df` in place.
df[df == ""] = np.nan
# Display the rows whose `r2` is positive, highest first. This is display
# only — `df` itself stays unfiltered, so later cells still see every row.
df[df.r2 > 0.0].sort_values("r2", ascending=False)

Unnamed: 0,score,model,xcols,val_r2,val_rmse,r2,rmse,S01,S02,S03,...,S05,S06,S07,S08,S09,S10,S11,S12,S13,S14


In [65]:
# Turn every index label of the summary table into a zero-based position:
# drop the "L" characters, parse what is left as an integer, subtract one.
# Note that all rows of `df` are used here, not only the rows displayed above.
best_pipelines = [
    int(label.replace("L", "")) - 1
    for label in df.index
]

# Fetch the stored pipeline result at each of those positions.
best_models = [
    pipe.results[position]
    for position in best_pipelines
]

len(best_pipelines)

3

In [70]:
def ensemble_predict(best_models, Ts, col = None):
    """Average the regression predictions of several fitted models.

    For each entry of ``best_models`` the columns named in
    ``entry.model.feature_names_in_`` are taken from ``Ts``, standardized with
    a ``StandardScaler`` fitted on those same rows, and passed to
    ``entry.model.predict``.

    Args:
        best_models: Non-empty iterable of pipeline results. Each entry must
            expose ``yCol`` and a fitted ``model`` that provides
            ``feature_names_in_`` and ``predict``.
        Ts: DataFrame containing every feature column the models need, plus
            the reference column ``col``.
        col: Name of the column of ``Ts`` that holds the reference values.
            When omitted, the ``yCol`` of the last model is used.

    Returns:
        DataFrame indexed by the values of ``Ts[col]`` with one
        ``ml<i>_<yCol>`` column per model, an ``avg`` column (row-wise mean of
        the model columns) and an ``err`` column (index value minus ``avg``).

    Raises:
        ValueError: If ``best_models`` is empty.

    Notes:
        ``avg`` is taken over all models, whatever their ``yCol`` is; pass
        models fitted for the same target if a per-target average is wanted.
        NOTE(review): the scaler is re-fitted on ``Ts`` here, so the features
        may be scaled differently from what the models saw during training —
        confirm this is intended.
    """
    best_models = list(best_models)
    if not best_models:
        # Without models there is no default column and nothing to average.
        raise ValueError("ensemble_predict() needs at least one model")

    if col is None:
        # Same default as before: the target column of the last model.
        col = best_models[-1].yCol

    predictions = {}
    for position, result in enumerate(best_models):
        label = 'ml%d_%s' % (position, result.yCol)
        features = Ts[result.model.feature_names_in_]
        scaler = StandardScaler().fit(features)

        # Re-wrap the scaled array so the model receives named columns again.
        scaled = pd.DataFrame(
            scaler.transform(features),
            index=features.index,
            columns=features.columns,
        )
        predictions[label] = result.model.predict(scaled)

    pr = pd.DataFrame(predictions, index=Ts[col])
    pr = pr.assign(avg = lambda df: df.mean(axis=1))
    pr = pr.assign(err = lambda df: df.index - df.avg)
    return pr

In [73]:
# Run the ensemble on the loader's training frame, with the 'lobe' column as
# the reference (it becomes the index of the displayed table).
# NOTE(review): the columns of the output below are ml0_lobe, ml1_full and
# ml2_other, so `avg` and `err` mix models for three different targets while
# being compared against 'lobe' only — confirm this is intended.
ensemble_predict(best_models, loader.traindf, 'lobe')

Unnamed: 0_level_0,ml0_lobe,ml1_full,ml2_other,avg,err
lobe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.078947,0.096767,0.826873,0.179915,0.367851,-0.288904
0.040000,0.034378,0.573530,0.210470,0.272792,-0.232792
0.000000,0.034378,0.826873,0.153003,0.338084,-0.338084
0.000000,0.034378,0.858651,0.417473,0.436834,-0.436834
0.293103,0.301032,0.141830,0.328796,0.257219,0.035884
...,...,...,...,...,...
0.000000,0.034378,0.996750,0.012378,0.347835,-0.347835
0.000000,0.034378,0.996750,0.002653,0.344594,-0.344594
0.000000,0.034378,0.102599,0.016199,0.051059,-0.051059
0.000000,0.034378,0.899785,0.086935,0.340366,-0.340366
