# Composite AutoML

In [1]:
import pandas as pd

df = pd.read_csv('tmp/extracted_data_processed.csv')
X = df.iloc[:,:-1]
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]
y = df.iloc[:,-1]

X.head()

Unnamed: 0,id,Zone16_Pressure__ratio_beyond_r_sigma__r_0.5,Zone26_Pressure__number_cwt_peaks__n_5,Zone4_Pressure__quantile__q_0.8,Zone7_Pressure__quantile__q_0.8,Zone31_Pressure__kurtosis,"Zone9_Pressure__fft_coefficient__coeff_31__attr_""imag""",Zone15_Pressure__kurtosis,Zone15_Pressure__quantile__q_0.7,Zone23_Pressure__skewness,...,"Zone14_Pressure__agg_linear_trend__f_agg_""mean""__chunk_len_50__attr_""stderr""","Zone10_Pressure__fft_coefficient__coeff_88__attr_""abs""","Zone31_Pressure__fft_coefficient__coeff_39__attr_""imag""",Zone24_Pressure__skewness,Zone17_Pressure__time_reversal_asymmetry_statistic__lag_1,"Zone31_Pressure__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","Zone2_Pressure__augmented_dickey_fuller__attr_""teststat""","Zone31_Pressure__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0",Zone26_Pressure__ratio_value_number_to_time_series_length,"Zone9_Pressure__fft_coefficient__coeff_30__attr_""imag"""
0,0,0.152859,26.0,90414.74,23210.18,1.178,201211.715148,0.180995,18795.44,1.207939,...,1112.369466,76492.559938,255785.698246,2.473288,477375900000.0,135393.08443,-1.086534,85.208147,0.325554,244955.6795
1,2,0.09417,23.0,93441.16,21939.86,2.6417,180946.077666,1.282848,16444.71,1.487467,...,566.148649,74394.719607,-98428.832501,3.445868,552242900000.0,30438.276291,-2.317037,42.94978,0.382287,212275.353946
2,5,0.136808,33.0,43145.8,12085.7,1.567874,432280.949243,0.114999,11725.8,1.189575,...,604.656515,126717.303704,70494.515982,3.155242,616169100000.0,78916.292213,0.628782,50.660506,0.378936,399243.625839
3,11,0.112735,25.0,41329.9,13174.0,3.252763,207945.724593,2.822502,10678.11,1.811675,...,322.081596,104570.553883,245829.624233,2.818438,608691100000.0,27339.274344,0.894001,34.879843,0.41023,193371.775057
4,12,0.148026,24.0,38397.98,13821.24,-0.219816,274288.012114,0.325279,13051.98,1.08462,...,304.019976,82506.913319,238733.932588,2.321429,483027800000.0,23691.742285,2.030197,50.837662,0.33114,256213.407809


In [2]:
# code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

seed = 0
folds = 10
algorithm = RandomForestClassifier(n_estimators=100, random_state=seed)

def get_score(X, y):    
    # Cross Validation
    score = cross_val_score(algorithm, X, y, cv=folds).mean()
    print("Cross Validation Score: "+ str(score*100) + "%") 
    # Train-Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    print(classification_report(y_test, y_pred))

get_score(X, y)

Cross Validation Score: 76.5%
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        34
           1       0.80      0.75      0.77        32

    accuracy                           0.79        66
   macro avg       0.79      0.79      0.79        66
weighted avg       0.79      0.79      0.79        66



### TPOT

In [3]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3)

# generations: int, optional (default=100)
#    Number of iterations to the run pipeline optimization process. Must be a positive number.
#    Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
#    TPOT will evaluate population_size + generations × offspring_size pipelines in total. 
tpot = TPOTClassifier(generations=50, population_size=50, n_jobs=8, verbosity=2)
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=2550, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: 0.792391899288451
Generation 2 - Current best internal CV score: 0.7990421455938698
Generation 3 - Current best internal CV score: 0.8128352490421455
Generation 4 - Current best internal CV score: 0.8135924101441343
Generation 5 - Current best internal CV score: 0.8135924101441343
Generation 6 - Current best internal CV score: 0.8135924101441343
Generation 7 - Current best internal CV score: 0.8135924101441343
Generation 8 - Current best internal CV score: 0.8135924101441343
Generation 9 - Current best internal CV score: 0.8212643678160919
Generation 10 - Current best internal CV score: 0.8212643678160919
Generation 11 - Current best internal CV score: 0.8212643678160919
Generation 12 - Current best internal CV score: 0.8212643678160919
Generation 13 - Current best internal CV score: 0.8215106732348112
Generation 14 - Current best internal CV score: 0.8215106732348112
Generation 15 - Current best internal CV score: 0.8215106732348112
Gener

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=50,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=8, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=50,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [4]:
tpot.score(X_test, y_test)

0.8666666666666667

In [5]:
tpot.export('tmp/tpot_composite_pipeline.py')