## Section 5.2

### 1. Building our own AutoML script using Optuna

In [1]:
import pandas as pd
import numpy as np
import impyute as impy

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, \
            StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, \
            chi2, f_classif, mutual_info_classif
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [3]:
data = pd.read_csv("../titanic/train.csv")

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
y = data['Survived']
X = data.drop('Survived', axis=1)

In [6]:
def label_encode_column(col):
    nans = col.isnull()
    nan_lst = []
    nan_idx_lst = []
    label_lst = []
    label_idx_lst = []

    for idx, nan in enumerate(nans):
        if nan:
            nan_lst.append(col[idx])
            nan_idx_lst.append(idx)
        else:
            label_lst.append(col[idx])
            label_idx_lst.append(idx)

    nan_df = pd.DataFrame(nan_lst, index=nan_idx_lst)
    label_df = pd.DataFrame(label_lst, index=label_idx_lst) 

    label_encoder = LabelEncoder()
    label_df = label_encoder.fit_transform(label_df.astype(str))
    label_df = pd.DataFrame(label_df, index=label_idx_lst)
    final_col = pd.concat([label_df, nan_df])
    
    return final_col.sort_index()

for column_name in X.columns:
    if str(X[column_name].dtype) == 'object':
        X[column_name] = label_encode_column(X[column_name])
        if len(X[column_name].unique()) > len(X)/3:
            X = X.drop(column_name, axis=1)

  return f(**kwargs)


In [7]:
def mice_imputer(data):
    data = data.to_numpy()
    imputed_data = impy.mice(data)
    imputed_data = pd.DataFrame(imputed_data)
    return imputed_data

def mean_imputer(data):
    imputer = SimpleImputer(strategy='mean')
    imputed_data = imputer.fit_transform(data)
    imputed_data = pd.DataFrame(imputed_data)
    return imputed_data

def feature_selector(X, y, k, algo='f_classif'):
    kbest = SelectKBest(eval(algo), k)
    X = kbest.fit_transform(X, y)
    X = pd.DataFrame(X)
    return X

def scaling(data, scaler='min_max'):
    if scaler=='min_max':
        scaled_data = MinMaxScaler().fit_transform(data)
    else:
        scaled_data = StandardScaler().fit_transform(data)
    scaled_data = pd.DataFrame(scaled_data)
    return scaled_data

In [8]:
def objective_func(trial):
    
    try:
        imputer = trial.suggest_categorical('impyter', ['mice', 'mean'])
        if imputer=='mice':
            imputed_X = mice_imputer(X)
        else:
            imputed_X = mean_imputer(X)

        fea_slct = trial.suggest_categorical('fea_slct', ['chi2', 'f_classif', 'mutual_info_classif'])
        no_feature_cols = trial.suggest_int('k', 3, len(X.columns))
        selected_features = feature_selector(imputed_X, y, no_feature_cols, fea_slct)

        scaler = trial.suggest_categorical('scaler', ['min_max', 'standard'])
        scaled_X = scaling(selected_features)
        
    except:
        return 0.0
    
    classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
    if classifier_name == "SVC":
        c = trial.suggest_loguniform("svc_c", 1e-2, 1e+11)
        gamma = trial.suggest_loguniform("svc_gamma", 1e-9, 1e+3)
        kernel = trial.suggest_categorical("svc_kernel", ['rbf','poly','rbf','sigmoid'])
        degree = trial.suggest_categorical("svc_degree", range(1,15))
        clf = SVC(C=c, gamma=gamma, kernel=kernel, degree=degree)
    else:
        algorithm = trial.suggest_categorical("algorithm", ['ball_tree', "kd_tree"])
        leaf_size = trial.suggest_categorical("leaf_size", range(1,50))
        metric = trial.suggest_categorical("metic", ["euclidean","manhattan", "chebyshev","minkowski"])
        clf = KNeighborsClassifier(algorithm=algorithm, leaf_size=leaf_size, metric=metric)
        
    clf.fit(scaled_X, y)
    val_acc = clf.score(scaled_X, y)
    
    return val_acc

study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective_func, n_trials=20)
best_trial = study.best_trial.value

print(f"Best trial  accuracy: {best_trial}")
print("parameters for best trial are :")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")

[I 2020-09-10 02:57:38,965] Trial 0 finished with value: 0.856341189674523 and parameters: {'impyter': 'mean', 'fea_slct': 'mutual_info_classif', 'k': 3, 'scaler': 'min_max', 'classifier': 'RandomForest', 'algorithm': 'ball_tree', 'leaf_size': 34, 'metic': 'chebyshev'}. Best is trial 0 with value: 0.856341189674523.
[I 2020-09-10 02:57:41,936] Trial 1 finished with value: 0.8754208754208754 and parameters: {'impyter': 'mice', 'fea_slct': 'mutual_info_classif', 'k': 5, 'scaler': 'min_max', 'classifier': 'RandomForest', 'algorithm': 'ball_tree', 'leaf_size': 39, 'metic': 'manhattan'}. Best is trial 1 with value: 0.8754208754208754.
[I 2020-09-10 02:57:43,549] Trial 2 finished with value: 0.8361391694725028 and parameters: {'impyter': 'mice', 'fea_slct': 'f_classif', 'k': 4, 'scaler': 'min_max', 'classifier': 'RandomForest', 'algorithm': 'kd_tree', 'leaf_size': 8, 'metic': 'manhattan'}. Best is trial 1 with value: 0.8754208754208754.
[I 2020-09-10 02:57:44,537] Trial 3 finished with value

Best trial  accuracy: 0.8754208754208754
parameters for best trial are :
impyter: mice
fea_slct: mutual_info_classif
k: 5
scaler: min_max
classifier: RandomForest
algorithm: ball_tree
leaf_size: 39
metic: manhattan


### 2. Using TPOT

In [9]:
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64),
    iris.target.astype(np.float64), train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_iris_pipeline.py')



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.9731225296442687
Generation 2 - Current best internal CV score: 0.9731225296442687
Generation 3 - Current best internal CV score: 0.9731225296442687
Generation 4 - Current best internal CV score: 0.982213438735178
Generation 5 - Current best internal CV score: 0.982213438735178
Best pipeline: GaussianNB(MLPClassifier(input_matrix, alpha=0.01, learning_rate_init=0.001))
0.9736842105263158
