In [1]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split, cross_val_score
# from tpot.config.classifier_nn import classifier_config_nn

from sklearn.pipeline import make_pipeline
from tpot.config import classifier_config_dict_light
from tpot.config import classifier_config_dict
from sklearn.neighbors import KNeighborsClassifier


import pandas as pd
import numpy as np
import os
import glob

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
pd.options.mode.chained_assignment = None

Using TensorFlow backend.


In [2]:
personal_config = classifier_config_dict_light
personal_config = classifier_config_dict
personal_config['tpot.builtins.SimpleAutoencoder'] = {
    'encoding_dim': [10],
    'activation': ['relu'],
    'optimizer': ['adadelta'],
    'loss':['binary_crossentropy'],
    'epochs':[100],
    'batch_size':[200]
}

In [3]:
#one hot encoding function

#function that takes in a features pandas dataframe and turns it into a numpy array 
#with only the categorical variables one-hot encoded (and leaving out one of the features 
#of every one-hot encoded variable as baseline)
#@param:  features is a pandas df of features; threshold number is the number of unique values 
#a variable should have in order to be considered categorical
#@return:  a numpy matrix containing the original dataset but with the categorical variables
#one hot encoded and one of the one hot encoded features per cat variable left out; 
#for example an original feature set of 10 categorical variables with three categories each will 
#be transoformed into a numpy array with 20 dichotomous variables

def one_hot_encode(features, cat_threshold_number = 5):
    num_unique_vals_dict = {}
    feature_names = list(features)
    for feature in feature_names:
        label_encoder = LabelEncoder()
        features.loc[:, feature] = label_encoder.fit_transform(features.loc[:, feature])
        num_unique_vals_dict[feature] = len(label_encoder.classes_)

    features_to_onehot = []
    for feature in num_unique_vals_dict:
        if num_unique_vals_dict[feature] <= cat_threshold_number and num_unique_vals_dict[feature] > 1:
            features_to_onehot = features_to_onehot + [feature]

    #create index array listing indices in orginal feature names array that are present in features_to_onehot
    indices_to_onehot = np.nonzero(np.in1d(feature_names, features_to_onehot))

    onehot_encoder = OneHotEncoder(categorical_features = indices_to_onehot, sparse = False)
    features = onehot_encoder.fit_transform(features)

    idx_to_delete = np.cumsum([0] + list(num_unique_vals_dict.values()))

    idx_to_keep = [i for i in range(features.shape[1]) if i not in idx_to_delete]

    features = features[:, idx_to_keep]

    return features

In [4]:
#read in data
data = pd.read_table("sample_data.txt")
data = data.iloc[:, :11]
dv = data.iloc[:, 10:11]
features = data.iloc[:, :10]

In [5]:
#one hot encode data, then split it into training and validation
X = one_hot_encode(features)
y = dv
y = pd.np.array(y).ravel()
#split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.33, random_state = 42)
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_val = x_val.reshape((len(x_val)), np.prod(x_val.shape[1:]))

In [6]:
x_train.shape

(670, 20)

In [7]:
y_train.shape

(670,)

In [8]:
from tpot.builtins import SimpleAutoencoder

In [9]:
sae = SimpleAutoencoder(encoding_dim=10, 
                        activation='relu', 
                        optimizer='adadelta', 
                        loss='binary_crossentropy', 
                        epochs=100, batch_size=200)

In [10]:
sae.fit(x_train, y_train)

SimpleAutoencoder(activation='relu', batch_size=200, encoding_dim=10,
         epochs=100, loss='binary_crossentropy', optimizer='adadelta',
         random_state=42)

In [11]:
sae.transform(x_train).shape

(670, 10)

In [12]:
x_train.shape

(670, 20)

In [13]:
tpot = TPOTClassifier(generations=10, config_dict=personal_config,
                        population_size=10, verbosity=3,
                        template = 'SimpleAutoencoder-Classifier')

In [14]:
tpot.fit(x_train, y_train)

31 operators have been imported by TPOT.


Generation 1 - Current Pareto front scores:
-2	0.5508027947148135	XGBClassifier(SimpleAutoencoder(input_matrix, SimpleAutoencoder__activation=relu, SimpleAutoencoder__batch_size=200, SimpleAutoencoder__encoding_dim=10, SimpleAutoencoder__epochs=100, SimpleAutoencoder__loss=binary_crossentropy, SimpleAutoencoder__optimizer=adadelta), XGBClassifier__learning_rate=0.5, XGBClassifier__max_depth=8, XGBClassifier__min_child_weight=1, XGBClassifier__n_estimators=100, XGBClassifier__nthread=1, XGBClassifier__subsample=0.9500000000000001)

Generation 2 - Current Pareto front scores:
-2	0.5508027947148135	XGBClassifier(SimpleAutoencoder(input_matrix, SimpleAutoencoder__activation=relu, SimpleAutoencoder__batch_size=200, SimpleAutoencoder__encoding_dim=10, SimpleAutoencoder__epochs=100, SimpleAutoencoder__loss=binary_crossentropy, SimpleAutoencoder__optimizer=adadelta), XGBClassifier__learning_rate=0.5, XGBClassifier__max_depth=8, XGBClassifier__min_child_weight=1, XGBClassifier__n_estimators=100

TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT..., 'optimizer': ['adadelta'], 'loss': ['binary_crossentropy'], 'epochs': [100], 'batch_size': [200]}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=10, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=None, periodic_checkpoint_folder=None,
        population_size=10, random_state=None, scoring=None, subsample=1.0,
        template='SimpleAutoencoder-Classifier', use_dask=False,
        verbosity=3, warm_start=False)

In [15]:
tpot.fitted_pipeline_

Pipeline(memory=None,
     steps=[('simpleautoencoder', SimpleAutoencoder(activation='relu', batch_size=200, encoding_dim=10,
         epochs=100, loss='binary_crossentropy', optimizer='adadelta',
         random_state=42)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsampl...=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9500000000000001))])

In [16]:
tpot.score(x_train, y_train)

  if diff:


1.0

In [19]:
tpot.score(x_val, y_val)

  if diff:


0.5272727272727272

In [18]:
#compare with regular TPOT
tpot_reg = TPOTClassifier(generations=10, population_size=10, verbosity=3)
tpot_reg.fit(x_train, y_train)

31 operators have been imported by TPOT.




TPOT closed prematurely. Will use the current best pipeline.


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=10,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=10,
        random_state=None, scoring=None, subsample=1.0,
        template='RandomTree', use_dask=False, verbosity=3,
        warm_start=False)

In [18]:
tpot_reg.score(x_train, y_train)

0.9850746268656716