In [1]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split, cross_val_score
# from tpot.config.classifier_nn import classifier_config_nn

from sklearn.pipeline import make_pipeline
from tpot.config import classifier_config_dict_light
from tpot.config import classifier_config_dict
from sklearn.neighbors import KNeighborsClassifier


import pandas as pd
import numpy as np
import os
import glob

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
pd.options.mode.chained_assignment = None

Using TensorFlow backend.


In [21]:
personal_config = classifier_config_dict_light
personal_config = classifier_config_dict
personal_config['tpot.builtins.SimpleAutoencoder'] = {
    'encoding_dim': [10],
    'activation': ['relu'],
    'optimizer': ['adadelta'],
    'loss':['binary_crossentropy'],
    'epochs':[100],
    'batch_size':[200]
}

In [22]:
#one hot encoding function

#function that takes in a features pandas dataframe and turns it into a numpy array 
#with only the categorical variables one-hot encoded (and leaving out one of the features 
#of every one-hot encoded variable as baseline)
#@param:  features is a pandas df of features; threshold number is the number of unique values 
#a variable should have in order to be considered categorical
#@return:  a numpy matrix containing the original dataset but with the categorical variables
#one hot encoded and one of the one hot encoded features per cat variable left out; 
#for example an original feature set of 10 categorical variables with three categories each will 
#be transoformed into a numpy array with 20 dichotomous variables

def one_hot_encode(features, cat_threshold_number = 5):
    num_unique_vals_dict = {}
    feature_names = list(features)
    for feature in feature_names:
        label_encoder = LabelEncoder()
        features.loc[:, feature] = label_encoder.fit_transform(features.loc[:, feature])
        num_unique_vals_dict[feature] = len(label_encoder.classes_)

    features_to_onehot = []
    for feature in num_unique_vals_dict:
        if num_unique_vals_dict[feature] <= cat_threshold_number and num_unique_vals_dict[feature] > 1:
            features_to_onehot = features_to_onehot + [feature]

    #create index array listing indices in orginal feature names array that are present in features_to_onehot
    indices_to_onehot = np.nonzero(np.in1d(feature_names, features_to_onehot))

    onehot_encoder = OneHotEncoder(categorical_features = indices_to_onehot, sparse = False)
    features = onehot_encoder.fit_transform(features)

    idx_to_delete = np.cumsum([0] + list(num_unique_vals_dict.values()))

    idx_to_keep = [i for i in range(features.shape[1]) if i not in idx_to_delete]

    features = features[:, idx_to_keep]

    return features

In [23]:
#read in data
data = pd.read_table("sample_data.txt")
data = data.iloc[:, :11]
dv = data.iloc[:, 10:11]
features = data.iloc[:, :10]

In [24]:
#one hot encode data, then split it into training and validation
X = one_hot_encode(features)
y = dv
y = pd.np.array(y).ravel()
#split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.33, random_state = 42)
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_val = x_val.reshape((len(x_val)), np.prod(x_val.shape[1:]))

In [25]:
x_train.shape

(670, 20)

In [26]:
y_train.shape

(670,)

In [27]:
from tpot.builtins import SimpleAutoencoder

In [28]:
sae = SimpleAutoencoder(encoding_dim=10, 
                        activation='relu', 
                        optimizer='adadelta', 
                        loss='binary_crossentropy', 
                        epochs=100, batch_size=200)

In [29]:
sae.fit(x_train, y_train)

SimpleAutoencoder(activation='relu', batch_size=200, encoding_dim=10,
         epochs=100, loss='binary_crossentropy', optimizer='adadelta',
         random_state=42)

In [11]:
sae.transform(x_train).shape

(670, 3)

In [12]:
x_train.shape

(670, 20)

In [30]:
tpot = TPOTClassifier(generations=10, config_dict=personal_config,
                        population_size=10, verbosity=3,
                        template = 'SimpleAutoencoder-Classifier')

In [None]:
tpot.fit(x_train, y_train)

31 operators have been imported by TPOT.


Skipped pipeline #10 due to time out. Continuing to the next pipeline.


In [15]:
tpot.fitted_pipeline_

Pipeline(memory=None,
     steps=[('simpleautoencoder', SimpleAutoencoder(activation='relu', batch_size=200, encoding_dim=3,
         epochs=100, loss='binary_crossentropy', optimizer='adadelta',
         random_state=42)), ('randomforestclassifier', RandomForestClassifier(bootstrap=False, class_weight=None,
            crite...mators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [16]:
tpot.score(x_train, y_train)

0.7373134328358208

In [17]:
#compare with regular TPOT
tpot_reg = TPOTClassifier(generations=10, population_size=10, verbosity=3)
tpot_reg.fit(x_train, y_train)

31 operators have been imported by TPOT.


_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=True.
Generation 1 - Current Pareto front scores:
-1	0.739123264213602	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.8500000000000001, ExtraTreesClassifier__min_samples_leaf=7, ExtraTreesClassifier__min_samples_split=8, ExtraTreesClassifier__n_estimators=100)

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.
Generation 2 - Current Pareto front scores:
-1	0.739123264213602	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.8500000000000001, ExtraTreesClassifier__min_samples_leaf=7, ExtraTreesClassifier__min_sa

-2	0.8747700927276731	ExtraTreesClassifier(RFE(input_matrix, RFE__ExtraTreesClassifier__criterion=entropy, RFE__ExtraTreesClassifier__max_features=0.55, RFE__ExtraTreesClassifier__n_estimators=100, RFE__step=0.6000000000000001), ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=0.8500000000000001, ExtraTreesClassifier__min_samples_leaf=7, ExtraTreesClassifier__min_samples_split=8, ExtraTreesClassifier__n_estimators=100)
-3	0.8882238764406871	ExtraTreesClassifier(RFE(DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=18, DecisionTreeClassifier__min_samples_split=10), RFE__ExtraTreesClassifier__criterion=entropy, RFE__ExtraTreesClassifier__max_features=0.55, RFE__ExtraTreesClassifier__n_estimators=100, RFE__step=0.6000000000000001), ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=gini, ExtraTreesClas

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=10,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=10,
        random_state=None, scoring=None, subsample=1.0,
        template='RandomTree', use_dask=False, verbosity=3,
        warm_start=False)

In [18]:
tpot_reg.score(x_train, y_train)

0.9850746268656716