## Using TPOT for AutoML

### Building the Pipeline Optimizer

In [1]:
# Importing necessary tools and libraries
import tpot
from tpot import TPOTClassifier
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
# Initializing our TPOT pipeline optimizer
pipeline_optimizer = TPOTClassifier(generations=5, 
                                    verbosity=2, 
                                    config_dict="TPOT light")

### Preparing Data for Training

In [3]:
# Loading a dataset for training
data = datasets.load_breast_cancer()

# Splitting our data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data["data"], 
                                                    data["target"], 
                                                    test_size=0.2, 
                                                    stratify=data["target"])

### Training and Evaluating Our AutoML Algorithm

In [4]:
# Training the AutoML algorithm
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/600 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9648351648351647

Generation 2 - Current best internal CV score: 0.9824175824175825

Generation 3 - Current best internal CV score: 0.9824175824175825

Generation 4 - Current best internal CV score: 0.9824175824175825

Generation 5 - Current best internal CV score: 0.9846153846153847

Best pipeline: LogisticRegression(RobustScaler(CombineDFs(input_matrix, input_matrix)), C=0.1, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', generations=5, verbosity=2)

In [5]:
# Evaluating performance on the test set
print(pipeline_optimizer.score(X_test, y_test))

0.9736842105263158


In [6]:
# Viewing the best pipeline
print(pipeline_optimizer.fitted_pipeline_)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('functiontransformer-1',
                                                 FunctionTransformer(func=<function copy at 0x000001897760FCA0>)),
                                                ('functiontransformer-2',
                                                 FunctionTransformer(func=<function copy at 0x000001897760FCA0>))])),
                ('robustscaler', RobustScaler()),
                ('logisticregression', LogisticRegression(C=0.1))])


In [7]:
# Viewing all evaluated pipelines
print(pipeline_optimizer.evaluated_individuals_)

{'LogisticRegression(input_matrix, LogisticRegression__C=0.1, LogisticRegression__dual=False, LogisticRegression__penalty=l2)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 1, 'internal_cv_score': 0.9428571428571428}, 'BernoulliNB(ZeroCount(input_matrix), BernoulliNB__alpha=0.001, BernoulliNB__fit_prior=True)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 2, 'internal_cv_score': 0.6263736263736264}, 'DecisionTreeClassifier(GaussianNB(input_matrix), DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=1, DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=7)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 2, 'internal_cv_score': 0.9472527472527472}, 'MultinomialNB(Normalizer(input_matrix, Normalizer__norm=l2), MultinomialNB__alpha=1.0, MultinomialNB__fit

### Exporting Best Pipeline Code

In [8]:
# Exporting the pipeline's code
pipeline_optimizer.export('tpot_pipeline.py')

In [None]:
### CODE GENERATED BY TPOT ###
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', 
                        sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], 
                             random_state=None)

# Average CV score on the training set was: 0.9846153846153847
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    RobustScaler(),
    LogisticRegression(C=0.1, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

### Doing Inference with the TPOT Pipeline

In [9]:
# Predicting class names
y_pred = pipeline_optimizer.predict(X_test)

# Viewing predictions
print(y_pred[:10])

[1 0 1 0 0 1 0 1 1 0]


### Creating Custom TPOT Configurations

In [10]:
# Creating a custom TPOT config
tpot_config = {"sklearn.linear_model.SGDClassifier": {
                    "loss": ["log", "hinge", "modified_huber"], 
                    "penalty": ["elasticnet", "l2"]},
               "sklearn.tree.DecisionTreeClassifier": {}}

In [11]:
# Creating a pipeline optimizer with a custom config
pipeline_optimizer = TPOTClassifier(generations=5, verbosity=2, config_dict=tpot_config)

In [12]:
# Training the optimizer with the custom config
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/600 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9362637362637363

Generation 2 - Current best internal CV score: 0.9362637362637363

Generation 3 - Current best internal CV score: 0.9362637362637363

Generation 4 - Current best internal CV score: 0.9406593406593406

Generation 5 - Current best internal CV score: 0.9406593406593406

Best pipeline: DecisionTreeClassifier(CombineDFs(input_matrix, SGDClassifier(input_matrix, loss=modified_huber, penalty=l2)))


TPOTClassifier(config_dict={'sklearn.linear_model.SGDClassifier': {'loss': ['log',
                                                                            'hinge',
                                                                            'modified_huber'],
                                                                   'penalty': ['elasticnet',
                                                                               'l2']},
                            'sklearn.tree.DecisionTreeClassifier': {}},
               generations=5, verbosity=2)

### Setting Custom Pipeline Templates

In [13]:
# Creating a custom pipeline template
tpot_template = "Selector-Transformer-Classifier"

In [14]:
# Creating a custom pipeline template with a specific selector
tpot_template = "SelectFwe-Transformer-Classifier"

In [15]:
# Creating a pipeline optimizer with a custom config
pipeline_optimizer = TPOTClassifier(generations=5, verbosity=2, 
                                    config_dict="TPOT light",
                                    template=tpot_template)

In [16]:
# Training the optimizer with the custom config
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/600 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9780219780219781

Generation 2 - Current best internal CV score: 0.9780219780219781

Generation 3 - Current best internal CV score: 0.9780219780219781

Generation 4 - Current best internal CV score: 0.9780219780219781

Generation 5 - Current best internal CV score: 0.9780219780219781

Best pipeline: LogisticRegression(StandardScaler(SelectFwe(input_matrix, alpha=0.004)), C=0.5, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', generations=5,
               template='SelectFwe-Transformer-Classifier', verbosity=2)

In [17]:
# Viewing the best pipeline
print(pipeline_optimizer.fitted_pipeline_)

Pipeline(steps=[('selectfwe', SelectFwe(alpha=0.004)),
                ('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.5))])
