<a href="https://colab.research.google.com/github/swilsonmfc/automl/blob/main/TPOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TPOT

![](https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-logo.jpg)

# Install

In [15]:
!pip install tpot



# Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from tpot import TPOTClassifier
from tpot import TPOTRegressor

ModuleNotFoundError: ignored

# TPOT

![](https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-ml-pipeline.png)

## Example Pipeline

![](https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-pipeline-example.png)

# Classification

## Fit

In [17]:
# Hold out 25% of the iris data for a final check; TPOT only sees the
# training portion while it searches for a pipeline.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data.astype(np.float64),
    iris.target.astype(np.float64),
    test_size=0.25,
    random_state=42)  # fixed seed so the split is identical on every run

# TPOT's genetic search is stochastic; seeding it makes the selected
# pipeline repeatable across Restart & Run All.
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2,
                      random_state=42)

tpot.fit(X_train, y_train)
# Score of the best pipeline found, measured on the held-out split.
print(tpot.score(X_test, y_test))
# Write the best pipeline out as a standalone Python script.
tpot.export('tpot_iris_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.9826086956521738

Generation 2 - Current best internal CV score: 0.9826086956521738

Generation 3 - Current best internal CV score: 0.9826086956521738

Generation 4 - Current best internal CV score: 0.9826086956521738

Generation 5 - Current best internal CV score: 0.9826086956521738

Best pipeline: MLPClassifier(input_matrix, alpha=0.001, learning_rate_init=0.1)
0.9736842105263158


## Inspect

In [18]:
# Print the script that tpot.export('tpot_iris_pipeline.py') wrote in the Fit cell.
!cat tpot_iris_pipeline.py

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9826086956521738
exported_pipeline = MLPClassifier(alpha=0.001, learning_rate_init=0.1)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


## Compare

In [19]:
# Empty comparison table: one row per model is added by the cells below.
metric_names = ['Accuracy']
results_df = pd.DataFrame(columns=metric_names)

In [20]:
# Baseline: a plain logistic regression fit on the same training split.
lr = LogisticRegression(solver='lbfgs', multi_class='auto')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
score = accuracy_score(y_test, pred)
results_df.loc['Logistic Regression'] = [score]

In [21]:
# Accuracy of the pipeline TPOT selected, on the same held-out split as the baseline.
pred = tpot.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
score = accuracy_score(y_test, pred)
results_df.loc['TPOT'] = [score]

In [22]:
# Accuracy of each model on the held-out test split, one row per model.
results_df

Unnamed: 0,Accuracy
Logistic Regression,0.973684
TPOT,0.973684


# Regression

## Fit

In [23]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn release.
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data.astype(np.float64),
    boston.target.astype(np.float64),
    test_size=0.25,
    random_state=42)  # fixed seed so the split is identical on every run

# TPOT's genetic search is stochastic; seeding it makes the selected
# pipeline repeatable across Restart & Run All.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)
# TPOTRegressor scores with a negated error by default (the CV scores logged
# during fit are negative), so flip the sign to print a positive error.
print(-tpot.score(X_test, y_test))
# Write the best pipeline out as a standalone Python script.
tpot.export('tpot_boston_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: -14.324537700543285

Generation 2 - Current best internal CV score: -13.75463485568186

Generation 3 - Current best internal CV score: -13.604432474775468

Generation 4 - Current best internal CV score: -13.604432474775468

Generation 5 - Current best internal CV score: -13.115636754240995

Best pipeline: ExtraTreesRegressor(ExtraTreesRegressor(ExtraTreesRegressor(input_matrix, bootstrap=False, max_features=0.55, min_samples_leaf=2, min_samples_split=14, n_estimators=100), bootstrap=False, max_features=0.55, min_samples_leaf=2, min_samples_split=14, n_estimators=100), bootstrap=False, max_features=0.55, min_samples_leaf=2, min_samples_split=18, n_estimators=100)
6.230472175142373


## Inspect

In [24]:
# Print the script that tpot.export('tpot_boston_pipeline.py') wrote in the Fit cell.
!cat tpot_boston_pipeline.py

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -13.115636754240995
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=2, min_samples_split=14, n_estimators=100)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=2, min_samples_split=14, n_estimators=100)

## Compare

In [25]:
# Empty comparison table for the regression models: one row per model,
# filled in by the cells below.
metric_names = ['MAE', 'MSE']
results_df = pd.DataFrame(columns=metric_names)

In [26]:
# Baseline: a 100-tree random forest fit on the same training split.
rf = RandomForestRegressor(n_estimators=100, random_state=12)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
results_df.loc['RandomForest'] = [mae, mse]

In [27]:
# Errors of the pipeline TPOT selected, on the same held-out split as the baseline.
pred = tpot.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
results_df.loc['TPOT'] = [mae, mse]

In [28]:
# MAE and MSE of each model on the held-out test split, one row per model.
results_df

Unnamed: 0,MAE,MSE
RandomForest,2.224024,9.38671
TPOT,1.807776,6.230472


# Algorithms

* sklearn.naive_bayes.BernoulliNB: 
 * alpha: [1e-3, 1e-2, 1e-1, 1., 10., 100.]
 * fit_prior: [True, False]
* sklearn.naive_bayes.MultinomialNB: 
 * alpha: [1e-3, 1e-2, 1e-1, 1., 10., 100.]
 * fit_prior: [True, False]
* sklearn.tree.DecisionTreeClassifier: 
 * criterion: [“gini”, “entropy”]
 * max_depth: range(1, 11)
 * min_samples_split: range(2, 21)
 * min_samples_leaf: range(1, 21)
* sklearn.ensemble.ExtraTreesClassifier: 
 * n_estimators: [100]
 * criterion: [“gini”, “entropy”]
 * max_features: np.arange(0.05, 1.01, 0.05)
 * min_samples_split: range(2, 21)
 * min_samples_leaf: range(1, 21)
 * bootstrap: [True, False]
* sklearn.ensemble.RandomForestClassifier: 
 * n_estimators: [100]
 * criterion: [“gini”, “entropy”]
 * max_features: np.arange(0.05, 1.01, 0.05)
 * min_samples_split: range(2, 21)
 * min_samples_leaf: range(1, 21)
 * bootstrap: [True, False] 
* sklearn.ensemble.GradientBoostingClassifier:
 * n_estimators: [100]
 * learning_rate: [1e-3, 1e-2, 1e-1, 0.5, 1.]
 * max_depth: range(1, 11)
 * min_samples_split: range(2, 21)
 * min_samples_leaf: range(1, 21)
 * subsample: np.arange(0.05, 1.01, 0.05)
 * max_features: np.arange(0.05, 1.01, 0.05)
* sklearn.neighbors.KNeighborsClassifier:
 * n_neighbors: range(1, 101)
 * weights: [“uniform”, “distance”]
 * p: [1, 2]
* sklearn.svm.LinearSVC:
 * penalty: [“l1”, “l2”]
 * loss: [“hinge”, “squared_hinge”]
 * dual: [True, False]
 * tol: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
 * C: [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
* sklearn.linear_model.LogisticRegression:
 * penalty: [“l1”, “l2”]
 * C: [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
 * dual: [True, False]
* xgboost.XGBClassifier: 
 * n_estimators: [100]
 * max_depth: range(1, 11)
 * learning_rate: [1e-3, 1e-2, 1e-1, 0.5, 1.]
 * subsample: np.arange(0.05, 1.01, 0.05)
 * min_child_weight: range(1, 21)
 * nthread: [1] 

# Feature Engineering

* sklearn.preprocessing.Binarizer:
 * threshold: np.arange(0.0, 1.01, 0.05)
* sklearn.decomposition.FastICA:
 * tol: np.arange(0.0, 1.01, 0.05)
* sklearn.cluster.FeatureAgglomeration:
 * linkage: [‘ward’, ‘complete’, ‘average’]
 * affinity: [‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’, ‘cosine’]
* sklearn.preprocessing.MaxAbsScaler 
* sklearn.preprocessing.MinMaxScaler
* sklearn.preprocessing.Normalizer:
 * norm: [‘l1’, ‘l2’, ‘max’]
* sklearn.kernel_approximation.Nystroem:
 * kernel: [‘rbf’, ‘cosine’, ‘chi2’, ‘laplacian’, ‘polynomial’, ‘poly’, ‘linear’, ‘additive_chi2’, ‘sigmoid’]
 * gamma: np.arange(0.0, 1.01, 0.05)
 * n_components: range(1, 11) 
* sklearn.decomposition.PCA:
 * svd_solver: [‘randomized’]
 * iterated_power: range(1, 11) 
* sklearn.preprocessing.PolynomialFeatures:
 * degree: [2]
 * include_bias: [False]
 * interaction_only: [False] 
* sklearn.kernel_approximation.RBFSampler:
 * gamma: np.arange(0.0, 1.01, 0.05)
* sklearn.preprocessing.RobustScaler
* sklearn.preprocessing.StandardScaler
* tpot.builtins.ZeroCount
* tpot.builtins.OneHotEncoder:
 * minimum_fraction: [0.05, 0.1, 0.15, 0.2, 0.25]
 * sparse: [False]

# Tips
* Prepare to wait - TPOT has a lot of models to work through
 * Control number of generations & population
 * Separate runs can recommend 2 or more different models if the search has not converged
 * TPOT does offer a pause and resume method
* Reproducibility - TPOT's search is random, so set `random_state` to get repeatable results
* TPOT has its own OHE
 * < 10 unique values are treated as categorical
 * You can encode your own and feed into TPOT
* TPOT isn't the only place you'll see Genetic Algorithms paired with ML
 * Neural Networks (Nodes & Layers)
* Genetic Algorithms aren't the only Meta-Heuristic