# TPOT Restricted to Random Forest Classifiers

### Auto Dataset

In [1]:
import pandas as pd
import numpy as np
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from pmlb import fetch_data


In [2]:
# Load the 'auto' dataset through pmlb's fetch_data and preview the first five rows.
auto = fetch_data('auto')
auto.head(n=5)

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,target
0,26,11,1,0,2,2,1,0,93.7,157.3,...,1,4,19,9.4,42,16,31.0,38.0,119,2
1,3,17,1,1,1,3,1,0,99.1,186.6,...,5,25,10,9.0,23,16,19.0,26.0,70,2
2,17,19,1,0,2,0,2,0,98.4,176.2,...,5,30,28,9.3,9,8,24.0,30.0,62,2
3,19,9,1,0,2,0,2,0,96.6,180.3,...,5,22,12,8.3,21,7,16.0,18.0,92,3
4,51,20,1,0,2,0,1,0,94.5,159.3,...,5,13,24,8.5,54,16,24.0,29.0,15,3


In [3]:
# Feature table: every column of `auto` except the 'target' label column.
features_auto = auto.drop(columns='target')

In [4]:
# 75% / 25% train-test split of the 'auto' data; random_state pins the shuffle
# so the same rows land in each partition on every run.
(
    train_auto_features,
    test_auto_features,
    train_auto_target,
    test_auto_target,
) = train_test_split(
    features_auto,
    auto['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Custom TPOT configuration: the only operator offered to the search is
# RandomForestClassifier, with the candidate values listed per hyperparameter.
tpot_config = {
    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [1000],
        'criterion': ["gini", "entropy"],
        # np.arange accumulates floating-point error (the recorded runs show
        # values such as 0.35000000000000003); rounding to two decimals keeps
        # the grid at exactly 0.05, 0.15, ..., 0.95.
        'max_features': np.round(np.arange(0.05, 1.01, 0.1), 2),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    }
}

In [None]:

# Build a TPOT search limited to tpot_config, fit it on the 'auto' training
# split, then print its score on the held-out split.
# NOTE(review): the saved outputs of the later dataset sections report
# max=600 evaluations, which does not match generations=100 here (max=10100)
# -- confirm the intended search budget before re-running.
tpot = TPOTClassifier(
    generations=100,
    verbosity=2,
    config_dict=tpot_config,
    random_state=42,
)
tpot.fit(train_auto_features, train_auto_target)
auto_test_score = tpot.score(test_auto_features, test_auto_target)
print(auto_test_score)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…


Generation 1 - Current best internal CV score: 0.801290322580645
Generation 2 - Current best internal CV score: 0.801290322580645

In [9]:
# Export the result of the 'auto' TPOT run to a Python file.
auto_export_path = 'tpot_RandomForest_auto.py'
tpot.export(auto_export_path)

### Banana

In [10]:
# Load the 'banana' dataset through pmlb's fetch_data and preview the first five rows.
banana = fetch_data('banana')
banana.head(n=5)

Unnamed: 0,At1,At2,target
0,1.14,-0.114,-1.0
1,-1.52,-1.15,1.0
2,-1.05,0.72,-1.0
3,-0.916,0.397,1.0
4,-1.09,0.437,1.0


In [11]:
# Separate the feature columns from the 'target' label, then make a
# 75% / 25% train-test split with a fixed random_state.
features_banana = banana.drop(columns='target')
(
    train_banana_features,
    test_banana_features,
    train_banana_target,
    test_banana_target,
) = train_test_split(
    features_banana,
    banana['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Re-fit the existing `tpot` object on the 'banana' training split and print
# its score on the held-out split.
tpot.fit(train_banana_features, train_banana_target)
banana_test_score = tpot.score(test_banana_features, test_banana_target)
print(banana_test_score)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.9008805031446541
Generation 2 - Current best internal CV score: 0.9008805031446541
Generation 3 - Current best internal CV score: 0.901132075471698
Generation 4 - Current best internal CV score: 0.9018867924528301
Generation 5 - Current best internal CV score: 0.9018867924528301
Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.05, min_samples_leaf=1, min_samples_split=14, n_estimators=1000)
0.8950943396226415


In [13]:
# Export the result of the 'banana' TPOT run to a Python file.
banana_export_path = 'tpot_RandomForest_banana.py'
tpot.export(banana_export_path)

### Breast Cancer

In [14]:
# Load the 'breast-cancer' dataset through pmlb's fetch_data and preview the first five rows.
B_Cancer = fetch_data('breast-cancer')
B_Cancer.head(n=5)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,target
0,2,2,2,0,2,3,1,3,0,1
1,3,0,2,0,1,1,1,1,0,0
2,3,0,6,0,1,2,0,2,0,1
3,2,2,6,0,2,3,1,2,1,0
4,2,2,5,4,2,2,0,5,0,1


In [15]:
# Separate the feature columns from the 'target' label, then make a
# 75% / 25% train-test split with a fixed random_state.
features_B_Cancer = B_Cancer.drop(columns='target')
(
    train_cancer_features,
    test_cancer_features,
    train_cancer_target,
    test_cancer_target,
) = train_test_split(
    features_B_Cancer,
    B_Cancer['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [16]:
# Re-fit the existing `tpot` object on the breast-cancer training split,
# print its score on the held-out split, then export the result to a file.
tpot.fit(train_cancer_features, train_cancer_target)
cancer_test_score = tpot.score(test_cancer_features, test_cancer_target)
print(cancer_test_score)
cancer_export_path = 'tpot_RandomForest_Breast_Cancer.py'
tpot.export(cancer_export_path)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7428571428571429
Generation 2 - Current best internal CV score: 0.7428571428571429

Exception ignored in: <function WeakSet.__init__.<locals>._remove at 0x0000019EA2C6E558>
Traceback (most recent call last):
  File "C:\Users\waral\Anaconda3\lib\_weakrefset.py", line 38, in _remove
    def _remove(item, selfref=ref(self)):
stopit.utils.TimeoutException



Generation 3 - Current best internal CV score: 0.7476190476190476
Generation 4 - Current best internal CV score: 0.7476190476190476
Generation 5 - Current best internal CV score: 0.7476190476190476
Best pipeline: RandomForestClassifier(RandomForestClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=True, criterion=gini, max_features=0.05, min_samples_leaf=18, min_samples_split=3, n_estimators=1000), bootstrap=True, criterion=gini, max_features=0.25000000000000006, min_samples_leaf=1, min_samples_split=11, n_estimators=1000)
0.7916666666666666


### Contraceptive

In [17]:
# Load the 'contraceptive' dataset through pmlb's fetch_data and preview the first five rows.
contraceptive = fetch_data('contraceptive')
contraceptive.head(n=5)

Unnamed: 0,Wife_age,Wife_education,Husband_education,Children,Wife_religion,Wife_working,Husband_occupation,Standard-of-living,Media_exposure,target
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [18]:
# Feature table: every column of `contraceptive` except the 'target' label column.
features_contraceptive = contraceptive.drop(columns='target')


In [20]:
# 75% / 25% train-test split of the contraceptive data; random_state pins the
# shuffle so the same rows land in each partition on every run.
(
    train_contra_features,
    test_contra_features,
    train_contra_target,
    test_contra_target,
) = train_test_split(
    features_contraceptive,
    contraceptive['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [21]:
# Re-fit the existing `tpot` object on the contraceptive training split,
# print its score on the held-out split, then export the result to a file.
tpot.fit(train_contra_features, train_contra_target)
contra_test_score = tpot.score(test_contra_features, test_contra_target)
print(contra_test_score)
contra_export_path = 'tpot_RandomForest_contraception.py'
tpot.export(contra_export_path)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.5607075277663511
Generation 2 - Current best internal CV score: 0.5607075277663511

Exception ignored in: <function WeakSet.__init__.<locals>._remove at 0x0000019EA2C6E558>
Traceback (most recent call last):
  File "C:\Users\waral\Anaconda3\lib\_weakrefset.py", line 38, in _remove
    def _remove(item, selfref=ref(self)):
stopit.utils.TimeoutException



Generation 3 - Current best internal CV score: 0.5607075277663511
Generation 4 - Current best internal CV score: 0.5607075277663511
Generation 5 - Current best internal CV score: 0.5607075277663511
Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.35000000000000003, min_samples_leaf=14, min_samples_split=4, n_estimators=1000)
0.6070460704607046


### Pima 

In [28]:
# Load the 'pima' dataset through pmlb's fetch_data and preview the first five rows.
pima = fetch_data('pima')
pima.head(n=5)

Unnamed: 0,Pregnant,plasma glucose,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,target
0,4.0,117.0,62.0,12.0,0.0,29.7,0.38,30.0,1
1,4.0,158.0,78.0,0.0,0.0,32.9,0.803,31.0,1
2,2.0,118.0,80.0,0.0,0.0,42.9,0.693,21.0,1
3,13.0,129.0,0.0,30.0,0.0,39.9,0.569,44.0,1
4,5.0,162.0,104.0,0.0,0.0,37.7,0.151,52.0,1


In [29]:
# Separate the feature columns from the 'target' label, then make a
# 75% / 25% train-test split with a fixed random_state.
features_pima = pima.drop(columns='target')
(
    train_pima_features,
    test_pima_features,
    train_pima_target,
    test_pima_target,
) = train_test_split(
    features_pima,
    pima['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [30]:
# Re-fit the existing `tpot` object on the pima training split, print its
# score on the held-out split, then export the result to a file.
tpot.fit(train_pima_features, train_pima_target)
pima_test_score = tpot.score(test_pima_features, test_pima_target)
print(pima_test_score)
pima_export_path = 'tpot_RandomForest_pima.py'
tpot.export(pima_export_path)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7759970014992503
Generation 2 - Current best internal CV score: 0.7759970014992503
Generation 3 - Current best internal CV score: 0.7759970014992503
Generation 4 - Current best internal CV score: 0.7759970014992503
Generation 5 - Current best internal CV score: 0.7777211394302849
Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.35000000000000003, min_samples_leaf=10, min_samples_split=3, n_estimators=1000)), bootstrap=False, criterion=gini, max_features=0.45000000000000007, min_samples_leaf=20, min_samples_split=14, n_estimators=1000)
0.78125
