In [1]:
from tpot import TPOTRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

In [2]:
df_x = pd.read_csv('data/train_X_beta.csv', index_col=0)
df_y = pd.read_csv('data/train_y_beta.csv', index_col=0)
df_x_test = pd.read_csv('data/test_X_beta.csv', index_col=0)
df_y_test = pd.read_csv('data/test_y_beta.csv', index_col=0)

In [3]:
df_x.head()

Unnamed: 0,1,2,9,10,12,13,14,15,16,18,...,100628307,100630918,100631378,100642175,100652770,100652791,100653022,100689073,100689074,100820829
hct15,3.500325,3.756189,7.506203,4.122036,4.173129,3.503203,9.26822,3.81104,9.685387,5.303841,...,4.092048,5.094173,3.838323,5.096992,4.676327,3.679293,4.23113,3.556815,3.419656,5.074979
jhh4,3.966084,3.634253,5.373,4.021875,4.188665,3.537557,8.98712,3.523175,9.438923,3.700196,...,4.17885,5.46223,3.554032,5.974853,4.54004,3.465602,4.179827,3.654634,6.282016,4.772049
snu1105,5.987927,3.735161,6.718164,4.358988,4.054809,3.826454,8.876455,3.604815,9.886685,4.002146,...,3.304983,4.850331,3.608216,5.427099,4.660967,3.28227,4.358016,3.74716,3.373917,5.400448
j82,4.717745,3.816445,8.32989,4.542543,4.443213,3.834258,8.612918,3.683737,10.086658,5.139002,...,3.62755,5.37515,3.474723,4.517609,4.268056,3.476972,3.994838,3.683059,3.375735,5.427906
dang,4.099564,3.677475,7.171958,3.949591,3.837196,8.14068,8.946058,3.357206,9.466033,3.623898,...,3.736106,5.234944,3.462128,6.323425,4.947185,3.177413,3.775169,3.672165,3.733566,7.274285


In [4]:
df_y.head()

Unnamed: 0,Beta
hct15,0.000259
jhh4,0.0202
snu1105,0.006825
j82,0.01334
dang,0.03424


In [5]:
df_y['Beta'] = df_y['Beta'].apply(lambda x: x*100)
df_y.head()

Unnamed: 0,Beta
hct15,0.02588
jhh4,2.02
snu1105,0.6825
j82,1.334
dang,3.424


In [6]:
# TPOT setup
GENERATIONS = 3
POP_SIZE = 100
CV = 3

tpot = TPOTRegressor(
    generations=GENERATIONS,
    population_size=POP_SIZE,
    random_state=123,
    n_jobs=1, # cuML requires n_jobs=1
    cv=CV,
    verbosity=2,
)

tpot.fit(df_x, df_y['Beta'])

preds = tpot.predict(df_x_test)/100
print(r2_score(df_y_test, preds))

# Generation 1 - Current best internal CV score: -1.29947159128464
#
# Generation 2 - Current best internal CV score: -1.286105279456808
#
# Generation 3 - Current best internal CV score: -1.285009615623126
#
# Best pipeline: ElasticNetCV(Normalizer(DecisionTreeRegressor(RobustScaler(input_matrix), max_depth=9, min_samples_leaf=14, min_samples_split=3), norm=l1), l1_ratio=0.2, tol=0.01)

Optimization Progress:   0%|          | 0/400 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -1.29947159128464

Generation 2 - Current best internal CV score: -1.286105279456808

Generation 3 - Current best internal CV score: -1.285009615623126

Best pipeline: ElasticNetCV(Normalizer(DecisionTreeRegressor(RobustScaler(input_matrix), max_depth=9, min_samples_leaf=14, min_samples_split=3), norm=l1), l1_ratio=0.2, tol=0.01)


ValueError: could not convert string to float: 'snu466'

In [7]:
tpot.export('tpot_regression_cuml_pipeline.py')
print(tpot.export())

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer, RobustScaler
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: -1.285009615623126
exported_pipeline = make_pipeline(
    RobustScaler(),
    StackingEstimator(estimator=DecisionTreeRegressor(max_depth=9, min_samples_leaf=14, min_samples_split=3)),
    Normalizer(