Attempting to run an AutoML pipeline for location A.

In [1]:
from tpot import TPOTRegressor
from functions import load_data, get_train_targets, get_test_data, prepare_submission
from sklearn.model_selection import train_test_split, RepeatedKFold

# Load everything once, then unpack the three objects returned by load_data().
datasets = load_data()
data_a, data_b, data_c = datasets

# Derive the training features and targets from the first dataset (data_a).
X_train_a, targets_a = get_train_targets(data_a)

In [2]:
# Show the column labels of the training features so the drop list defined
# in a later cell can be checked against them.
X_train_a.columns

Index(['time', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W',
       'cloud_base_agl:m', 'dew_or_rime:idx', 'dew_point_2m:K',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'elevation:m', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
       'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx',
       'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_azimuth:d',
       'sun_elevation:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_sp

In [3]:
# Columns removed from the feature frame before model search.
# Each label appears exactly once (the original list repeated
# 'wind_speed_u_10m:ms'); the order of the remaining labels is unchanged.
drop_cols = [
    'time',
    'date_calc',
    'elevation:m',
    'fresh_snow_1h:cm',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
]

In [12]:
# Feature matrix: remove the columns in drop_cols, then replace missing values with 0.
X = X_train_a.drop(columns=drop_cols).fillna(0)
y = targets_a

# 5-fold cross-validation repeated twice, with a fixed seed for the fold splits.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

# TPOT pipeline search scored by negative mean absolute error on the CV folds.
model = TPOTRegressor(
    generations=5,
    population_size=50,
    scoring='neg_mean_absolute_error',
    cv=cv,
    verbosity=2,
    random_state=1,
    n_jobs=-1,
)
model.fit(X, y)

# Write the best pipeline found to a standalone Python script.
model.export('tpot_locA_best_model.py')

                                                                               
                                                                               
TPOT closed during evaluation in one generation.
                                                                               
                                                                               
TPOT closed prematurely. Will use the current best pipeline.
                                                                               
Best pipeline: XGBRegressor(ExtraTreesRegressor(input_matrix, bootstrap=False, max_features=0.3, min_samples_leaf=1, min_samples_split=12, n_estimators=100), learning_rate=1.0, max_depth=10, min_child_weight=2, n_estimators=100, n_jobs=1, objective=reg:squarederror, subsample=0.7500000000000001, verbosity=0)


In [9]:
# Same preprocessing as the cross-validated run: drop the columns in drop_cols, fill NaN with 0.
features_a = X_train_a.drop(columns=drop_cols).fillna(0)

# Hold out 10% of the rows for scoring the fitted TPOT model.
X_train, X_test, y_train, y_test = train_test_split(
    features_a,
    targets_a,
    train_size=0.9,
    test_size=0.1,
    random_state=42,
)

# NOTE(review): no `scoring` is passed here, so TPOT falls back to its default
# metric, whereas the other TPOT run in this notebook uses
# 'neg_mean_absolute_error' - confirm the two runs are meant to be comparable.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

# Score on the held-out rows, then export the best pipeline as a script.
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)
tpot.export('tpot_loc_A_pipeline.py')

                                                                               
                                                                               
TPOT closed during evaluation in one generation.
                                                                               
                                                                               
TPOT closed prematurely. Will use the current best pipeline.
                                                                               

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# NOTE(review): the path and separator below look like unfilled template
# placeholders - replace them with the real data file and delimiter before running.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)

# Everything except the 'target' column is used as a feature.
features = tpot_data.drop(columns='target')
target = tpot_data['target']
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, target, random_state=42)

# Average CV score on the training set was: -10.812040755234403
polynomial_step = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
extra_trees_step = ExtraTreesRegressor(
    bootstrap=False,
    max_features=0.5,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=100,
)
exported_pipeline = make_pipeline(polynomial_step, extra_trees_step)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

# Fit on the training split and predict the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)