In [81]:
# Here we directly train an ExtraTreeRegressor directly
# since automl module is not avialable publicly

%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score
# import automl


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
# Prepare dataset for training

df = pd.read_csv("best_rho_GW_train_test.csv", index_col=0)
# create features
df["neps"] = 1/(df["n"]*df["eps"])
df["sqrtneps"] = (1/(df["n"]*df["eps"])).apply(np.sqrt)
city_features = pd.read_csv("city_features_fromGW.csv", index_col=0)
data = df.join(city_features.set_index('city'), on="city")

# first 45 values are the train set, use rest as test
data_train = data.iloc[:45,:]
data_test = data.iloc[45:,:]
print(len(data_train), len(data_test))
data_train.head(2)

45 75


Unnamed: 0,city,n,eps,best,neps,sqrtneps,hot10,entropy_512
0,gowalla_Cook,98218,0.05,0.03,0.000204,0.01427,679,7.340522
1,gowalla_Cook,98218,0.1,0.025,0.000102,0.01009,679,7.340522


In [83]:
y_train = data_train['best']
X_train = data_train.loc[:, data.columns != 'best']
print('Training set size', len(y_train), len(X_train))

X_test = data_test.loc[:, data.columns != 'best']
print('Testing set size', len(X_test))

# keep only the required features in the train set
feature_set = ['n','eps','neps','sqrtneps', 'entropy_512']
X_train = X_train[X_train.columns.intersection(feature_set)]
X_test = X_test[X_test.columns.intersection(feature_set)]


print(X_train.head(2))
print(X_test.head(2))

Training set size 45 45
Testing set size 75
       n   eps      neps  sqrtneps  entropy_512
0  98218  0.05  0.000204   0.01427     7.340522
1  98218  0.10  0.000102   0.01009     7.340522
        n   eps      neps  sqrtneps  entropy_512
45  54792  0.05  0.000365  0.019105     6.171421
46  54792  0.10  0.000183  0.013510     6.171421


In [84]:
# define the model with hyperparameter determine via automl
model = ExtraTreesRegressor(n_estimators=250, max_features= 0.777777778, min_samples_leaf= 0.000625, min_samples_split= 0.00125)
# Create cross-validation folds
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=7)
# Evaluate mae by cross-validation
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# report approximate model performance
print('MAE: %.5f (%.5f)' % (mean(n_scores), std(n_scores)))
# fit model
model.fit(X_train, y_train)

MAE: -0.00353 (0.00161)


ExtraTreesRegressor(max_features=0.777777778, min_samples_leaf=0.000625,
                    min_samples_split=0.00125, n_estimators=250)

In [85]:
# save model for later use
import gzip
import pickle
with gzip.open('ParamSelect_trained_model.pklz', 'wb') as f:
    pickle.dump(model, f)

In [86]:
# If we want to load model and make new predictions
import gzip
import pickle
with gzip.open('ParamSelect_trained_model.pklz', 'r') as f:
    est1 = pickle.load(f)
y_pred = est1.predict(X_test)
X_test_print = X_test.copy()
X_test_print['predicted_vals'] = y_pred
X_test_print.head(5)

Unnamed: 0,n,eps,neps,sqrtneps,entropy_512,predicted_vals
45,54792,0.05,0.000365,0.019105,6.171421,0.0307
46,54792,0.1,0.000183,0.01351,6.171421,0.02504
47,54792,0.2,9.1e-05,0.009553,6.171421,0.0225
48,54792,0.4,4.6e-05,0.006755,6.171421,0.0204
49,54792,0.8,2.3e-05,0.004776,6.171421,0.016404
