In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/clean_data/class-competition-not-one-hot-encoders.zip"

dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zip:
    for filename in zip.namelist():
        if filename.endswith(".csv"):
            with zip.open(filename) as f:
                dataframes.update({ filename : pd.read_csv(io.TextIOWrapper(f)) })

                # Lets take a look at the files
                print(f"FILE: {filename}")
                # If you want to see file info uncomment this:
                # display(dataframes[filename].info())
                # display(dataframes[filename].head())

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]


FILE: train.csv
FILE: test_public.csv


In [4]:
from sklearn.model_selection import train_test_split

ALL_FEATURES = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA',
 'POLYLINE', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C', 'YEAR_2013', 'YEAR_2014',
 'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
 'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
 'DAY_OF_WEEK_0', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3',
 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'HOUR_0', 'HOUR_1',
 'HOUR_2', 'HOUR_3', 'HOUR_4', 'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8',
 'HOUR_9', 'HOUR_10', 'HOUR_11', 'HOUR_12', 'HOUR_13', 'HOUR_14',
 'HOUR_15', 'HOUR_16', 'HOUR_17', 'HOUR_18', 'HOUR_19', 'HOUR_20',
 'HOUR_21', 'HOUR_22', 'HOUR_23']

# We could totally change this. Utilization of these just probably requires further preprocessing.
ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'POLYLINE']

train_data_sample = train_data.sample(frac=0.5, random_state=420) # frac is used to control percentage of train data used
X = train_data_sample.drop("TRAVEL_TIME", axis=1)
X = X.loc[:, ~X.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]
y = train_data_sample["TRAVEL_TIME"]

test_features = test_data.loc[:, ~test_data.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]

In [5]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

In [6]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=4305)

In [7]:
def rmse(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.square(np.subtract(y_true, y_pred))))

In [8]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [(9, 9), (8, 8), (7, 7), (6, 6), (5, 5), (4, 4), (3, 3), (9, 9, 9), (8, 8, 8), (7, 7, 7), (6, 6, 6), (5, 5, 5), (4, 4, 4), (3, 3, 3)],
              'activation': ['relu'],
              'solver': ['adam'],
              'nesterovs_momentum' : [True],
              'momentum' : [0.9, 0.8, 0.7, 0.6, 0.5],                          
              'learning_rate_init': [0.001, 0.01, 0.1, 0.3, 0.5, 1, 2],              
              'learning_rate' : ['adaptive'],
              'max_iter': [1000],
              'early_stopping': [False],
              'warm_start': [True]
             }

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
search_cv = RandomizedSearchCV(mlp, param_grid, scoring=make_scorer(rmse, greater_is_better=False),
                   cv=kf, verbose=3, pre_dispatch='2*n_jobs')

In [10]:
%%time
search_cv.fit(X, y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END activation=relu, early_stopping=False, hidden_layer_sizes=(8, 8), learning_rate=adaptive, learning_rate_init=0.1, max_iter=1000, momentum=0.6, nesterovs_momentum=True, solver=adam, warm_start=True;, score=-704.595 total time=  46.6s
[CV 2/3] END activation=relu, early_stopping=False, hidden_layer_sizes=(8, 8), learning_rate=adaptive, learning_rate_init=0.1, max_iter=1000, momentum=0.6, nesterovs_momentum=True, solver=adam, warm_start=True;, score=-678.233 total time=  54.1s
[CV 3/3] END activation=relu, early_stopping=False, hidden_layer_sizes=(8, 8), learning_rate=adaptive, learning_rate_init=0.1, max_iter=1000, momentum=0.6, nesterovs_momentum=True, solver=adam, warm_start=True;, score=-666.100 total time=  42.9s
[CV 1/3] END activation=relu, early_stopping=False, hidden_layer_sizes=(5, 5), learning_rate=adaptive, learning_rate_init=0.001, max_iter=1000, momentum=0.7, nesterovs_momentum=True, solver=adam, warm_

In [11]:
results = pd.DataFrame(search_cv.cv_results_).sort_values(by='mean_test_score', ascending=False)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_warm_start,param_solver,param_nesterovs_momentum,param_momentum,param_max_iter,param_learning_rate_init,...,param_hidden_layer_sizes,param_early_stopping,param_activation,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,138.049962,29.190449,0.036569,0.006003,True,adam,True,0.9,1000,0.001,...,"(7, 7)",False,relu,"{'warm_start': True, 'solver': 'adam', 'nester...",-704.58795,-675.659648,-660.939764,-680.395787,18.131268,1
5,192.180129,18.198306,0.046889,0.003735,True,adam,True,0.6,1000,0.001,...,"(8, 8)",False,relu,"{'warm_start': True, 'solver': 'adam', 'nester...",-704.408196,-675.598737,-661.384284,-680.463739,17.898146,2
1,99.057625,30.456161,0.036901,0.000814,True,adam,True,0.7,1000,0.001,...,"(5, 5)",False,relu,"{'warm_start': True, 'solver': 'adam', 'nester...",-706.605173,-678.413609,-663.694409,-682.904397,17.803722,3
0,47.912474,4.651382,0.068147,0.006927,True,adam,True,0.6,1000,0.1,...,"(8, 8)",False,relu,"{'warm_start': True, 'solver': 'adam', 'nester...",-704.594596,-678.233449,-666.100275,-682.976107,16.069076,4
4,43.755925,10.653337,0.049201,0.006926,True,adam,True,0.8,1000,0.1,...,"(6, 6)",False,relu,"{'warm_start': True, 'solver': 'adam', 'nester...",-707.400066,-679.335551,-662.605078,-683.113565,18.481573,5


In [12]:
search_cv.best_estimator_

In [13]:
scaler = preprocessing.StandardScaler().fit(test_features)
test_features = scaler.transform(test_features)
y_pred = search_cv.best_estimator_.predict(test_features)

In [14]:
from cnn_utils import test_prediction_to_csv

test_prediction_to_csv(y_pred, "predicting_cnn_sklearn.csv", test_data)