In [2]:
import os
import pickle
from time import time

import numpy as np
import pandas as pd
from dateutil.parser import parse
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
# enable_halving_search_cv must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import SVC

In [3]:
# Folder and file names of the pickled train/test datasets that the
# loading cell opens as f"{PROCESSED_DATA_FOLDER}/{<file name>}".
PROCESSED_DATA_FOLDER = "data/4_all_data_preprocessed"
TRAIN_DATASET_FILE = "train_dataset.pkl"
TEST_DATASET_FILE = "test_dataset.pkl"

# Directory into which tune_with_halving_grid_search writes one
# "<result_name>.csv" file per tuned model.
RESULT_DIR = "data/hyperparams_tune"

In [4]:
# Deserialize the preprocessed train and test sets from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only point these paths at dataset files from a trusted source.
train_pickle_path = f"{PROCESSED_DATA_FOLDER}/{TRAIN_DATASET_FILE}"
test_pickle_path = f"{PROCESSED_DATA_FOLDER}/{TEST_DATASET_FILE}"

with open(train_pickle_path, 'rb') as train_file:
    df_train = pickle.load(train_file)

with open(test_pickle_path, 'rb') as test_file:
    df_test = pickle.load(test_file)


In [5]:
# PREPARE TRAIN AND TEST FEATURES / LABELS
# df_train and df_test must already be loaded. If you have the datasets
# stored elsewhere, update the loading step before using the rest of the file.

# Columns removed from the feature matrices; `event_indicator` is the label,
# so it must never be left among the features.
NON_FEATURE_COLUMNS = [
    "isw_date_tomorrow_datetime",
    "day_datetime",
    "city_resolvedAddress",
    "event_indicator",
]

# Missing values are replaced with 0 in both splits before splitting off labels.
df_train = df_train.fillna(0)
train_y = df_train.event_indicator
train_x = df_train.drop(columns=NON_FEATURE_COLUMNS)

df_test = df_test.fillna(0)
test_y = df_test.event_indicator
test_x = df_test.drop(columns=NON_FEATURE_COLUMNS)

# NOTE(review): `y` used to be assigned twice (train labels, then test labels),
# so only the test-label value ever survived this cell. That final value is
# kept in case a later cell reads `y`; prefer train_y / test_y in new code.
y = test_y.astype(float)

In [6]:
def tune_with_halving_grid_search(x_train, y_train, param_grid, model, result_name):
    """Run a successive-halving grid search and save a ranked summary as CSV.

    The search uses ``HalvingGridSearchCV`` with 3-fold cross-validation,
    ``factor=3`` and ``min_resources='exhaust'``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed unchanged to ``fit``.
    param_grid : dict
        Candidate hyperparameter values for ``model``.
    model
        Unfitted estimator to tune.
    result_name : str
        Base name (without extension) of the CSV written under ``RESULT_DIR``.

    Returns
    -------
    results : pandas.DataFrame
        Columns ``iter``, ``rank_test_score``, ``mean_test_score`` and
        ``params``. ``mean_test_score`` is multiplied by 100 (a percentage
        when the underlying score is a fraction). Rows are sorted by
        iteration (latest first) and then by rank (best first), so the first
        row is the best candidate of the final iteration.
    duration : float
        Seconds elapsed while constructing and fitting the search; the time
        spent building and saving the summary is not included.
    """
    start = time()
    search = HalvingGridSearchCV(
        model,
        param_grid,
        cv=3,
        factor=3,
        min_resources='exhaust',
        verbose=10
    ).fit(x_train, y_train)
    duration = time() - start

    # Scale the score, keep the most relevant columns, and sort.
    results = (
        pd.DataFrame(search.cv_results_)
        .assign(mean_test_score=lambda frame: frame['mean_test_score'] * 100)
        .loc[:, ['iter', 'rank_test_score', 'mean_test_score', 'params']]
        .sort_values(by=['iter', 'rank_test_score'], ascending=[False, True])
    )

    # Create the output directory first: without it to_csv raises and the
    # result of a potentially very long search is lost.
    csv_path = f"{RESULT_DIR}/{result_name}.csv"
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    results.to_csv(csv_path)

    return results, duration

In [7]:
def perform_tune(param_grid, model, result_name, skip_rows=100):
    """Tune ``model`` on the notebook's training data and print the outcome.

    Reads the module-level ``train_x`` / ``train_y``, runs
    ``tune_with_halving_grid_search`` on them, then prints the head of the
    ranked results followed by the score and parameters of the top row.

    Parameters
    ----------
    param_grid : dict
        Candidate hyperparameter values for ``model``.
    model
        Unfitted estimator to tune.
    result_name : str
        Base name of the CSV file the search results are saved under.
    skip_rows : int, default 100
        Number of leading training rows left out of the search. The default
        reproduces the offset that was previously hard-coded.
        NOTE(review): the reason for skipping these rows is not visible in
        this notebook -- confirm it is intentional, or pass ``skip_rows=0``.
    """
    halving_results, halving_duration = tune_with_halving_grid_search(
        train_x[skip_rows:],
        train_y[skip_rows:],
        param_grid,
        model,
        result_name,
    )
    print(halving_results.head())

    # Results arrive sorted best-first, so row 0 holds the winning candidate.
    score = halving_results['mean_test_score'].iloc[0]
    params = halving_results['params'].iloc[0]

    print(f'Best score for HalvingGridSearchCv is {score:.3f}, took {halving_duration:.2f} seconds')
    print(f'Params: {params}')

In [None]:
# Logistic regression
# Searches every combination of solver and C; only the L2 penalty is tried.

model = LogisticRegression(max_iter=1000)

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param_grid = {
    'solver': solvers,
    'penalty': penalty,
    'C': c_values,
}

perform_tune(param_grid, model, 'LogisticRegression_Hyperparams')

In [None]:
# Support Vector Machines
# Searches every combination of kernel, C and tolerance; gamma stays at 'scale'.

model = SVC()

kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
tol = [1e-3, 1e-2]

param_grid = {
    'kernel': kernel,
    'C': C,
    'gamma': gamma,
    'tol': tol,
}

perform_tune(param_grid, model, 'SupportVectorMachine_Hyperparams')

In [None]:
# Stochastic Gradient Descent
# Searches every combination of penalty (including none), alpha and tolerance.
# NOTE(review): no random_state is set on the estimator or the search, so
# repeated runs may presumably produce different rankings -- consider a seed.
model = SGDClassifier()

penalty = [None, 'l2', 'l1', 'elasticnet']
alpha = [1e-4, 1e-3, 1e-2]
tol = [1e-3, 1e-2]

param_grid = {
    'penalty': penalty,
    'alpha': alpha,
    'tol': tol,
}

perform_tune(param_grid, model, 'StochasticGradientDescent_Hyperparams')

In [None]:
# Random Forest
# Searches every combination of forest size (n_estimators) and max_features.
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

# The grid must be bound to `param_grid`, the name handed to perform_tune.
# It was previously stored in `grid`, so the call silently reused the grid
# left over from whichever tuning cell had run before this one.
param_grid = dict(n_estimators=n_estimators, max_features=max_features)

perform_tune(param_grid, model, 'RandomForest_Hyperparams')