In [1]:
import zipfile
import pandas as pd
import numpy as np

# data_preparation.ipynb created train.zip which has train.csv inside
zipped_data_path = "../data/clean_data/class-competition.zip"
train_csv = "train.csv"
test_csv = "test_public.csv"

with zipfile.ZipFile(zipped_data_path, "r") as zip:
    train_data = pd.read_csv(zip.open(train_csv))
    test_data = pd.read_csv(zip.open(test_csv))

In [2]:
from sklearn.model_selection import train_test_split

ALL_FEATURES = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA',
 'POLYLINE', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C', 'YEAR_2013', 'YEAR_2014',
 'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
 'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
 'DAY_OF_WEEK_0', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3',
 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'HOUR_0', 'HOUR_1',
 'HOUR_2', 'HOUR_3', 'HOUR_4', 'HOUR_5', 'HOUR_6', 'HOUR_7', 'HOUR_8',
 'HOUR_9', 'HOUR_10', 'HOUR_11', 'HOUR_12', 'HOUR_13', 'HOUR_14',
 'HOUR_15', 'HOUR_16', 'HOUR_17', 'HOUR_18', 'HOUR_19', 'HOUR_20',
 'HOUR_21', 'HOUR_22', 'HOUR_23']

# We could totally change this. Utilization of these just probably requires further preprocessing.
ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'POLYLINE']

X = train_data.drop("TRAVEL_TIME", axis=1)
X = X.loc[:, ~X.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]
y = train_data["TRAVEL_TIME"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)


# The test_data needs to have the proper features w/o TRIP_ID for estimation
# test_features -> features for estimation
# test_data -> whole dataframe including TRIP_ID for predction csv
test_features = test_data.loc[:, ~test_data.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]


In [3]:
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

# This is gonna run for a really long time unless we operate on a sample 
def train_test_split_sample(frac, random_state=420):
    # TODO: We should use stratified sampling here. Would help get more representative samples.
    train_data_sample = train_data.sample(frac=frac, random_state=random_state) # frac is used to control percentage of train data used
    X_sample = train_data_sample.drop("TRAVEL_TIME", axis=1)
    X_sample = X_sample.loc[:, ~X_sample.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]
    y_sample = train_data_sample["TRAVEL_TIME"]

    X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=420)
    return X_train_sample, X_test_sample, y_train_sample, y_test_sample

### Baseline: predicting average

In [4]:
# Establish a baseline for model comparison. What if we always predict the average travel time?
avg_travel_time = y_train.mean()
y_pred = [ avg_travel_time for i in range(len(X_test)) ]

root_mean_squared_error(y_test, y_pred)

692.1182957123148

#### Reviewing Baseline:
* RMSE = 678.8930754443071
* the most naive approach

In [5]:
def test_prediction_to_csv(y_pred, outfile_name):
    
	output_df = pd.DataFrame(test_data["TRIP_ID"])
	output_df["TRAVEL_TIME"] = y_pred
	output_df.head()

	output_df.to_csv(f'../guesses/{outfile_name}', index=False)

test_prediction_to_csv(avg_travel_time, "predicting_average_travel_time.csv")

### Linear Regression

In [69]:
# Running a linear regression
from sklearn import linear_model

lreg = linear_model.LinearRegression()
lreg.fit(X_train, y_train)
y_pred = lreg.predict(X_test)

root_mean_squared_error(y_test, y_pred)

684.066273563164

In [71]:
y_pred = lreg.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_linear_regression.csv")

[4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405011e+11 4.79405012e+11 4.79405011e+11 4.79405012e+11
 4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405012e+11
 4.79405012e+11 4.79405012e+11 4.79405012e+11 4.79405012e+11
 4.79405012e+11 4.79405012e+11 4.79405011e+11 4.79405012e+11
 4.79405011e+11 4.79405012e+11 4.79405012e+11 4.79405011e+11
 4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405012e+11 4.79405012e+11 4.79405012e+11 4.79405012e+11
 4.79405012e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405011e+11 4.79405012e+11 4.79405012e+11 4.79405011e+11
 4.79405011e+11 4.79405011e+11 4.79405012e+11 4.79405012e+11
 4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405011e+11 4.79405011e+11 4.79405011e+11 4.79405011e+11
 4.79405011e+11 4.79405011e+11 4.79405012e+11 4.79405012e+11
 4.79405011e+11 4.79405012e+11 4.79405012e+11 4.79405012e+11
 4.79405011e+11 4.794050

In [8]:
# Lasso
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

rmse = make_scorer(lambda y_true, y_pred: root_mean_squared_error(y_true, y_pred), greater_is_better=False)
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.01)

lasso = Lasso()

param_grid = {
    'alpha': np.logspace(-5, 2, 50)
}

search = GridSearchCV(lasso, param_grid, n_jobs = -1, scoring=rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

Best CV score: 763.585956808071
{'alpha': 0.5179474679231213}


In [73]:
# Test Lasso
lasso = Lasso(alpha = 0.5179474679231213)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

root_mean_squared_error(y_test, y_pred)

684.2282776160345

In [74]:
y_pred = lasso.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_lasso_regression.csv")

In [75]:
# Ridge
from sklearn.linear_model import Ridge

lasso = Ridge()

param_grid = {
    'alpha': np.logspace(-5, 2, 50)
}

search = GridSearchCV(lasso, param_grid, n_jobs = -1, scoring=rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

Best CV score: 763.4839170992379
{'alpha': 100.0}


In [76]:
# Test Ridge
ridge = Ridge(alpha = 100)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

root_mean_squared_error(y_test, y_pred)

684.0679267582618

In [77]:
y_pred = ridge.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_ridge_regression.csv")

#### Reviewing Linear Regression:
* Linear Regression: 671.0856201370245
* Lasso Regression: 671.1443087427473
* Ridge Regression: 671.0895985241863

### Support Vector Machines (SVM)

In [81]:
# !!!! SKIP THIS IF YOU DON'T WANT TO WAIT A LONG TIME !!!!

# Running a SVM. Using 0.1% of original data runs in ~2mins on my laptop, 0.5% runs in ~23mins.
# This can be skipped unless you really want to rerun it. 0.5% yielded:
# Best CV score: 600.6364736351106
# {'C': 100.0, 'kernel': 'linear'}
# so just using this for now.


from sklearn.svm import SVR

X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.01, random_state=500)

svm = SVR()

param_grid = {
    'C': np.logspace(0, 2, 25),
    'kernel': ['rbf', 'sigmoid', 'linear']
}

search = GridSearchCV(svm, param_grid, n_jobs = -1, scoring=rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

svm_poly = SVR(kernel = 'poly')

param_grid = {
    'C': np.logspace(0, 2, 25),
    'degree': np.arange(1, 4)
}

search = GridSearchCV(svm_poly, param_grid, n_jobs = -1, scoring=rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

Best CV score: 674.4944260890286
{'C': 68.12920690579611, 'kernel': 'rbf'}
Best CV score: 674.2583457807992
{'C': 68.12920690579611, 'degree': 2}


In [6]:
# Test the SVM on 10% of data (8% train/2% validation)
from sklearn.svm import SVR
svm = SVR(kernel = 'poly', C=68.12920690579611, degree=2)
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.1, random_state=500)
svm.fit(X_train_sample, y_train_sample)
y_pred = svm.predict(X_test_sample)

root_mean_squared_error(y_test_sample, y_pred)

683.8392104356353

In [7]:
y_pred = svm.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_svm_poly_kernel_ten_percent.csv")

In [13]:
# That was good but does it generalize to another larger sample? - Yes, it does!
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.015)

svm = SVR(kernel = 'linear', C=100)

svm.fit(X_train_sample, y_train_sample)
y_pred = svm.predict(X_test_sample)

root_mean_squared_error(y_test_sample, y_pred)

804.1847346133947

#### Reviewing SVM:
* SVM did well on 0.5% of data w/ RSME = 738.95
* seemed to perform worse with more data (1.5%) w/ RSME = 804.18
* could try some other sample sizes on not my shitty laptop
* UPDATE: After trying on my PC, was able to get higher train set which yielded better performance

### Bagging

In [11]:
# Bagging w/ Decision Trees
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeClassifier

X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.1)

bag = BaggingRegressor(estimator = DecisionTreeClassifier(), random_state = 420)

param_grid = {
    'n_estimators': list(range(1,20))
}

search = GridSearchCV(bag, param_grid, n_jobs = -1, scoring = rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

Best CV score: 696.7578957269487
{'n_estimators': 19}


In [12]:
# Test the Bagging Regressor
bag = BaggingRegressor(estimator = DecisionTreeClassifier(), random_state = 420, n_estimators=19)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)

root_mean_squared_error(y_test, y_pred)

754.9926609619301

In [13]:
y_pred = bag.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_bagging_regressor_19_estimators.csv")

#### Reviewing Bagging:
* RSME = 755.15 on all training data

### Random Forest

In [9]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split_sample(frac=0.01)

rf = RandomForestRegressor(random_state=420)

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

search = RandomizedSearchCV(rf, param_grid, n_jobs = -1, scoring = rmse)
search.fit(X_train_sample, y_train_sample)

print(f'Best CV score: {-1 * search.best_score_}')
print(search.best_params_)

Best CV score: 766.9098843101626
{'n_estimators': 1200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': True}


In [10]:
# Test the Random Forest
rf = RandomForestRegressor(random_state=420, max_features='sqrt', n_estimators = 1200, min_samples_leaf=4, min_samples_split=10, max_depth=60, bootstrap=True)

rf.fit(X_train_sample, y_train_sample)
y_pred = rf.predict(X_test_sample)

root_mean_squared_error(y_test_sample, y_pred)

738.5281634010671

In [12]:
# THIS ONE TAKES A LONG TIME TOO !!!!
rf = RandomForestRegressor(random_state=420, max_features='sqrt', n_estimators = 1200, min_samples_leaf=4, min_samples_split=10, max_depth=60, bootstrap=True, n_jobs=-1)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

root_mean_squared_error(y_test, y_pred)

681.5772344495936

In [None]:
# All train data
rf = RandomForestRegressor(random_state=420, max_features='sqrt', n_estimators = 1200, min_samples_leaf=4, min_samples_split=10, max_depth=60, bootstrap=True, n_jobs=-1)

rf.fit(X, y)

In [13]:
y_pred = rf.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_random_forest.csv")

#### Reviewing Random Forest:
* RSME = 668.6428710752018 on all training data
* solid baseline to try and beat with deep learning