In [33]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Cleaned competition archive, addressed relative to this notebook's working directory.
zipped_data_path = "../data/clean_data/class-competition-cleaned.zip"

# Maps each CSV member name to its parsed frame.
# NOTE: because this is a defaultdict, looking up a name that was never loaded
# returns an empty DataFrame instead of raising KeyError.
dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zipped:
    for filename in zipped.namelist():
        # Only CSV members are parsed; every other archive entry is skipped.
        if not filename.endswith(".csv"):
            continue

        with zipped.open(filename) as f:
            dataframes[filename] = pd.read_csv(io.TextIOWrapper(f))

        # Report each file as it is loaded.
        print(f"FILE: {filename}")
        # To inspect a file, uncomment:
        # display(dataframes[filename].info())
        # display(dataframes[filename].head())

# Convenient handles for the three per-call-type training sets and the public test set.
train_data_A = dataframes["train_call_type_A.csv"]
train_data_B = dataframes["train_call_type_B.csv"]
train_data_C = dataframes["train_call_type_C.csv"]
test_data  = dataframes["test_public.csv"]

FILE: train_call_type_A.csv
FILE: train_call_type_B.csv
FILE: train_call_type_C.csv
FILE: test_public.csv


In [34]:
# Every column name listed for the cleaned data files.
ALL_FEATURES = [
    'TRIP_ID',
    'CALL_TYPE',
    'ORIGIN_CALL',
    'ORIGIN_STAND',
    'TAXI_ID',
    'POLYLINE',
    'TRAVEL_TIME',
    'START_LOCATION',
    'MON_sin',
    'MON_cos',
    'DAY_sin',
    'DAY_cos',
    'HR_sin',
    'HR_cos',
    'WK_sin',
    'WK_cos',
    'YR_2013',
]

# Subset used for the call-type-B (taxi stand) model. TRAVEL_TIME is the target
# and is split off from the inputs later in the notebook.
B_FEATURES = [
    'ORIGIN_STAND',
    'TAXI_ID',
    'TRAVEL_TIME',
    'START_LOCATION',
    'MON_sin',
    'MON_cos',
    'DAY_sin',
    'DAY_cos',
    'HR_sin',
    'HR_cos',
    'WK_sin',
    'WK_cos',
    'YR_2013',
]

In [35]:
# Keep only the columns named in B_FEATURES. A boolean column mask keeps the
# source column order and simply ignores names a frame does not have.
TRAIN_DF_B = train_data_B.loc[:, lambda frame: frame.columns.isin(B_FEATURES)]
test_features = test_data.loc[:, lambda frame: frame.columns.isin(B_FEATURES)]
TRAIN_DF_B.head()

Unnamed: 0,ORIGIN_STAND,TAXI_ID,TRAVEL_TIME,START_LOCATION,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013
0,7.0,20000596,270,"41.1599801853,-8.64198392478",1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
1,28.0,20000403,960,"41.1632224305,-8.58404677278",1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
2,38.0,20000309,285,"41.1607148883,-8.60424608207",1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
3,52.0,20000178,255,"41.1549650972,-8.61321698848",1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
4,23.0,20000686,285,"41.1460158298,-8.61257471887",1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True


In [36]:
# Let's split START_LOCATION (a "latitude,longitude" string) into separate latitude and longitude values
def extract_longitude_latitude(row, longitude=True):
    """Parse one coordinate out of a comma-separated location string.

    Args:
        row: Text of the form "<first>,<second>"; in this notebook's data the
            first field is the latitude and the second the longitude.
        longitude: Used directly as the list index into the split fields, so
            True selects field 1 (longitude) and False selects field 0
            (latitude).

    Returns:
        The selected field converted to ``np.float64``.
    """
    fields = row.split(",")
    # bool is an int subclass, so the flag doubles as the position to read.
    chosen = fields[longitude]
    return np.float64(chosen)

def extract_longitude(row):
    """Return the longitude (second comma-separated field) of a location string."""
    return extract_longitude_latitude(row, True)

def extract_latitude(row):
    """Return the latitude (first comma-separated field) of a location string."""
    return extract_longitude_latitude(row, False)

# Replace the textual START_LOCATION column with numeric START_LAT / START_LON
# columns. assign() works on a copy, so the frame selected earlier is not
# written through.
TRAIN_DF_B = (
    TRAIN_DF_B
    .assign(
        START_LAT=lambda frame: frame["START_LOCATION"].apply(extract_latitude),
        START_LON=lambda frame: frame["START_LOCATION"].apply(extract_longitude),
    )
    .drop(columns=["START_LOCATION"])
)
TRAIN_DF_B.head()

Unnamed: 0,ORIGIN_STAND,TAXI_ID,TRAVEL_TIME,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013,START_LAT,START_LON
0,7.0,20000596,270,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True,41.15998,-8.641984
1,28.0,20000403,960,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True,41.163222,-8.584047
2,38.0,20000309,285,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True,41.160715,-8.604246
3,52.0,20000178,255,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True,41.154965,-8.613217
4,23.0,20000686,285,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True,41.146016,-8.612575


In [37]:
# Draw a 10% subsample for hyperparameter tuning. The seed is fixed so that the
# same rows (and therefore the same grid-search result) come back on every
# Restart & Run All.
X_sample = TRAIN_DF_B.sample(frac=0.1, random_state=42)

# Split the target off from the inputs.
y_sample = X_sample["TRAVEL_TIME"]
X_sample = X_sample.drop(['TRAVEL_TIME'], axis=1)
X_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72103 entries, 601122 to 196381
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ORIGIN_STAND  72103 non-null  float64
 1   TAXI_ID       72103 non-null  int64  
 2   MON_sin       72103 non-null  float64
 3   MON_cos       72103 non-null  float64
 4   DAY_sin       72103 non-null  float64
 5   DAY_cos       72103 non-null  float64
 6   HR_sin        72103 non-null  float64
 7   HR_cos        72103 non-null  float64
 8   WK_sin        72103 non-null  float64
 9   WK_cos        72103 non-null  float64
 10  YR_2013       72103 non-null  bool   
 11  START_LAT     72103 non-null  float64
 12  START_LON     72103 non-null  float64
dtypes: bool(1), float64(11), int64(1)
memory usage: 7.2 MB


In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Computed with NumPy rather than ``mean_squared_error(..., squared=False)``:
    the ``squared`` argument was deprecated in scikit-learn 1.4 and removed in
    1.6, so the old call fails on current releases.

    Args:
        y_true: Array-like of shape (n_samples,) or (n_samples, n_outputs).
        y_pred: Array-like of predictions with the same number of samples and
            outputs as ``y_true``.

    Returns:
        float: The RMSE. With several outputs, the RMSE of each output column
        is computed and the column values are averaged.

    Raises:
        ValueError: If either input is empty or the two shapes disagree.
    """
    actual = np.atleast_1d(np.asarray(y_true, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))
    if actual.size == 0 or predicted.size == 0:
        raise ValueError("root_mean_squared_error requires at least one sample")

    # View both inputs as (n_samples, n_outputs) so that a 1-D vector and a
    # single-column matrix compare element-wise instead of broadcasting.
    actual = actual.reshape(actual.shape[0], -1)
    predicted = predicted.reshape(predicted.shape[0], -1)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: {actual.shape} vs {predicted.shape}"
        )

    per_output_rmse = np.sqrt(np.mean(np.square(actual - predicted), axis=0))
    return float(np.mean(per_output_rmse))

# RMSE scorer for model selection. greater_is_better=False makes scikit-learn
# negate the value, so the reported "best score" is a negative RMSE.
rmse = make_scorer(root_mean_squared_error, greater_is_better=False)

# Base regressor to tune (the variable keeps its historical name `classifier`).
# Seeded so that ties between equally good splits are resolved the same way
# on every run.
classifier = DecisionTreeRegressor(random_state=42)

# Define the hyperparameters and their potential values
hyperparameters = {
    'criterion': ['friedman_mse', 'absolute_error', 'squared_error'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search = GridSearchCV(classifier, hyperparameters, cv=5, n_jobs=-1, verbose=4, scoring=rmse)
grid_search.fit(X_sample, y_sample)

# Print the best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'criterion': 'friedman_mse', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best Score: -343.21883066768544


In [38]:
# Rebuild the call-type-B design matrix from the full training frame.
X = train_data_B.loc[:, train_data_B.columns.isin(B_FEATURES)]

# TRAVEL_TIME is the regression target.
y = X["TRAVEL_TIME"]

# Remove the target, then swap START_LOCATION for numeric latitude/longitude columns.
X = (
    X.drop(columns=["TRAVEL_TIME"])
    .assign(
        START_LAT=lambda frame: frame["START_LOCATION"].apply(extract_latitude),
        START_LON=lambda frame: frame["START_LOCATION"].apply(extract_longitude),
    )
    .drop(columns=["START_LOCATION"])
)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 721030 entries, 0 to 721029
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ORIGIN_STAND  721030 non-null  float64
 1   TAXI_ID       721030 non-null  int64  
 2   MON_sin       721030 non-null  float64
 3   MON_cos       721030 non-null  float64
 4   DAY_sin       721030 non-null  float64
 5   DAY_cos       721030 non-null  float64
 6   HR_sin        721030 non-null  float64
 7   HR_cos        721030 non-null  float64
 8   WK_sin        721030 non-null  float64
 9   WK_cos        721030 non-null  float64
 10  YR_2013       721030 non-null  bool   
 11  START_LAT     721030 non-null  float64
 12  START_LON     721030 non-null  float64
dtypes: bool(1), float64(11), int64(1)
memory usage: 66.7 MB


In [39]:
# Test rows that have a taxi stand recorded are the ones scored by the stand
# (call type B) model. They get the same latitude/longitude expansion as the
# training frame; assign() operates on a copy of the selected rows.
test_data_B = (
    test_features.loc[test_features["ORIGIN_STAND"].notna()]
    .assign(
        START_LAT=lambda frame: frame["START_LOCATION"].apply(extract_latitude),
        START_LON=lambda frame: frame["START_LOCATION"].apply(extract_longitude),
    )
    .drop(columns=["START_LOCATION"])
)

In [40]:
from sklearn.tree import DecisionTreeRegressor
# Final stand (call type B) model, using the hyperparameters reported by the
# grid search. random_state is pinned so that ties between equally good splits
# are resolved identically on every run.
dtreg = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=42,
)
dtreg.fit(X, y)

In [41]:
# Start every test row at a constant fallback value.
# NOTE(review): 716.339198028831 is presumably the mean training TRAVEL_TIME — confirm.
predictions = np.full(shape=len(test_features), fill_value=716.339198028831)

# Boolean mask over test_features marking the rows that are in test_data_B.
indices = test_features.index.isin(test_data_B.index)

# Score those rows with the stand model and write the results into their slots.
y_pred = dtreg.predict(test_data_B)
predictions[indices] = y_pred

In [43]:
from sklearn.tree import export_text

# Render the fitted stand model as indented if/else text, labelling each split
# with the training column it tests.
tree_rules = export_text(
    dtreg,
    feature_names=[
        'ORIGIN_STAND',
        'TAXI_ID',
        'MON_sin',
        'MON_cos',
        'DAY_sin',
        'DAY_cos',
        'HR_sin',
        'HR_cos',
        'WK_sin',
        'WK_cos',
        'YR_2013',
        'START_LAT',
        'START_LON',
    ],
)
print("Splits:", tree_rules, sep="\n")

Splits:
|--- HR_sin <= -0.13
|   |--- START_LAT <= 41.15
|   |   |--- ORIGIN_STAND <= 16.50
|   |   |   |--- TAXI_ID <= 20000903.00
|   |   |   |   |--- HR_cos <= -0.79
|   |   |   |   |   |--- value: [658.77]
|   |   |   |   |--- HR_cos >  -0.79
|   |   |   |   |   |--- value: [620.56]
|   |   |   |--- TAXI_ID >  20000903.00
|   |   |   |   |--- HR_cos <= -0.12
|   |   |   |   |   |--- value: [240.00]
|   |   |   |   |--- HR_cos >  -0.12
|   |   |   |   |   |--- value: [1350.55]
|   |   |--- ORIGIN_STAND >  16.50
|   |   |   |--- START_LON <= -8.62
|   |   |   |   |--- ORIGIN_STAND <= 21.50
|   |   |   |   |   |--- value: [577.98]
|   |   |   |   |--- ORIGIN_STAND >  21.50
|   |   |   |   |   |--- value: [631.54]
|   |   |   |--- START_LON >  -8.62
|   |   |   |   |--- WK_cos <= 0.20
|   |   |   |   |   |--- value: [563.11]
|   |   |   |   |--- WK_cos >  0.20
|   |   |   |   |   |--- value: [524.42]
|   |--- START_LAT >  41.15
|   |   |--- START_LAT <= 41.18
|   |   |   |--- START_LON

In [45]:
# Predicting for the rest of the data: trips with no taxi stand recorded.
TRAIN_DF_A_C = pd.concat([train_data_A, train_data_C])

# Keep only rows without an ORIGIN_STAND, in both the training and test data.
TRAIN_DF_A_C = TRAIN_DF_A_C[TRAIN_DF_A_C["ORIGIN_STAND"].isna()]
test_data_A_C = test_data[test_data["ORIGIN_STAND"].isna()]

# I just want to train on a couple features
A_FEATURES = ['CALL_TYPE', 'TAXI_ID', 'TRAVEL_TIME', 'MON_sin', 'MON_cos', 'DAY_sin', 'DAY_cos', 'HR_sin', 'HR_cos', 'WK_sin', 'WK_cos', 'YR_2013']

# .copy() gives each frame its own data, so the CALL_TYPE assignments below
# modify these frames directly rather than a slice of the source frames
# (avoids pandas' SettingWithCopyWarning).
TRAIN_DF_A_C = TRAIN_DF_A_C.loc[:, TRAIN_DF_A_C.columns.isin(A_FEATURES)].copy()
test_data_A_C = test_data_A_C.loc[:, test_data_A_C.columns.isin(A_FEATURES)].copy()

# Encode CALL_TYPE as a boolean: True for "A", False for every other value.
TRAIN_DF_A_C["CALL_TYPE"] = TRAIN_DF_A_C["CALL_TYPE"] == "A"
test_data_A_C["CALL_TYPE"] = test_data_A_C["CALL_TYPE"] == "A"
TRAIN_DF_A_C.info()

<class 'pandas.core.frame.DataFrame'>
Index: 813045 entries, 0 to 466578
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   CALL_TYPE    813045 non-null  bool   
 1   TAXI_ID      813045 non-null  int64  
 2   TRAVEL_TIME  813045 non-null  int64  
 3   MON_sin      813045 non-null  float64
 4   MON_cos      813045 non-null  float64
 5   DAY_sin      813045 non-null  float64
 6   DAY_cos      813045 non-null  float64
 7   HR_sin       813045 non-null  float64
 8   HR_cos       813045 non-null  float64
 9   WK_sin       813045 non-null  float64
 10  WK_cos       813045 non-null  float64
 11  YR_2013      813045 non-null  bool   
dtypes: bool(2), float64(8), int64(2)
memory usage: 69.8 MB


In [46]:
# Draw a 10% subsample of the A/C training rows for hyperparameter tuning.
# The seed is fixed so that the same rows (and therefore the same grid-search
# result) come back on every Restart & Run All.
X_sample = TRAIN_DF_A_C.sample(frac=0.1, random_state=42)

# Split the target off from the inputs.
y_sample = X_sample["TRAVEL_TIME"]
X_sample = X_sample.drop(['TRAVEL_TIME'], axis=1)
X_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81304 entries, 62277 to 286743
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CALL_TYPE  81304 non-null  bool   
 1   TAXI_ID    81304 non-null  int64  
 2   MON_sin    81304 non-null  float64
 3   MON_cos    81304 non-null  float64
 4   DAY_sin    81304 non-null  float64
 5   DAY_cos    81304 non-null  float64
 6   HR_sin     81304 non-null  float64
 7   HR_cos     81304 non-null  float64
 8   WK_sin     81304 non-null  float64
 9   WK_cos     81304 non-null  float64
 10  YR_2013    81304 non-null  bool   
dtypes: bool(2), float64(8), int64(1)
memory usage: 6.4 MB


In [30]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Computed with NumPy rather than ``mean_squared_error(..., squared=False)``:
    the ``squared`` argument was deprecated in scikit-learn 1.4 and removed in
    1.6, so the old call fails on current releases.

    Args:
        y_true: Array-like of shape (n_samples,) or (n_samples, n_outputs).
        y_pred: Array-like of predictions with the same number of samples and
            outputs as ``y_true``.

    Returns:
        float: The RMSE. With several outputs, the RMSE of each output column
        is computed and the column values are averaged.

    Raises:
        ValueError: If either input is empty or the two shapes disagree.
    """
    actual = np.atleast_1d(np.asarray(y_true, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))
    if actual.size == 0 or predicted.size == 0:
        raise ValueError("root_mean_squared_error requires at least one sample")

    # View both inputs as (n_samples, n_outputs) so that a 1-D vector and a
    # single-column matrix compare element-wise instead of broadcasting.
    actual = actual.reshape(actual.shape[0], -1)
    predicted = predicted.reshape(predicted.shape[0], -1)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: {actual.shape} vs {predicted.shape}"
        )

    per_output_rmse = np.sqrt(np.mean(np.square(actual - predicted), axis=0))
    return float(np.mean(per_output_rmse))

# RMSE scorer for model selection. greater_is_better=False makes scikit-learn
# negate the value, so the reported "best score" is a negative RMSE.
rmse = make_scorer(root_mean_squared_error, greater_is_better=False)

# Base regressor to tune (the variable keeps its historical name `classifier`).
# Seeded so that ties between equally good splits are resolved the same way
# on every run.
classifier = DecisionTreeRegressor(random_state=42)

# Define the hyperparameters and their potential values
hyperparameters = {
    'criterion': ['friedman_mse', 'absolute_error', 'squared_error'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3]
}

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search = GridSearchCV(classifier, hyperparameters, cv=5, n_jobs=-1, verbose=4, scoring=rmse)
grid_search.fit(X_sample, y_sample)

# Print the best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'criterion': 'friedman_mse', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: -424.7584921875058


In [47]:
# Work on a private copy so TRAIN_DF_A_C itself keeps its TRAVEL_TIME column.
X = TRAIN_DF_A_C.copy()

# pop() removes the target column from X and hands it back as the label series.
y = X.pop("TRAVEL_TIME")
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 813045 entries, 0 to 466578
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   CALL_TYPE  813045 non-null  bool   
 1   TAXI_ID    813045 non-null  int64  
 2   MON_sin    813045 non-null  float64
 3   MON_cos    813045 non-null  float64
 4   DAY_sin    813045 non-null  float64
 5   DAY_cos    813045 non-null  float64
 6   HR_sin     813045 non-null  float64
 7   HR_cos     813045 non-null  float64
 8   WK_sin     813045 non-null  float64
 9   WK_cos     813045 non-null  float64
 10  YR_2013    813045 non-null  bool   
dtypes: bool(2), float64(8), int64(1)
memory usage: 63.6 MB


In [48]:
# Final model for trips without a taxi stand, using the hyperparameters
# reported by the grid search. This rebinds `dtreg`, replacing the stand model
# fitted earlier. random_state is pinned so that ties between equally good
# splits are resolved identically on every run.
dtreg = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
dtreg.fit(X, y)

In [49]:
# Boolean mask over test_features marking the rows that are in test_data_A_C.
indices = test_features.index.isin(test_data_A_C.index)

# Score those rows with the A/C model and overwrite their slots in the shared
# predictions array.
y_pred = dtreg.predict(test_data_A_C)
predictions[indices] = y_pred

In [50]:
# Display the combined predictions array: the stand-model rows and the A/C-model
# rows were both written into it by the preceding cells.
predictions

array([830.87817982, 737.90172884, 830.87817982, 737.90172884,
       830.87817982, 832.73899981, 830.87817982, 832.73899981,
       830.87817982, 830.87817982, 732.52244395, 832.73899981,
       773.7950731 , 832.73899981, 832.73899981, 832.73899981,
       832.73899981, 832.73899981, 830.87817982, 832.73899981,
       830.87817982, 832.73899981, 832.73899981, 830.87817982,
       732.52244395, 732.52244395, 830.87817982, 830.87817982,
       732.52244395, 830.87817982, 830.87817982, 737.90172884,
       832.73899981, 832.73899981, 832.73899981, 832.73899981,
       832.73899981, 732.52244395, 830.87817982, 737.90172884,
       732.52244395, 832.73899981, 832.73899981, 830.87817982,
       732.52244395, 732.52244395, 832.73899981, 832.73899981,
       732.52244395, 732.52244395, 732.52244395, 732.52244395,
       732.52244395, 830.87817982, 830.87817982, 732.52244395,
       830.87817982, 732.52244395, 832.73899981, 773.7950731 ,
       830.87817982, 832.73899981, 832.73899981, 832.73

In [52]:
from sklearn.tree import export_text

# Render the fitted A/C model as indented if/else text, labelling each split
# with the training column it tests.
tree_rules = export_text(
    dtreg,
    feature_names=[
        'CALL_TYPE',
        'TAXI_ID',
        'MON_sin',
        'MON_cos',
        'DAY_sin',
        'DAY_cos',
        'HR_sin',
        'HR_cos',
        'WK_sin',
        'WK_cos',
        'YR_2013',
    ],
)
print("Splits:", tree_rules, sep="\n")

Splits:
|--- HR_sin <= -0.13
|   |--- CALL_TYPE <= 0.50
|   |   |--- TAXI_ID <= 20000901.00
|   |   |   |--- TAXI_ID <= 20000093.00
|   |   |   |   |--- TAXI_ID <= 20000065.00
|   |   |   |   |   |--- value: [627.47]
|   |   |   |   |--- TAXI_ID >  20000065.00
|   |   |   |   |   |--- value: [376.59]
|   |   |   |--- TAXI_ID >  20000093.00
|   |   |   |   |--- HR_cos <= 0.92
|   |   |   |   |   |--- value: [604.16]
|   |   |   |   |--- HR_cos >  0.92
|   |   |   |   |   |--- value: [667.35]
|   |   |--- TAXI_ID >  20000901.00
|   |   |   |--- HR_cos <= 0.13
|   |   |   |   |--- MON_cos <= -0.25
|   |   |   |   |   |--- value: [721.76]
|   |   |   |   |--- MON_cos >  -0.25
|   |   |   |   |   |--- value: [590.71]
|   |   |   |--- HR_cos >  0.13
|   |   |   |   |--- HR_cos <= 0.79
|   |   |   |   |   |--- value: [1051.91]
|   |   |   |   |--- HR_cos >  0.79
|   |   |   |   |   |--- value: [1285.05]
|   |--- CALL_TYPE >  0.50
|   |   |--- HR_cos <= 0.38
|   |   |   |--- WK_sin <= -0.22
| 

In [53]:
def test_prediction_to_csv(y_pred, outfile_name, test_data):
    
	output_df = pd.DataFrame(test_data["TRIP_ID"])
	output_df["TRAVEL_TIME"] = y_pred
	output_df.head()

	output_df.to_csv(f'../guesses/{outfile_name}', index=False)

# Write the combined predictions out as the submission file.
test_prediction_to_csv(
    y_pred=predictions,
    outfile_name="decision_tree_separate_models.csv",
    test_data=test_data,
)