In [22]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

# Archive of cleaned competition data (per the original note, produced by
# data_preparation.ipynb). The two names below are the CSV members read from it.
zipped_data_path = "../data/clean_data/class-competition-cleaned.zip"
train_csv = "train_call_type_A.csv"
test_csv = "test_public.csv"

# Open each archive member in its own `with` so the member handle is closed as
# soon as pandas has finished parsing it, not left for the garbage collector.
with zipfile.ZipFile(zipped_data_path, "r") as zipf:
    with zipf.open(train_csv) as train_file:
        train_data = pd.read_csv(train_file)
    with zipf.open(test_csv) as test_file:
        test_data = pd.read_csv(test_file)

In [23]:
# Pick the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [24]:
# Use only data points with CALL_TYPE A.
# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignments made in later cells write to a real DataFrame instead
# of a slice (which would raise SettingWithCopyWarning and may not persist).
test_data = test_data[test_data['CALL_TYPE'] == 'A'].copy()
test_data.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,START_LOCATION,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013
5,T6,A,42612.0,,20000607,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
7,T8,A,31780.0,,20000619,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
21,T22,A,85698.0,,20000199,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
22,T23,A,37007.0,,20000480,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
36,T37,A,2002.0,,20000159,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False


In [25]:
from sklearn.preprocessing import StandardScaler

# --- Mean (target) encoding of the ID columns --------------------------------
# Every TAXI_ID / ORIGIN_CALL value is replaced by the mean TRAVEL_TIME observed
# for that value in train_data.
# NOTE(review): the means are computed on the whole of train_data, before the
# train/validation split made further down, so held-out rows contribute to
# their own encoding and the hold-out RMSE is likely optimistic.
mean_encoding_taxi = train_data.groupby('TAXI_ID')['TRAVEL_TIME'].mean().reset_index()
mean_encoding_call = train_data.groupby('ORIGIN_CALL')['TRAVEL_TIME'].mean().reset_index()

# Lookup tables: category value -> mean TRAVEL_TIME
mean_encoding_taxi_dict = dict(zip(mean_encoding_taxi['TAXI_ID'], mean_encoding_taxi['TRAVEL_TIME']))
mean_encoding_call_dict = dict(zip(mean_encoding_call['ORIGIN_CALL'], mean_encoding_call['TRAVEL_TIME']))

# .assign returns a new frame, so this cell can be re-run safely and never
# writes into a slice of another DataFrame.
train_data = train_data.assign(
    TAXI_ID_MEAN_ENC=train_data['TAXI_ID'].map(mean_encoding_taxi_dict),
    ORIGIN_CALL_MEAN_ENC=train_data['ORIGIN_CALL'].map(mean_encoding_call_dict),
)

# Fallback values for categories that appear in the test set but not in training.
overall_taxi_mean_enc = train_data['TAXI_ID_MEAN_ENC'].mean()
overall_mean_enc = train_data['ORIGIN_CALL_MEAN_ENC'].mean()

# fillna is applied to the mapped Series directly; the previous chained
# `test_data[col].fillna(..., inplace=True)` is deprecated and does nothing
# under pandas Copy-on-Write. Unseen TAXI_IDs are now filled as well, so the
# test feature matrix cannot carry NaNs into the models.
test_data = test_data.assign(
    TAXI_ID_MEAN_ENC=test_data['TAXI_ID'].map(mean_encoding_taxi_dict).fillna(overall_taxi_mean_enc),
    ORIGIN_CALL_MEAN_ENC=test_data['ORIGIN_CALL'].map(mean_encoding_call_dict).fillna(overall_mean_enc),
)
assert test_data[['TAXI_ID_MEAN_ENC', 'ORIGIN_CALL_MEAN_ENC']].notna().all().all(), \
    "test-set mean encodings still contain NaN"

# --- Standardise each encoded column ----------------------------------------
# A scaler is fitted on the training column only and then applied to both
# frames; the result is stored in a '<column>_NORMALIZED' column.
for enc_col in ('TAXI_ID_MEAN_ENC', 'ORIGIN_CALL_MEAN_ENC'):
    scaler = StandardScaler().fit(train_data[[enc_col]])
    train_data[f'{enc_col}_NORMALIZED'] = scaler.transform(train_data[[enc_col]]).ravel()
    test_data[f'{enc_col}_NORMALIZED'] = scaler.transform(test_data[[enc_col]]).ravel()

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346466 entries, 0 to 346465
Data columns (total 21 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   TRIP_ID                          346466 non-null  int64  
 1   CALL_TYPE                        346466 non-null  object 
 2   ORIGIN_CALL                      346466 non-null  float64
 3   ORIGIN_STAND                     0 non-null       float64
 4   TAXI_ID                          346466 non-null  int64  
 5   POLYLINE                         346466 non-null  object 
 6   TRAVEL_TIME                      346466 non-null  int64  
 7   START_LOCATION                   0 non-null       float64
 8   MON_sin                          346466 non-null  float64
 9   MON_cos                          346466 non-null  float64
 10  DAY_sin                          346466 non-null  float64
 11  DAY_cos                          346466 non-null  float64
 12  HR

In [26]:
from sklearn.model_selection import train_test_split

# We could totally change this. Utilization of these just probably requires further preprocessing.
# The raw ID columns and their un-normalised mean encodings are excluded; only
# the '*_NORMALIZED' versions are kept as model inputs.
ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION = ['TRIP_ID', 'CALL_TYPE', 'ORIGIN_STAND', 'POLYLINE', 'START_LOCATION', 'ORIGIN_CALL', 'TAXI_ID', 'ORIGIN_CALL_MEAN_ENC', 'TAXI_ID_MEAN_ENC']

train_data_sample = train_data.sample(frac=0.8, random_state=420) # frac is used to control percentage of train data used
X = train_data_sample.drop("TRAVEL_TIME", axis=1)
X = X.loc[:, ~X.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]
y = train_data_sample["TRAVEL_TIME"]
# Hold out 20% of the sampled rows for validation (named *_test here).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

# Select the competition test features by the training matrix's own column
# list, so both matrices have identical columns in identical order. A column
# missing from test_data raises a KeyError here instead of surfacing later as
# a shape or feature-name mismatch at predict time.
test_features = test_data.loc[:, X.columns]

In [33]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013,TAXI_ID_MEAN_ENC_NORMALIZED,ORIGIN_CALL_MEAN_ENC_NORMALIZED
90406,-0.866025,0.5,0.968077,-0.250653,0.707107,0.707107,0.974928,-0.222521,True,-0.124068,0.08366
179074,0.5,0.8660254,0.998717,-0.050649,0.0,1.0,0.974928,-0.222521,False,0.400316,-0.484996
185955,0.5,0.8660254,0.101168,-0.994869,0.866025,-0.5,0.974928,-0.222521,False,0.867617,1.023515
268597,0.866025,-0.5,0.651372,-0.758758,-0.5,-0.866025,-0.974928,-0.222521,False,0.243007,-2.007787
250858,1.0,6.123234000000001e-17,-0.937752,0.347305,0.707107,-0.707107,0.781831,0.62349,False,0.002879,0.535719


In [27]:
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The `squared=False` flag of `sklearn.metrics.mean_squared_error` was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the square root is
    taken here instead. For multi-output targets the RMSE is computed per
    output and then averaged uniformly, which is what `squared=False` did.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted target values, same shape as `y_true`.

    Returns:
        The RMSE as a NumPy float.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return np.sqrt(per_output_mse).mean()

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest regressor used as the baseline model. One keyword per line so
# individual hyper-parameters are easy to read and tweak.
rf = RandomForestRegressor(
    n_estimators=1200,
    max_depth=60,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=420,
    n_jobs=-1,
)

In [29]:
# Fit the random forest on the training split and predict the held-out split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out RMSE; as the cell's last expression it is shown as the cell output.
root_mean_squared_error(y_test, y_pred)

317.4090789825179

In [30]:
def test_prediction_to_csv(y_pred, outfile_name):
	output_df = pd.DataFrame(test_data["TRIP_ID"])
	output_df["TRAVEL_TIME"] = y_pred
	output_df.head()
	output_df.to_csv(f'../guesses/{outfile_name}', index=False)
	
# Predict the competition test set with the fitted random forest and write the
# result to a CSV. Note that this rebinds y_pred, which previously held the
# hold-out predictions.
y_pred = rf.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_random_forest_calltype_A.csv")

### XGBoost random forest (`XGBRFRegressor`)

In [31]:
from xgboost import XGBRFRegressor

# Fit XGBoost's random-forest regressor on the same training split as the
# scikit-learn forest and predict the held-out split.
model = XGBRFRegressor(n_estimators=1200, colsample_bynode=0.2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out RMSE; as the cell's last expression it is shown as the cell output.
root_mean_squared_error(y_test, y_pred)

338.066586606972

In [32]:
from xgboost import XGBRFRegressor
from numpy import arange
# TAKES ~4 MIN 30 SEC ON M1 PRO CPU
# Grid over the number of trees and the per-node column-sampling fraction.
n_trees = [10, 50, 100, 500, 1200]
# Round the float grid so values print and key cleanly (0.3, not 0.30000000000000004).
colsample_values = [round(float(b), 1) for b in arange(0.1, 1.1, 0.2)]

models = dict()     # str(n_estimators) -> best-scoring model for that tree count
grid_rmse = dict()  # (n_estimators, colsample_bynode) -> hold-out RMSE

for v in n_trees:
    print('Num trees: ', v)
    best_rmse_for_v = float('inf')
    for b in colsample_values:
        candidate = XGBRFRegressor(n_estimators=v, colsample_bynode=b)
        candidate.fit(X_train, y_train)
        y_pred = candidate.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)
        grid_rmse[(v, b)] = rmse
        # Keep the best model per tree count. Previously models[str(v)] was
        # overwritten on every inner iteration, so it held whichever
        # colsample_bynode happened to be tried last.
        if rmse < best_rmse_for_v:
            best_rmse_for_v = rmse
            models[str(v)] = candidate
        print('colsample_bynode: ', b, '| RMSE: ', rmse)

# Report the best configuration found across the whole grid.
best_n_trees, best_colsample = min(grid_rmse, key=grid_rmse.get)
print('Best: n_estimators =', best_n_trees, '| colsample_bynode =', best_colsample,
      '| RMSE: ', grid_rmse[(best_n_trees, best_colsample)])

Num trees:  10
colsample_bynode:  0.1 | RMSE:  371.71444965498233
colsample_bynode:  0.30000000000000004 | RMSE:  325.52322223156284
colsample_bynode:  0.5000000000000001 | RMSE:  321.1265959018283
colsample_bynode:  0.7000000000000001 | RMSE:  319.1493216136946
colsample_bynode:  0.9000000000000001 | RMSE:  319.1390414815367
Num trees:  50
colsample_bynode:  0.1 | RMSE:  360.3202152027043
colsample_bynode:  0.30000000000000004 | RMSE:  329.3915865121829
colsample_bynode:  0.5000000000000001 | RMSE:  321.63492056943
colsample_bynode:  0.7000000000000001 | RMSE:  319.31030045976615
colsample_bynode:  0.9000000000000001 | RMSE:  318.971755248209
Num trees:  100
colsample_bynode:  0.1 | RMSE:  358.8260328378554
colsample_bynode:  0.30000000000000004 | RMSE:  328.45256037440595
colsample_bynode:  0.5000000000000001 | RMSE:  321.0969674007282
colsample_bynode:  0.7000000000000001 | RMSE:  319.130423067615
colsample_bynode:  0.9000000000000001 | RMSE:  318.9244420934945
Num trees:  500
colsa

In [None]:
# Predict the competition test set and write the result to a CSV.
# `model` is the XGBRFRegressor fitted in the XGBoost cell above
# (n_estimators=1200, colsample_bynode=0.2), not a model from the grid search.
y_pred = model.predict(test_features)
test_prediction_to_csv(y_pred, "predicting_rf_xgboost_calltype_A.csv")