In [10]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

# Archive of cleaned data (per the original note, produced by
# data_preparation.ipynb). The two CSV members named below are read into
# train_data and test_data respectively.
zipped_data_path = "../data/clean_data/class-competition-cleaned.zip"
train_csv = "train_call_type_C.csv"
test_csv = "test_public.csv"

with zipfile.ZipFile(zipped_data_path, "r") as zipf:
    # Open each member in its own context manager so its handle is closed as
    # soon as pandas has finished parsing it, rather than lingering until
    # garbage collection.
    with zipf.open(train_csv) as train_member:
        train_data = pd.read_csv(train_member)
    with zipf.open(test_csv) as test_member:
        test_data = pd.read_csv(test_member)

In [11]:
# Use the first CUDA device when PyTorch reports one is available; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [12]:
# Use only data points with CALL_TYPE C.
# .copy() detaches the filtered rows from the frame they were selected from,
# so columns can be added to test_data afterwards without pandas raising
# SettingWithCopyWarning (or writing to a temporary object).
test_data = test_data[test_data['CALL_TYPE'] == 'C'].copy()
test_data.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,START_LOCATION,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013
11,T12,C,,,20000160,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
12,T13,C,,,20000017,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
13,T14,C,,,20000312,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
14,T15,C,,,20000497,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
15,T16,C,,,20000440,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False


In [13]:
from sklearn.preprocessing import StandardScaler

# Mean target encoding of TAXI_ID: each taxi is represented by the average
# TRAVEL_TIME of its rows in train_data.
# NOTE(review): the means are computed over all of train_data, so rows that are
# later held out for validation have contributed to their own feature value;
# validation scores may therefore be optimistic. Confirm whether that matters.
mean_encoding = train_data.groupby('TAXI_ID')['TRAVEL_TIME'].mean().reset_index()

# Create a dictionary mapping 'TAXI_ID' to mean 'TRAVEL_TIME' value
mean_encoding_dict = dict(zip(mean_encoding['TAXI_ID'], mean_encoding['TRAVEL_TIME']))

# Fallback for taxis that never appear in train_data: without it, .map() yields
# NaN for them and the NaN propagates through the scaler into the estimators.
global_mean_travel_time = train_data['TRAVEL_TIME'].mean()

# Work on an independent copy so that adding columns below never writes into a
# filtered view of another frame.
test_data = test_data.copy()

# Replace the 'TAXI_ID' values with mean target encoding values
train_data['TAXI_ID_MEAN_ENC'] = train_data['TAXI_ID'].map(mean_encoding_dict)
test_data['TAXI_ID_MEAN_ENC'] = (
    test_data['TAXI_ID'].map(mean_encoding_dict).fillna(global_mean_travel_time)
)

# StandardScaler expects 2-D input, hence the single-column DataFrames.
mean_enc_train = pd.DataFrame(train_data['TAXI_ID_MEAN_ENC'])
mean_enc_test = pd.DataFrame(test_data['TAXI_ID_MEAN_ENC'])

# Fit the scaler on the training column only, then apply the same
# transformation to both train and test.
scaler = StandardScaler()
scaler.fit(mean_enc_train)

normalized_enc_train = scaler.transform(mean_enc_train)
normalized_enc_test = scaler.transform(mean_enc_test)

# transform() returns arrays of shape (n_rows, 1); flatten them before storing
# them as ordinary columns.
train_data['TAXI_ID_MEAN_ENC_NORMALIZED'] = normalized_enc_train.ravel()
test_data['TAXI_ID_MEAN_ENC_NORMALIZED'] = normalized_enc_test.ravel()

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466579 entries, 0 to 466578
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   TRIP_ID                      466579 non-null  int64  
 1   CALL_TYPE                    466579 non-null  object 
 2   ORIGIN_CALL                  0 non-null       float64
 3   ORIGIN_STAND                 0 non-null       float64
 4   TAXI_ID                      466579 non-null  int64  
 5   POLYLINE                     466579 non-null  object 
 6   TRAVEL_TIME                  466579 non-null  int64  
 7   START_LOCATION               0 non-null       float64
 8   MON_sin                      466579 non-null  float64
 9   MON_cos                      466579 non-null  float64
 10  DAY_sin                      466579 non-null  float64
 11  DAY_cos                      466579 non-null  float64
 12  HR_sin                       466579 non-null  float64
 13 

In [14]:
from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix.
# We could totally change this. Utilization of these just probably requires further preprocessing.
ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION = ['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'POLYLINE', 'START_LOCATION', 'TAXI_ID', 'TAXI_ID_MEAN_ENC']

# Subsample the training table, then separate the target from the features.
train_data_sample = train_data.sample(frac=0.8, random_state=420) # frac is used to control percentage of train data used
X = train_data_sample.drop("TRAVEL_TIME", axis=1)
X = X.loc[:, ~X.columns.isin(ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION)]
y = train_data_sample["TRAVEL_TIME"]

# Hold out 20% of the sampled rows for validation (X_test / y_test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

# Select the test columns by the training feature names so the two matrices are
# guaranteed to have the same columns in the same order. A missing column
# raises KeyError here instead of failing later inside predict().
test_features = test_data.loc[:, X.columns]

In [21]:
# Preview the first five rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013,TAXI_ID_MEAN_ENC_NORMALIZED
5297,-0.5,-0.8660254,0.848644,0.528964,-0.5,-0.8660254,-0.433884,-0.900969,True,-0.872989
444220,1.224647e-16,-1.0,0.299363,-0.954139,-1.0,-1.83697e-16,-0.974928,-0.222521,False,-0.293832
23860,-0.5,-0.8660254,-0.790776,-0.612106,-0.965926,0.258819,-0.974928,-0.222521,True,-0.388125
318767,1.0,6.123234000000001e-17,-0.101168,-0.994869,-0.5,-0.8660254,-0.781831,0.62349,False,-0.267906
199640,-2.449294e-16,1.0,0.485302,-0.874347,1.0,6.123234000000001e-17,-0.433884,-0.900969,True,1.368616


In [15]:
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so that call raises
    ``TypeError`` on current releases.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth target values.
    y_pred : array-like of the same shape as ``y_true``
        Predicted target values. Values are compared positionally; pandas
        index labels are ignored.

    Returns
    -------
    float
        The RMSE. For 2-D input the RMSE is computed per output column and the
        column values are averaged uniformly.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain NaN/infinity.
    """
    true_values = np.asarray(y_true, dtype=float)
    predictions = np.asarray(y_pred, dtype=float)

    # Treat 1-D input as a single output column so one code path serves both
    # the single-output and the multi-output case.
    if true_values.ndim == 1:
        true_values = true_values.reshape(-1, 1)
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)

    if true_values.shape != predictions.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: "
            f"{true_values.shape} vs {predictions.shape}"
        )
    if true_values.size == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")
    if not (np.isfinite(true_values).all() and np.isfinite(predictions).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinity")

    per_output_rmse = np.sqrt(np.mean((true_values - predictions) ** 2, axis=0))
    return float(per_output_rmse.mean())

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with a fixed seed; n_jobs=-1 lets scikit-learn use
# all available cores.
rf = RandomForestRegressor(
    n_estimators=1200,
    max_depth=60,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=420,
    n_jobs=-1,
)

# Fit on the training split and score on the held-out split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

root_mean_squared_error(y_test, y_pred)

431.8350086971084

In [17]:
def test_prediction_to_csv(y_pred, outfile_name):
	output_df = pd.DataFrame(test_data["TRIP_ID"])
	output_df["TRAVEL_TIME"] = y_pred
	output_df.head()
	output_df.to_csv(f'../guesses/{outfile_name}', index=False)

# Predict on the test feature matrix with the fitted random forest and write
# the predictions to CSV.
y_pred = rf.predict(test_features)
test_prediction_to_csv(
    y_pred=y_pred,
    outfile_name="predicting_random_forest_calltype_C.csv",
)

### XGBoost

In [18]:
from xgboost import XGBRFRegressor

# XGBoost's random-forest regressor; colsample_bynode is the fraction of
# columns sampled at each split.
model = XGBRFRegressor(
    n_estimators=1200,
    colsample_bynode=0.2,
)

# Fit on the training split and score on the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

root_mean_squared_error(y_test, y_pred)

441.0936532513977

In [22]:
from xgboost import XGBRFRegressor
from numpy import arange
# TAKES 5 MIN 40 SEC ON M1 PRO CPU
n_trees = [10, 50, 100, 500, 1200]
# arange accumulates floating-point error (0.30000000000000004, ...); round
# back to the intended grid of 0.1, 0.3, 0.5, 0.7, 0.9.
colsample_grid = [round(float(b), 1) for b in arange(0.1, 1.1, 0.2)]

# models keeps the best fitted model for each tree count (keyed by str(v)).
# Previously every colsample_bynode iteration overwrote the same key, so only
# the last-fitted model survived regardless of its score.
models = dict()
# grid_rmse records the validation RMSE for every (n_trees, colsample_bynode)
# pair, so results can be compared after the loop instead of read from stdout.
grid_rmse = dict()
for v in n_trees:
    print('Num trees: ', v)
    best_rmse = float('inf')
    for b in colsample_grid:
        candidate = XGBRFRegressor(n_estimators=v, colsample_bynode=b)
        candidate.fit(X_train, y_train)
        y_pred = candidate.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)
        grid_rmse[(v, b)] = rmse
        print('colsample_bynode: ', b, '| RMSE: ', rmse)
        if rmse < best_rmse:
            best_rmse = rmse
            models[str(v)] = candidate

Num trees:  10
colsample_bynode:  0.1 | RMSE:  449.1692949046564
colsample_bynode:  0.30000000000000004 | RMSE:  439.1376594352199
colsample_bynode:  0.5000000000000001 | RMSE:  434.51951595348595
colsample_bynode:  0.7000000000000001 | RMSE:  432.6656574363073
colsample_bynode:  0.9000000000000001 | RMSE:  432.78121008037044
Num trees:  50
colsample_bynode:  0.1 | RMSE:  449.63066751132175
colsample_bynode:  0.30000000000000004 | RMSE:  438.6660971605217
colsample_bynode:  0.5000000000000001 | RMSE:  433.8036895080591
colsample_bynode:  0.7000000000000001 | RMSE:  432.65159247696903
colsample_bynode:  0.9000000000000001 | RMSE:  432.7031515182598
Num trees:  100
colsample_bynode:  0.1 | RMSE:  451.31147263417387
colsample_bynode:  0.30000000000000004 | RMSE:  438.5882477440147
colsample_bynode:  0.5000000000000001 | RMSE:  433.7355275464443
colsample_bynode:  0.7000000000000001 | RMSE:  432.58843426173195
colsample_bynode:  0.9000000000000001 | RMSE:  432.7124444987802
Num trees:  500

In [None]:
# Predict on the test feature matrix with `model` and write the predictions
# to CSV.
y_pred = model.predict(test_features)
test_prediction_to_csv(
    y_pred=y_pred,
    outfile_name="predicting_rf_xgboost_calltype_C.csv",
)