# NYC Taxi Fare & Duration
## Model Training

In [1]:
# Enable IPython's autoreload extension so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution.
%autoreload 2

In [2]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget

# Put the parent directory on the import path *before* importing the
# project-local `source` package below; the order of these lines matters.
sys.path.append("..")

import source.configs as configs
import source.preprocessing as preprocessing

# Check if we are in COLAB
# True when the `google.colab` module has already been loaded into this
# interpreter; used later to choose how the dataset is downloaded.
IN_COLAB = 'google.colab' in sys.modules

In [3]:
if not os.path.exists("../dataset/yellow_tripdata_2022-05.parquet"):
    if IN_COLAB:
        !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
    else:
        wget.download(configs.START_DATASET_URL, "../dataset")

In [4]:
# Load the raw trip records from the locally cached parquet file.
dataset = pd.read_parquet("../dataset/yellow_tripdata_2022-05.parquet")

In [5]:
# Cleaning stage: run the preprocessing helpers in order, each one taking
# the current frame and returning the next (see source.preprocessing for
# what each helper does).
cleaning_steps = (
    preprocessing.add_trip_duration_feature,
    preprocessing.fill_na_values,
    # preprocessing.filter_outliers is a disabled alternative to the
    # delete_outliers step below.
    preprocessing.delete_outliers,
)
for cleaning_step in cleaning_steps:
    dataset = cleaning_step(dataset)

In [6]:
# Time-feature stage: apply the engineered-feature helpers one after the
# other, in the same order as listed here.
time_feature_steps = (
    preprocessing.add_day_of_week,
    preprocessing.add_hour_of_day,
    preprocessing.map_hour_zone,
    preprocessing.map_rush_hour,
)
for feature_step in time_feature_steps:
    dataset = feature_step(dataset)

In [7]:
# Add one-hot indicator columns; the column listing further down shows
# RatecodeID_*, VendorID_*, day_of_week_* and hour_zone_* as the result.
dataset = preprocessing.create_one_hot_encodings(dataset)

### Feature selection
* trip_distance
* vendor_id
* passenger_count
* day_of_week (engineered)
* hour_zone (engineered)
* rush_hour (engineered)

### Targets 
* trip_duration (engineered)
* fare_amount

### Dropped
* tpep_pickup_datetime: We use engineered features extracted from this timestamp.
* tpep_dropoff_datetime: Only known once the trip has ended, so it is not available at prediction time and would leak the trip_duration target.
* store_and_fwd_flag: Trip record was held in vehicle memory.
* extra: Rush hour + overnight.
* mta_tax: Automatically triggered based on the rate in use.
* improvement_surcharge: Surcharge.
* total_amount: Fare + taxes + etc.
* tip_amount: Not used.
* congestion_surcharge: Not included for now.
* PULocationID & DOLocationID: Taxi zones, add 100s of categorical features, not used.
* payment_type: Should not be relevant.
* tolls_amount: Not used.
* airport_fee: Not used.

In [8]:
# Columns removed before training (rationale is in the "Dropped" notes above).
features_drop_list = [
    # Raw timestamps
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    # Record flags, surcharges and other amounts not used as features
    "store_and_fwd_flag",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "total_amount",
    "tip_amount",
    "congestion_surcharge",
    # Taxi-zone identifiers and payment information
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "tolls_amount",
    "airport_fee",
    # Intermediate hour column, dropped together with the raw fields
    "hour_of_day",
]

In [9]:
# Build the training frame by removing every column named in
# features_drop_list from the preprocessed dataset.
train_dataset = preprocessing.drop_features(dataset, features_drop_list)

In [10]:
# Show the columns that remain after dropping.
# NOTE(review): the saved output lists the raw `RatecodeID` and `day_of_week`
# columns alongside their one-hot encodings — confirm this redundancy is intended.
train_dataset.columns.to_list()

['passenger_count',
 'trip_distance',
 'RatecodeID',
 'fare_amount',
 'trip_duration',
 'day_of_week',
 'rush_hour',
 'RatecodeID_1.0',
 'RatecodeID_2.0',
 'RatecodeID_3.0',
 'RatecodeID_4.0',
 'RatecodeID_5.0',
 'VendorID_1',
 'VendorID_2',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'hour_zone_Afternoon',
 'hour_zone_Evening',
 'hour_zone_Morning',
 'hour_zone_Night',
 'hour_zone_Noon']

In [11]:
# Sanity check: every column's NaN count must be zero before training.
assert all(nan_count == 0 for nan_count in train_dataset.isna().sum())

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
train_dataset.describe()

Unnamed: 0,passenger_count,trip_distance,RatecodeID,fare_amount,trip_duration,day_of_week,rush_hour,RatecodeID_1.0,RatecodeID_2.0,RatecodeID_3.0,...,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,hour_zone_Afternoon,hour_zone_Evening,hour_zone_Morning,hour_zone_Night,hour_zone_Noon
count,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,...,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0,1773738.0
mean,1.404355,2.293065,1.01029,11.09225,13.1666,2.812467,0.3287763,0.9971478,5.637811e-07,0.0003444703,...,0.1580341,0.1644685,0.1572589,0.1203757,0.1224922,0.1858296,0.2462303,0.2367988,0.1602283,0.1709131
std,0.9336862,1.829311,0.1975633,5.935139,8.052106,2.005538,0.4697686,0.0533295,0.0007508536,0.01855672,...,0.364773,0.3707003,0.3640447,0.3254005,0.3278535,0.3889692,0.4308144,0.4251179,0.3668177,0.3764331
min,1.0,0.0,1.0,2.09,0.01666667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.08,1.0,7.0,7.216667,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.72,1.0,9.5,11.46667,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2.85,1.0,13.5,17.36667,5.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,9.56,5.0,47.0,53.11667,6.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
# Split features and targets
# split_dataset returns three objects: the feature matrix X and two targets.
# Presumably y_td is trip_duration and y_fa is fare_amount — confirm against
# source.preprocessing. y_fa is not used by the cells shown in this notebook.
X, y_td, y_fa = preprocessing.split_dataset(train_dataset)

In [15]:
# Hold out 20% of the rows as a test set for the y_td target; the fixed
# random_state makes the split reproducible between runs.
X_train, X_test, y_train_td, y_test_td = train_test_split(X, y_td, test_size=0.2, random_state=42)

In [16]:
# Standardise the features. The scaler learns its statistics from the
# training split only and then applies the same transform to both splits,
# so no information from the test split enters the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Fit the y_td regressor on the scaled training features and predict on the
# scaled test features.
# random_state pins the forest's internal randomness so a Restart & Run All
# reproduces the same metrics; 42 matches the seed used for the split.
model_td = RandomForestRegressor(n_jobs=-1, n_estimators=40, random_state=42)
# Alternative baseline model (disabled):
#model_td = LinearRegression()
# .values.ravel() hands the target to fit() as a flat 1-D array.
model_td.fit(X_train_scaled, y_train_td.values.ravel())
predictions = model_td.predict(X_test_scaled)

In [18]:
# Score the predictions against the held-out targets:
# mean squared error and coefficient of determination (R^2).
mse_td = mean_squared_error(y_test_td, predictions)
r2_td = r2_score(y_test_td, predictions)

In [19]:
# Report the held-out metrics. The second value is the R^2 score, so it is
# labelled "R2" (the previous "re" label was a typo).
print(f"MSE: {mse_td}")
print(f"R2: {r2_td}")

MSE: 23.278861974076484
re: 0.6404123397824355
