# NYC Taxi Fare & Duration
## Model Training

In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget

# Put the parent directory on the import path; this must happen before the
# `source.*` imports below so that the local package can be found.
sys.path.append("..")

# Project-local modules: configuration values and preprocessing helpers.
import source.configs as configs
import source.preprocessing as preprocessing

# Check if we are in COLAB: True when the `google.colab` module is already
# present in sys.modules, False otherwise.
IN_COLAB = 'google.colab' in sys.modules

In [15]:
if not os.path.exists("../dataset/yellow_tripdata_2022-05.parquet"):
    if IN_COLAB:
        !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
    else:
        wget.download(configs.START_DATASET_URL, "../dataset")

In [16]:
# Read the downloaded trip records into a DataFrame.
parquet_path = "../dataset/yellow_tripdata_2022-05.parquet"
dataset = pd.read_parquet(parquet_path)

In [17]:
# Cleaning stage: run the project helpers in order
# (trip-duration feature -> NA filling -> outlier filtering).
dataset = (
    dataset
    .pipe(preprocessing.add_trip_duration_feature)
    .pipe(preprocessing.fill_na_values)
    .pipe(preprocessing.filter_outliers)
)

In [18]:
# Time-feature stage: run the project helpers in order
# (day of week -> hour of day -> hour zone -> rush hour).
dataset = (
    dataset
    .pipe(preprocessing.add_day_of_week)
    .pipe(preprocessing.add_hour_of_day)
    .pipe(preprocessing.map_hour_zone)
    .pipe(preprocessing.map_rush_hour)
)

In [19]:
# Encoding stage: hand the frame to the project's one-hot encoding helper.
dataset = dataset.pipe(preprocessing.create_one_hot_encodings)

### Features selection
* trip_distance
* vendor_id
* passenger_count
* day_of_week (engineered)
* hour_zone (engineered)
* rush_hour

### Targets 
* trip_duration (engineered)
* fare_amount

### Dropped
* tpep_pickup_datetime: We use engineered features extracted from this timestamp.
* tpep_dropoff_datetime: We can't use this; it is only known after the trip ends, so it would leak the target.
* store_and_fwd_flag: Trip record was held in vehicle memory.
* extra: Rush hour + overnight.
* mta_tax: Automatically triggered based on rate in use.
* improvement_surcharge: Surcharge.
* total_amount: Fare + taxes + etc.
* tip_amount: Not used.
* congestion_surcharge: Not included for now.
* PULocationID & DOLocationID: Taxi zones, add 100s of categorical features, not used.
* payment_type: Should not be relevant.
* tolls_amount: Not used.
* airport_fee: Not used.

In [20]:
# Columns removed before training: the raw columns listed under "Dropped",
# plus the intermediate `hour_of_day` column as the final entry.
features_drop_list = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "store_and_fwd_flag",
    "extra",
    "mta_tax",
    "improvement_surcharge",
    "total_amount",
    "tip_amount",
    "congestion_surcharge",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "tolls_amount",
    "airport_fee",
    "hour_of_day",
]

In [21]:
# Remove every column named in `features_drop_list` via the project helper.
dataset = dataset.pipe(preprocessing.drop_features, features_drop_list)

In [22]:
# Show the columns that remain after dropping the unused features.
dataset.columns.tolist()

['passenger_count',
 'trip_distance',
 'RatecodeID',
 'fare_amount',
 'trip_duration',
 'day_of_week',
 'rush_hour',
 'RatecodeID_1.0',
 'RatecodeID_2.0',
 'RatecodeID_3.0',
 'RatecodeID_4.0',
 'RatecodeID_5.0',
 'RatecodeID_6.0',
 'VendorID_1',
 'VendorID_2',
 'VendorID_5',
 'VendorID_6',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'hour_zone_Afternoon',
 'hour_zone_Evening',
 'hour_zone_Morning',
 'hour_zone_Night',
 'hour_zone_Noon']

In [23]:
# Sanity check: no column may contain missing values at this point.
# On failure, the message lists the offending columns with their NaN counts.
na_counts = dataset.isna().sum()
assert (na_counts == 0).all(), (
    f"Columns with missing values:\n{na_counts[na_counts > 0]}"
)

In [24]:
# Summary statistics for every remaining column; useful for spotting
# implausible ranges (e.g. the min/max rows) before training.
dataset.describe()

Unnamed: 0,passenger_count,trip_distance,RatecodeID,fare_amount,trip_duration,day_of_week,rush_hour,RatecodeID_1.0,RatecodeID_2.0,RatecodeID_3.0,...,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,hour_zone_Afternoon,hour_zone_Evening,hour_zone_Morning,hour_zone_Night,hour_zone_Noon
count,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,...,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0,3588295.0
mean,1.341704,3.400188,1.083079,12.53954,18.22221,2.918017,0.3176007,0.9465144,0.04103314,0.003242766,...,0.139551,0.143486,0.1372239,0.1301445,0.1403318,0.1897141,0.2428981,0.2330787,0.1607117,0.1735975
std,0.7744593,3.996767,0.4307322,8.07262,51.56555,2.000686,0.4655433,0.2249997,0.1983669,0.05685289,...,0.3465206,0.3505678,0.3440837,0.3364624,0.347331,0.3920748,0.4288341,0.422792,0.3672649,0.378763
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.15,1.0,7.5,7.666667,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.96,1.0,10.5,12.7,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,3.6,1.0,14.75,20.61667,5.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,20.0,6.0,50.0,6823.55,6.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
# Split features and targets: the project helper returns three objects,
# unpacked here as the feature matrix and two targets.
# NOTE(review): y_td / y_fa presumably hold trip_duration and fare_amount —
# confirm in preprocessing.split_dataset.
X, y_td, y_fa = preprocessing.split_dataset(dataset)

In [32]:
# Hold out 20% of the rows for evaluating the `y_td` model; the fixed seed
# keeps the split repeatable between runs.
duration_split = train_test_split(X, y_td, test_size=0.2, random_state=42)
X_train, X_test, y_train_td, y_test_td = duration_split

In [33]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# Random forest for the `y_td` target. A fixed `random_state` makes the
# bootstrap samples and feature choices, and therefore the reported scores,
# repeatable across Restart & Run All.
# A plain LinearRegression() can be swapped in here as a quick baseline.
model_td = RandomForestRegressor(n_estimators=40, n_jobs=-1, random_state=42)

# ravel() hands scikit-learn a flat 1-D target array.
model_td.fit(X_train_scaled, y_train_td.values.ravel())
predictions = model_td.predict(X_test_scaled)

In [43]:
# Score the held-out predictions: mean squared error and R^2.
mse_td = mean_squared_error(y_true=y_test_td, y_pred=predictions)
r2_td = r2_score(y_true=y_test_td, y_pred=predictions)

In [44]:
# Report the held-out metrics. R2 can be negative: that means the model did
# worse than always predicting the mean of the target.
print(f"MSE: {mse_td}")
print(f"R2: {r2_td}")

MSE: 2802.5777391211564
re: -0.062444797793963724
