# NYC Taxi Fare & Duration
## Model Training

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget
import pickle

# Add the parent directory to the import path; presumably this is what lets
# the `source` package imported below resolve — confirm against the repo layout.
sys.path.append("..")

import source.configs as configs
import preprocessing as preprocessing

# Detect Google Colab: the flag is True when the `google.colab` module is
# already loaded in this interpreter. It selects the download method used below.
IN_COLAB = 'google.colab' in sys.modules

In [3]:
if not os.path.exists("../dataset/yellow_tripdata_2022-05.parquet"):
    if IN_COLAB:
        !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
    else:
        wget.download(configs.START_DATASET_URL, "../dataset")

In [4]:
# Load the raw trip records from the local parquet file into a DataFrame.
dataset = pd.read_parquet("../dataset/yellow_tripdata_2022-05.parquet")

In [5]:
# Apply the preprocessing stages in their original order: add features,
# add targets, fill missing values, then process outliers in "delete" mode.
dataset = (
    dataset
    .pipe(preprocessing.add_features)
    .pipe(preprocessing.add_targets)
    .pipe(preprocessing.fill_na_values)
    .pipe(preprocessing.process_outliers, "delete")
)

### Features
* trip_distance
* hour_of_day: Extracted from tpep_pickup_datetime
* hour_zone: Extracted from tpep_pickup_datetime. Categorical: [Morning, Noon, Afternoon, Evening, Night]
* rush_hour: Extracted from tpep_pickup_datetime. Binary.
### Targets
* fare_amount: in US dollars
* trip_duration: in minutes

In [6]:
# Keep only the feature and target columns, then apply the project's one-hot
# encoding step, which also returns the fitted encoders.
columns = preprocessing.features + preprocessing.targets
train_dataset = preprocessing.select_features(dataset, columns)
train_dataset, encoders = preprocessing.create_one_hot_encodings(train_dataset, columns)

# Persist the encoders for the inference section. The context manager makes
# sure the file is flushed and closed instead of leaking an open handle.
with open('encoders.pkl', 'wb') as encoders_file:
    pickle.dump(encoders, encoders_file)

In [7]:
# Display the column layout of the training frame after encoding.
train_dataset.columns.to_list()

['trip_distance',
 'hour_of_day',
 'rush_hour',
 'fare_amount',
 'trip_duration',
 'hour_zone_afternoon',
 'hour_zone_evening',
 'hour_zone_morning',
 'hour_zone_night',
 'hour_zone_noon']

In [8]:
# Sanity check before training: the NA count of every column must be zero.
assert train_dataset.isna().sum().eq(0).all()
print("Ok, no NAs in train dataset.")

Ok, no NAs in train dataset.


In [9]:
# Summary statistics for every column, as a quick look at value ranges.
train_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,fare_amount,trip_duration,hour_zone_afternoon,hour_zone_evening,hour_zone_morning,hour_zone_night,hour_zone_noon
count,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0
mean,3.243137,14.16682,0.3170713,13.69555,15.33393,0.3019203,0.2459235,0.2269525,0.1685594,0.05664432
std,3.744687,5.748614,0.4653355,10.54161,10.91489,0.4590909,0.4306335,0.4188617,0.3743624,0.2311618
min,0.0,0.0,0.0,2.5,1.1,0.0,0.0,0.0,0.0,0.0
25%,1.14,10.0,0.0,7.0,7.666667,0.0,0.0,0.0,0.0,0.0
50%,1.9,15.0,0.0,10.0,12.41667,0.0,0.0,0.0,0.0,0.0
75%,3.43,19.0,1.0,15.5,19.68333,1.0,0.0,0.0,0.0,0.0
max,20.3,23.0,1.0,55.5,65.91667,1.0,1.0,1.0,1.0,1.0


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Split the training frame into the feature matrix X and the targets y.
# y is indexed by "trip_duration" and "fare_amount" further down.
X, y = preprocessing.split_dataset(train_dataset)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Trip-duration model: a small random forest trained on the scaled features.
y_train_td = y_train["trip_duration"]
# A fixed random_state makes the fitted forest, and therefore the reported
# metrics, reproducible from one run to the next.
model_td = RandomForestRegressor(n_jobs=-1, n_estimators=5, random_state=42)
model_td.fit(X_train_scaled, y_train_td.values.ravel())
# NOTE: `predictions` is overwritten when the fare model is trained, so the
# duration metrics must be computed before that cell runs.
predictions = model_td.predict(X_test_scaled)

In [15]:
# Score the trip-duration predictions against the held-out targets.
y_test_td = y_test["trip_duration"]
mse_td = mean_squared_error(y_test_td, predictions)
r2_td = r2_score(y_test_td, predictions)
print(f"MSE: {mse_td}")
# The second metric is the R2 score, so it is labelled as such (was "re").
print(f"R2: {r2_td}")

MSE: 31.668346386910024
re: 0.7345484889010755


In [16]:
# Fare-amount model: a small random forest trained on the scaled features.
y_train_fa = y_train["fare_amount"]
# A fixed random_state makes the fitted forest, and therefore the reported
# metrics, reproducible from one run to the next.
model_fa = RandomForestRegressor(n_jobs=-1, n_estimators=5, random_state=42)
model_fa.fit(X_train_scaled, y_train_fa.values.ravel())
# NOTE: this replaces the `predictions` array produced by the duration model.
predictions = model_fa.predict(X_test_scaled)

In [17]:
# Score the fare-amount predictions against the held-out targets.
y_test_fa = y_test["fare_amount"]
mse_fa = mean_squared_error(y_test_fa, predictions)
r2_fa = r2_score(y_test_fa, predictions)
print(f"MSE: {mse_fa}")
# The second metric is the R2 score, so it is labelled as such (was "re").
print(f"R2: {r2_fa}")

MSE: 7.469940771627599
re: 0.9328755983533104


In [18]:
# Persist both fitted models and the scaler for the inference section.
# Each file is written inside a context manager so the handle is flushed and
# closed as soon as the dump finishes, rather than left open.
with open('model_fa.pkl', 'wb') as model_fa_file:
    pickle.dump(model_fa, model_fa_file)
with open('model_td.pkl', 'wb') as model_td_file:
    pickle.dump(model_td, model_td_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### Inference

In [19]:
# Reload each persisted artifact from the working directory. A missing file is
# only reported; the corresponding variable is then left as it was.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.

# Trip-duration model
if not os.path.exists("model_td.pkl"):
    print("TD model not found!")
else:
    with open("model_td.pkl", "rb") as fh:
        model_td = pickle.load(fh)

# Fare-amount model
if not os.path.exists("model_fa.pkl"):
    print("FA model not found!")
else:
    with open("model_fa.pkl", "rb") as fh:
        model_fa = pickle.load(fh)

# Feature scaler
if not os.path.exists("scaler.pkl"):
    print("Scaler model not found!")
else:
    with open("scaler.pkl", "rb") as fh:
        scaler = pickle.load(fh)

# One-hot encoders
if not os.path.exists("encoders.pkl"):
    print("Encoders model not found!")
else:
    with open("encoders.pkl", "rb") as fh:
        encoders = pickle.load(fh)

Input

In [20]:
# Example trip used to exercise the saved models.
pickup_date = "2023/12/12"
pickup_time = "12:15:12"
# Kept as text here; it is converted with float() when the input frame is built.
trip_distance = "5.5"
# Date and time are joined with a single space into one timestamp string.
tpep_pickup_datetime = f"{pickup_date} {pickup_time}"

In [21]:
# Assemble a one-row frame from the raw inputs: the distance parsed as a
# float, the pickup timestamp parsed as a datetime.
data = {
    "trip_distance": [float(trip_distance)],
    "tpep_pickup_datetime": [pd.to_datetime(tpep_pickup_datetime)],
}
# Run the same feature step used for training, then drop the raw timestamp
# column so only model inputs remain.
df = preprocessing.add_features(pd.DataFrame(data)).drop(columns="tpep_pickup_datetime")

In [22]:
# Replace every encoded column with its one-hot columns, reusing the encoders
# loaded from disk. Each entry in `encoders` is an (encoder, column name) pair.
for encoder, col in encoders:
    # reshape(1, -1) produces exactly one row, so this works for the
    # single-trip frame built above.
    array = pd.DataFrame(df[col].values.reshape(1, -1), columns=[col])
    # .toarray() densifies the transform output (presumably a sparse matrix).
    encoded_df = pd.DataFrame(
        encoder.transform(array).toarray(),
        columns=encoder.get_feature_names_out([col]),
    )
    # Append the one-hot columns, then remove the original categorical column.
    df = pd.concat([df, encoded_df], axis=1, join="inner").drop(columns=col)

In [23]:
# Apply the scaler fitted during training to the inference row.
x = scaler.transform(df)

In [24]:
# Take the single prediction from each model: the fare is rounded to two
# decimals, the duration is truncated to a whole number by int().
y_fa = round(model_fa.predict(x)[0],2)
y_td = int(model_td.predict(x)[0])

In [25]:
# Report both predictions for the example trip.
print(f"Trip duration: {y_td} minutes")
print(f"Fare amount: $ {y_fa}")

Trip duration: 27 minutes
Fare amount: $ 20.09
