# NYC Taxi Fare & Duration
## Model Testing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget
import pickle
import glob

sys.path.append("..")

import source.configs as configs
import preprocessing as preprocessing

# Check if we are in COLAB
IN_COLAB = 'google.colab' in sys.modules

Download dataset files (2022)

In [3]:
if IN_COLAB:
    !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
else:
    for url in configs.TEST_DATASET_URLS:
        fname = url.split("/")[-1] 
        if not os.path.exists(f"../dataset/test/{fname}"):
            print(f"Downloading {fname}")
            wget.download(url, "../dataset/test/")
        else:
            print(f"{fname} already in disk. Skipping download.")

yellow_tripdata_2022-08.parquet already in disk. Skipping download.


In [4]:
files_list = glob.glob("../dataset/test/*.parquet")
dataset_list = []
for file in files_list:
    print(f"Reading {file}")
    df_month = pd.read_parquet(file)
    dataset_list.append(df_month)
print("Pandas concat for dataframes...")
dataset = pd.concat(dataset_list, axis=0, ignore_index=True)
print("Done!")

Reading ../dataset/test/yellow_tripdata_2022-08.parquet
Pandas concat for dataframes...
Done!


In [5]:
print("Add targets")
dataset = preprocessing.add_targets(dataset)

print("Process outliers")
dataset = preprocessing.process_outliers(dataset, "delete")

if os.path.exists("avg_speed_dict.model"):
    with open("avg_speed_dict.model", "rb") as avg_speed_dict_fd:
        avg_speed_dict = pickle.load(avg_speed_dict_fd)
        print("Avereage Speed dictionary", avg_speed_dict)
else:
    print("Error: Average speed dictionary not found!")

print("Add features")
dataset, _ = preprocessing.add_features(dataset, avg_speed_dict)

print("Fill na values")
dataset = preprocessing.fill_na_values(dataset)

print("Done!")

Add targets
Process outliers
Avereage Speed dictionary {0: 0.25195469322838193, 1: 0.2503693835995309, 2: 0.24999335352944238, 3: 0.26612218282651856, 4: 0.31345897957385943, 5: 0.3355459231509819, 6: 0.27443853508122146, 7: 0.21423070424047383, 8: 0.18310836751240536, 9: 0.1801333123203435, 10: 0.17675564690190834, 11: 0.16859408041651547, 12: 0.16663810140450216, 13: 0.16766048469717576, 14: 0.16282710598596348, 15: 0.15792099847450106, 16: 0.16202178873469653, 17: 0.16106102095237498, 18: 0.16875892761499706, 19: 0.18793062365816957, 20: 0.207260086062618, 21: 0.21805265270227106, 22: 0.22578244300596678, 23: 0.24366564777182562}
Add features
Using pre-processed average speed dictionary
Fill na values
Done!


In [6]:
columns = preprocessing.features + preprocessing.targets
test_dataset = preprocessing.select_features(dataset, columns)
test_dataset, encoders = preprocessing.create_one_hot_encodings(test_dataset, columns)

In [7]:
test_dataset.columns.to_list()

['trip_distance',
 'hour_of_day',
 'rush_hour',
 'day_of_week',
 'trip_d2',
 'avg_speed',
 'fare_amount',
 'trip_duration']

In [8]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.2519547,14.25034,15.2732
std,4.341817,5.750488,0.4670498,1.940597,80.0417,1.498801e-15,11.96088,11.62466
min,0.1,0.0,0.0,0.0,0.01,0.2519547,2.5,0.25
25%,1.17,11.0,0.0,1.0,1.3689,0.2519547,7.0,7.466667
50%,1.94,15.0,0.0,3.0,3.7636,0.2519547,10.0,12.01667
75%,3.69,19.0,1.0,4.0,13.6161,0.2519547,16.0,19.16667
max,25.0,23.0,1.0,6.0,625.0,0.2519547,75.0,89.96667


## Model Testing

In [9]:
from sklearn.metrics import mean_squared_error, r2_score

model_prefix = "lgbm"
model_name_td = f"{model_prefix}_model_td.model"
model_name_fa = f"{model_prefix}_model_fa.model"

print(model_name_td)
print(model_name_fa)

if model_prefix == "lgbm":
    import lightgbm as lgb
    model_td = lgb.Booster(model_file=model_name_td)
    model_fa = lgb.Booster(model_file=model_name_fa)
else:
    if os.path.exists(model_name_td):
        with open(model_name_td, "rb") as model_td_fd:
            model_td = pickle.load(model_td_fd)
    else:
        print("TD model not found!")

    if os.path.exists(model_name_fa):
        with open(model_name_fa, "rb") as model_fa_fd:
            model_fa = pickle.load(model_fa_fd)
    else:
        print("FA model not found!")

if os.path.exists("encoders.model"):
    with open("encoders.model", "rb") as encoders_file:
        encoders = pickle.load(encoders_file)
else:
    print("Encoders model not found!")

if os.path.exists("avg_speed_dict.model"):
    with open("avg_speed_dict.model", "rb") as avg_speed_dict_fd:
        avg_speed_dict = pickle.load(avg_speed_dict_fd)
else:
    print("Average speed dictionary not found!")

lgbm_model_td.model
lgbm_model_fa.model


Split dataset

In [10]:
# Split features and targets
X, y = preprocessing.split_dataset(test_dataset)
y_test_td = y["trip_duration"]
y_test_fa = y["fare_amount"]

In [11]:
print("Predict fare amount on test dataset")
test_predictions_fa = model_fa.predict(X)

print("Predict trip duration on test dataset")
test_predictions_td = model_td.predict(X)

rmse_td = mean_squared_error(y_test_td, test_predictions_td)**0.5
r2_td = r2_score(y_test_td, test_predictions_td)
print(f"Trip duration RMSE: {rmse_td} minutes")
print(f"Trip duration R2: {r2_td}")

rmse_fa = mean_squared_error(y_test_fa, test_predictions_fa)**0.5
r2_fa = r2_score(y_test_fa, test_predictions_fa)
print(f"Fare amount RMSE: {rmse_fa} $")
print(f"Fare amount R2: {r2_fa}")


Predict fare amount on test dataset
Predict trip duration on test dataset
Trip duration RMSE: 5.237583267460371 minutes
Trip duration R2: 0.7969975831792483
Fare amount RMSE: 2.8583603141702234 $
Fare amount R2: 0.9428905302315065


In [12]:
test_dataset.loc[:,"pred_fa"] = test_predictions_fa
test_dataset.loc[:,"pred_td"] = test_predictions_td

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset.loc[:,"pred_fa"] = test_predictions_fa
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset.loc[:,"pred_td"] = test_predictions_td


In [13]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration,pred_fa,pred_td
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.2519547,14.25034,15.2732,14.61828,16.33512
std,4.341817,5.750488,0.4670498,1.940597,80.0417,1.498801e-15,11.96088,11.62466,11.6221,11.52929
min,0.1,0.0,0.0,0.0,0.01,0.2519547,2.5,0.25,4.370062,2.69825
25%,1.17,11.0,0.0,1.0,1.3689,0.2519547,7.0,7.466667,7.467262,8.521201
50%,1.94,15.0,0.0,3.0,3.7636,0.2519547,10.0,12.01667,10.40263,13.03153
75%,3.69,19.0,1.0,4.0,13.6161,0.2519547,16.0,19.16667,16.28002,20.65751
max,25.0,23.0,1.0,6.0,625.0,0.2519547,75.0,89.96667,58.53652,74.32955


In [14]:
test_dataset["fa_diff"] = abs(test_dataset["pred_fa"]-test_dataset["fare_amount"])
test_dataset["td_diff"] = abs(test_dataset["pred_td"]-test_dataset["trip_duration"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["fa_diff"] = abs(test_dataset["pred_fa"]-test_dataset["fare_amount"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["td_diff"] = abs(test_dataset["pred_td"]-test_dataset["trip_duration"])


In [15]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration,pred_fa,pred_td,fa_diff,td_diff
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.2519547,14.25034,15.2732,14.61828,16.33512,1.509064,3.54836
std,4.341817,5.750488,0.4670498,1.940597,80.0417,1.498801e-15,11.96088,11.62466,11.6221,11.52929,2.42754,3.852457
min,0.1,0.0,0.0,0.0,0.01,0.2519547,2.5,0.25,4.370062,2.69825,1.811566e-06,2.863479e-06
25%,1.17,11.0,0.0,1.0,1.3689,0.2519547,7.0,7.466667,7.467262,8.521201,0.4898242,1.116568
50%,1.94,15.0,0.0,3.0,3.7636,0.2519547,10.0,12.01667,10.40263,13.03153,1.012888,2.40671
75%,3.69,19.0,1.0,4.0,13.6161,0.2519547,16.0,19.16667,16.28002,20.65751,1.808888,4.559814
max,25.0,23.0,1.0,6.0,625.0,0.2519547,75.0,89.96667,58.53652,74.32955,70.50556,82.04779


In [16]:
100 * (len(test_dataset[test_dataset["fa_diff"]<2.5 & ]) / len(test_dataset))

85.62240561273914

In [17]:
100 * (len(test_dataset[test_dataset["td_diff"]<5]) / len(test_dataset))

78.11333341577662

In [20]:
100 * (len(test_dataset[(test_dataset["fa_diff"]<2.5) & (test_dataset["td_diff"]<5)]) / len(test_dataset))

76.47212798496234