# NYC Taxi Fare & Duration
## Model Testing

In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget
import pickle
import glob

sys.path.append("..")

import source.configs as configs
import preprocessing as preprocessing

# Check if we are in COLAB
IN_COLAB = 'google.colab' in sys.modules

Download dataset files (2022)

In [29]:
if IN_COLAB:
    !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
else:
    for url in configs.TEST_DATASET_URLS:
        fname = url.split("/")[-1] 
        if not os.path.exists(f"../dataset/test/{fname}"):
            print(f"Downloading {fname}")
            wget.download(url, "../dataset/test/")
        else:
            print(f"{fname} already in disk. Skipping download.")

yellow_tripdata_2022-08.parquet already in disk. Skipping download.


In [30]:
files_list = glob.glob("../dataset/test/*.parquet")
dataset_list = []
for file in files_list:
    print(f"Reading {file}")
    df_month = pd.read_parquet(file)
    dataset_list.append(df_month)
print("Pandas concat for dataframes...")
dataset = pd.concat(dataset_list, axis=0, ignore_index=True)
print("Done!")

Reading ../dataset/test/yellow_tripdata_2022-08.parquet


Pandas concat for dataframes...
Done!


In [31]:
print("Add targets")
dataset = preprocessing.add_targets(dataset)

print("Process outliers")
dataset = preprocessing.process_outliers(dataset, "delete")

if os.path.exists("avg_speed_dict.model"):
    with open("avg_speed_dict.model", "rb") as avg_speed_dict_fd:
        avg_speed_dict = pickle.load(avg_speed_dict_fd)
        print("Avereage Speed dictionary", avg_speed_dict)
else:
    print("Error: Average speed dictionary not found!")

print("Add features")
dataset, _ = preprocessing.add_features(dataset, avg_speed_dict)

print("Fill na values")
dataset = preprocessing.fill_na_values(dataset)

print("Done!")

Add targets


Process outliers
Avereage Speed dictionary {0: 0.25195469322838193, 1: 0.2503693835995309, 2: 0.24999335352944238, 3: 0.26612218282651856, 4: 0.31345897957385943, 5: 0.3355459231509819, 6: 0.27443853508122146, 7: 0.21423070424047383, 8: 0.18310836751240536, 9: 0.1801333123203435, 10: 0.17675564690190834, 11: 0.16859408041651547, 12: 0.16663810140450216, 13: 0.16766048469717576, 14: 0.16282710598596348, 15: 0.15792099847450106, 16: 0.16202178873469653, 17: 0.16106102095237498, 18: 0.16875892761499706, 19: 0.18793062365816957, 20: 0.207260086062618, 21: 0.21805265270227106, 22: 0.22578244300596678, 23: 0.24366564777182562}
Add features
Using pre-processed average speed dictionary
Fill na values
Done!


In [32]:
columns = preprocessing.features + preprocessing.targets
test_dataset = preprocessing.select_features(dataset, columns)
test_dataset, encoders = preprocessing.create_one_hot_encodings(test_dataset, columns)

In [33]:
test_dataset.columns.to_list()

['trip_distance',
 'hour_of_day',
 'rush_hour',
 'day_of_week',
 'trip_d2',
 'avg_speed',
 'fare_amount',
 'trip_duration']

In [34]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.1899496,14.25034,15.2732
std,4.341817,5.750488,0.4670498,1.940597,80.0417,0.0341938,11.96088,11.62466
min,0.1,0.0,0.0,0.0,0.01,0.157921,2.5,0.25
25%,1.17,11.0,0.0,1.0,1.3689,0.1628271,7.0,7.466667
50%,1.94,15.0,0.0,3.0,3.7636,0.1767556,10.0,12.01667
75%,3.69,19.0,1.0,4.0,13.6161,0.2142307,16.0,19.16667
max,25.0,23.0,1.0,6.0,625.0,0.3355459,75.0,89.96667


## Model Testing

In [35]:
from sklearn.metrics import mean_squared_error, r2_score

model_prefix = "lgbm"
model_name_td = f"{model_prefix}_model_td.model"
model_name_fa = f"{model_prefix}_model_fa.model"

print(model_name_td)
print(model_name_fa)

if model_prefix == "lgbm":
    import lightgbm as lgb
    model_td = lgb.Booster(model_file=model_name_td)
    model_fa = lgb.Booster(model_file=model_name_fa)
else:
    if os.path.exists(model_name_td):
        with open(model_name_td, "rb") as model_td_fd:
            model_td = pickle.load(model_td_fd)
    else:
        print("TD model not found!")

    if os.path.exists(model_name_fa):
        with open(model_name_fa, "rb") as model_fa_fd:
            model_fa = pickle.load(model_fa_fd)
    else:
        print("FA model not found!")

if os.path.exists("encoders.model"):
    with open("encoders.model", "rb") as encoders_file:
        encoders = pickle.load(encoders_file)
else:
    print("Encoders model not found!")

if os.path.exists("avg_speed_dict.model"):
    with open("avg_speed_dict.model", "rb") as avg_speed_dict_fd:
        avg_speed_dict = pickle.load(avg_speed_dict_fd)
else:
    print("Average speed dictionary not found!")

lgbm_model_td.model
lgbm_model_fa.model


Split dataset

In [36]:
# Split features and targets
X, y = preprocessing.split_dataset(test_dataset)
y_test_td = y["trip_duration"]
y_test_fa = y["fare_amount"]

In [37]:
print("Predict fare amount on test dataset")
test_predictions_fa = model_fa.predict(X)

print("Predict trip duration on test dataset")
test_predictions_td = model_td.predict(X)

rmse_td = mean_squared_error(y_test_td, test_predictions_td)**0.5
r2_td = r2_score(y_test_td, test_predictions_td)
print(f"Trip duration RMSE: {rmse_td} minutes")
print(f"Trip duration R2: {r2_td}")

rmse_fa = mean_squared_error(y_test_fa, test_predictions_fa)**0.5
r2_fa = r2_score(y_test_fa, test_predictions_fa)
print(f"Fare amount RMSE: {rmse_fa} $")
print(f"Fare amount R2: {r2_fa}")


Predict fare amount on test dataset
Predict trip duration on test dataset


In [None]:
test_dataset.loc[:,"pred_fa"] = test_predictions_fa
test_dataset.loc[:,"pred_td"] = test_predictions_td

In [None]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration,pred_fa,pred_td
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.1899496,14.25034,15.2732,14.60962,16.30623
std,4.341817,5.750488,0.4670498,1.940597,80.0417,0.0341938,11.96088,11.62466,11.58816,11.27654
min,0.1,0.0,0.0,0.0,0.01,0.157921,2.5,0.25,4.930147,3.121008
25%,1.17,11.0,0.0,1.0,1.3689,0.1628271,7.0,7.466667,7.456148,8.585933
50%,1.94,15.0,0.0,3.0,3.7636,0.1767556,10.0,12.01667,10.38591,12.96791
75%,3.69,19.0,1.0,4.0,13.6161,0.2142307,16.0,19.16667,16.21795,20.44598
max,25.0,23.0,1.0,6.0,625.0,0.3355459,75.0,89.96667,55.3819,66.81872


In [None]:
test_dataset["fa_diff"] = abs(test_dataset["pred_fa"]-test_dataset["fare_amount"])
test_dataset["td_diff"] = abs(test_dataset["pred_td"]-test_dataset["trip_duration"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["fa_diff"] = abs(test_dataset["pred_fa"]-test_dataset["fare_amount"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["td_diff"] = abs(test_dataset["pred_td"]-test_dataset["trip_duration"])


In [None]:
test_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,day_of_week,trip_d2,avg_speed,fare_amount,trip_duration,pred_fa,pred_td,fa_diff,td_diff
count,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0,2911092.0
mean,3.591742,14.22469,0.3214934,2.772824,31.75198,0.1899496,14.25034,15.2732,14.60962,16.30623,1.520637,3.568585
std,4.341817,5.750488,0.4670498,1.940597,80.0417,0.0341938,11.96088,11.62466,11.58816,11.27654,2.42686,3.776422
min,0.1,0.0,0.0,0.0,0.01,0.157921,2.5,0.25,4.930147,3.121008,3.358693e-05,3.129138e-06
25%,1.17,11.0,0.0,1.0,1.3689,0.1628271,7.0,7.466667,7.456148,8.585933,0.5152383,1.173806
50%,1.94,15.0,0.0,3.0,3.7636,0.1767556,10.0,12.01667,10.38591,12.96791,1.03791,2.47031
75%,3.69,19.0,1.0,4.0,13.6161,0.2142307,16.0,19.16667,16.21795,20.44598,1.814703,4.576941
max,25.0,23.0,1.0,6.0,625.0,0.3355459,75.0,89.96667,55.3819,66.81872,70.06412,82.16819


In [None]:
100 * (len(test_dataset[test_dataset["fa_diff"]<2.5]) / len(test_dataset))

85.60990171385858

In [None]:
100 * (len(test_dataset[test_dataset["td_diff"]<5]) / len(test_dataset))

77.982866910424

In [None]:
100 * (len(test_dataset[(test_dataset["fa_diff"]<2.5) & (test_dataset["td_diff"]<5)]) / len(test_dataset))

76.2821649058154