# NYC Taxi Fare & Duration
## Model Training

In [1]:
%load_ext autoreload
%autoreload 2

In [71]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wget
import pickle

sys.path.append("..")

import source.configs as configs
import preprocessing as preprocessing

# Check if we are in COLAB
IN_COLAB = 'google.colab' in sys.modules

In [72]:
if not os.path.exists("../dataset/yellow_tripdata_2022-05.parquet"):
    if IN_COLAB:
        !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-05.parquet "../dataset"
    else:
        wget.download(configs.START_DATASET_URL, "../dataset")

In [73]:
dataset = pd.read_parquet("../dataset/yellow_tripdata_2022-05.parquet")

In [74]:
dataset = preprocessing.add_features(dataset, "train")
dataset = preprocessing.fill_na_values(dataset)
dataset = preprocessing.process_outliers(dataset, "delete")

### Features
* trip_distance
* hour_of_day: Extracted from tpep_pickup_datetime
* hour_zone: Extracted from tpep_pickup_datetime. Categorical:[Morning, Noon, Afternoon, Evening, Night]
* rush_hour: Extracted from tpep_pickup_datetime. Binary.
### Targets
* fare_amount
* trip_duration

In [75]:
features = preprocessing.features
targets = preprocessing.targets
columns = features + targets
train_dataset = preprocessing.select_features(dataset, columns)
train_dataset, encoders = preprocessing.create_one_hot_encodings(train_dataset, columns)
pickle.dump(encoders, open('encoders.pkl', 'wb'))

In [76]:
train_dataset.columns.to_list()

['trip_distance',
 'hour_of_day',
 'rush_hour',
 'fare_amount',
 'trip_duration',
 'hour_zone_afternoon',
 'hour_zone_evening',
 'hour_zone_morning',
 'hour_zone_night',
 'hour_zone_noon']

In [77]:
for col in train_dataset.isna().sum():
    assert col==0
print("Ok, no NAs in train dataset.")

Ok, no NAs in train dataset.


In [11]:
train_dataset.describe()

Unnamed: 0,trip_distance,hour_of_day,rush_hour,fare_amount,trip_duration,hour_zone_afternoon,hour_zone_evening,hour_zone_morning,hour_zone_night,hour_zone_noon
count,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0,3275421.0
mean,3.243137,14.16682,0.3170713,13.69555,15.33393,0.3019203,0.2459235,0.2269525,0.1685594,0.05664432
std,3.744687,5.748614,0.4653355,10.54161,10.91489,0.4590909,0.4306335,0.4188617,0.3743624,0.2311618
min,0.0,0.0,0.0,2.5,1.1,0.0,0.0,0.0,0.0,0.0
25%,1.14,10.0,0.0,7.0,7.666667,0.0,0.0,0.0,0.0,0.0
50%,1.9,15.0,0.0,10.0,12.41667,0.0,0.0,0.0,0.0,0.0
75%,3.43,19.0,1.0,15.5,19.68333,1.0,0.0,0.0,0.0,0.0
max,20.3,23.0,1.0,55.5,65.91667,1.0,1.0,1.0,1.0,1.0


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# Split features and targets
X, y = preprocessing.split_dataset(train_dataset)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
y_train_td = y_train["trip_duration"]
model_td = RandomForestRegressor(n_jobs=-1, n_estimators=50)
model_td.fit(X_train_scaled, y_train_td.values.ravel())
predictions = model_td.predict(X_test_scaled)

In [17]:
y_test_td = y_test["trip_duration"]
mse_td = mean_squared_error(y_test_td, predictions)
r2_td = r2_score(y_test_td, predictions)
print(f"MSE: {mse_td}")
print(f"re: {r2_td}")

MSE: 30.97501162289074
re: 0.7403601835995524


In [18]:
y_train_fa = y_train["fare_amount"]
model_fa = RandomForestRegressor(n_jobs=-1, n_estimators=50)
model_fa.fit(X_train_scaled, y_train_fa.values.ravel())
predictions = model_fa.predict(X_test_scaled)

In [19]:
y_test_fa = y_test["fare_amount"]
mse_fa = mean_squared_error(y_test_fa, predictions)
r2_fa = r2_score(y_test_fa, predictions)
print(f"MSE: {mse_fa}")
print(f"re: {r2_fa}")

MSE: 7.328737943058416
re: 0.9341444377816666


In [20]:
pickle.dump(model_fa, open('model_fa.pkl', 'wb'))
pickle.dump(model_td, open('model_td.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

In [78]:
if os.path.exists("model_td.pkl"):
    with open("model_td.pkl", "rb") as model_td_file:
        model_td = pickle.load(model_td_file)
else:
    print("TD model not found!")

if os.path.exists("model_fa.pkl"):
    with open("model_fa.pkl", "rb") as model_fa_file:
        model_fa = pickle.load(model_fa_file)
else:
    print("FA model not found!")

if os.path.exists("scaler.pkl"):
    with open("scaler.pkl", "rb") as scaler_file:
        scaler = pickle.load(scaler_file)
else:
    print("Scaler model not found!")

if os.path.exists("encoders.pkl"):
    with open("encoders.pkl", "rb") as encoders_file:
        encoders = pickle.load(encoders_file)
else:
    print("Encoders model not found!")

In [98]:
pickup_date="2023/12/12"
pickup_time="12:15:12"
tpep_pickup_datetime = pickup_date + " " + pickup_time
trip_distance = "5.5"

In [106]:
data = {
    "trip_distance": [float(trip_distance)], 
    "tpep_pickup_datetime": [pd.to_datetime(tpep_pickup_datetime)] }
df = pd.DataFrame(data)
df = preprocessing.add_features(df, "inference")

In [107]:
df.columns.to_list()

['trip_distance',
 'tpep_pickup_datetime',
 'hour_of_day',
 'hour_zone',
 'rush_hour']

In [117]:
for encoder_model in encoders:
    encoder, col = encoder_model
    array = np.array(df[col]).reshape(1,-1)
    encoded_data = encoder.transform(array)
    print(encoded_data)
    # HERE
    encoded_df = pd.DataFrame(
        encoded_data, columns=encoder.get_feature_names_out([col])
    )
    df = pd.concat([df, encoded_df], axis=1, join="inner")

  (0, 4)	1.0




AttributeError: 'csr_matrix' object has no attribute 'to_array'

In [90]:
df = preprocessing.select_features(df, preprocessing.features)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


AttributeError: 'numpy.ndarray' object has no attribute 'toarray'