In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
jan2023_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [3]:
jan2023_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
print(f'The number of columns is {len(jan2023_df.columns)}')

The number of columns is 19


In [5]:
jan2023_df['duration'] = (jan2023_df['tpep_dropoff_datetime'] - jan2023_df['tpep_pickup_datetime']).dt.total_seconds()/60

print(f'The SD of Duration for Jan 2023 is {jan2023_df['duration'].std()}')

The SD of Duration for Jan 2023 is 42.59435124195458


In [6]:
n_og_rows = len(jan2023_df)
non_outliers = jan2023_df[jan2023_df['duration'].between(1,60)]

print(f"Percent Remaining = {len(non_outliers)/n_og_rows}")

Percent Remaining = 0.9812202822125979


# Training

In [7]:
# cast feature cols to string
x_cols = ['PULocationID','DOLocationID']
jan2023_df[x_cols] = jan2023_df[x_cols].astype(str)

In [8]:
X = jan2023_df[x_cols]
y = jan2023_df['duration']
assert len(X) == len(y), f'X and y have different number of elements'

In [9]:
del jan2023_df
gc.collect()

17

In [10]:
# one hot encoding - create dict where keys are location ids and values are indices
ohe_pu_dict, ohe_do_dict = {}, {}
# not certain that pickup and dropoff have the same amount of locations so have to create dict separately
unique_pu_ids = X[x_cols[0]].unique()
for i in range(len(unique_pu_ids)):
    ohe_pu_dict[unique_pu_ids[i]] = i
unique_do_ids = X[x_cols[1]].unique()
for i in range(len(unique_do_ids)):
    ohe_do_dict[unique_do_ids[i]] = i

In [11]:
n_total_samples = len(X)
n_train = int(0.15*n_total_samples)

rnd_idx = np.arange(n_total_samples)
np.random.shuffle(rnd_idx)
rnd_idx = rnd_idx[:n_train]

X, y = X.to_numpy()[rnd_idx, :], y.to_numpy()[rnd_idx]

In [12]:
X_ohe = np.zeros((n_train, len(unique_pu_ids)+len(unique_do_ids)), dtype=np.int8)
for i in range(n_train):
    pu, do = X[i, :]
    puid, doid = ohe_pu_dict[pu], ohe_do_dict[do]
    X_ohe[i, [puid, doid]] = 1
X_ohe = np.delete(X_ohe, [len(unique_pu_ids), 0], axis=1)

In [13]:
del X, unique_pu_ids, unique_do_ids
gc.collect()

0

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error #true, pred

In [15]:
lr_model = LinearRegression().fit(X_ohe, y)

In [18]:
preds = lr_model.predict(X_ohe)
print(f"Train rmse is {root_mean_squared_error(y, preds)}")

Train rmse is 43.286348814235424
