In [14]:
import pandas as pd

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import pickle

pd.__version__

'2.2.2'

In [3]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet').head(100000)

In [4]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2.0,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186.0,79.0,2.0,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1.0,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140.0,236.0,1.0,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1.0,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236.0,79.0,1.0,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1.0,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79.0,211.0,1.0,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1.0,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211.0,148.0,1.0,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [5]:
df['duration'] = pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])
df['duration'] = df['duration'].apply(lambda x: x.total_seconds() / 60)
df.shape

(100000, 20)

In [6]:
df = df[(df.duration >= 1) & (df.duration <= 60)]

In [7]:
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

In [8]:
df[categorical] = df[categorical].astype(str)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2.0,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186.0,79.0,2.0,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0,19.8
1,1.0,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140.0,236.0,1.0,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0,6.6
2,1.0,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236.0,79.0,1.0,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0,17.916667
3,1.0,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79.0,211.0,1.0,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0,8.3
4,1.0,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211.0,148.0,1.0,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0,6.1


In [9]:
X = []
for feat in categorical:
    tmp = pd.get_dummies(df['PULocationID']).astype('Int8')
    tmp.columns = [f'{feat}_{x}' for x in tmp.columns]
    if len(X) > 0:
        X = pd.concat((X, tmp), axis=1)
    else:
        X = tmp

X = pd.concat((X, df[numerical]), axis=1)
print(X.shape)
X.head()


(97306, 445)


Unnamed: 0,PULocationID_1.0,PULocationID_10.0,PULocationID_100.0,PULocationID_106.0,PULocationID_107.0,PULocationID_108.0,PULocationID_11.0,PULocationID_112.0,PULocationID_113.0,PULocationID_114.0,...,DOLocationID_90.0,DOLocationID_91.0,DOLocationID_92.0,DOLocationID_93.0,DOLocationID_94.0,DOLocationID_95.0,DOLocationID_96.0,DOLocationID_97.0,DOLocationID_98.0,trip_distance
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.72
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.8
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.7
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.8


In [10]:
target = 'duration'
y = df[target]
y.shape

(97306,)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# model = RandomForestRegressor(
#     n_estimators=20,
#     criterion='absolute_error',
#     min_samples_leaf=5
# )
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"""
    R2_score: {r2_score(y_test, y_pred)}\n
    MAE: {mean_absolute_error(y_test, y_pred)}\n
    MAPE: {mean_absolute_percentage_error(y_test, y_pred)}
""")


    R2_score: 0.6265639321560232

    MAE: 4.268248119601669

    MAPE: 0.33010469396987585



In [16]:
with open('./models/tree_regressor.pkl', 'wb') as f:
    pickle.dump(model, f)