In [3]:
import pandas as pd
import sklearn
import pickle
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [14]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and derive the modelling columns.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file. The file must contain the
        columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID``, ``DOLocationID``, ``congestion_surcharge``,
        ``fare_amount``, ``tip_amount`` and ``total_amount``.

    Returns
    -------
    pandas.DataFrame
        The input rows whose trip duration is between 1 and 60 minutes
        (inclusive) and whose four money columns are all numeric, with these
        columns added or converted:

        * ``duration``     - trip length in minutes (float)
        * ``PULocationID`` / ``DOLocationID`` - cast to ``str``
        * ``day_of_week`` / ``hour_of_day``   - pickup weekday / hour as ``str``
        * money columns    - cast to ``float``
        * ``PU_DO``        - ``"<PULocationID>_<DOLocationID>"``

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV carries timestamps as text, so they have to be parsed explicitly.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the failure would
        # surface later as a confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised timedelta -> minutes (no per-row Python lambda).
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Explicit copy: the columns assigned below must land on an independent
    # frame, not on a filtered view of the one that was just read.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Pickup weekday and hour, stored as strings so they act as categories.
    pickup = df['tpep_pickup_datetime'].dt
    df['day_of_week'] = pickup.dayofweek.astype(str)
    df['hour_of_day'] = pickup.hour.astype(str)

    # Coerce the money columns to numbers; anything unparseable becomes NaN
    # and the row is dropped before the final cast to float.
    money_fields = ['congestion_surcharge', 'fare_amount', 'tip_amount', 'total_amount']
    for field in money_fields:
        df[field] = pd.to_numeric(df[field], errors='coerce')
    df = df.dropna(subset=money_fields)
    df = df.astype({field: float for field in money_fields})

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [15]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
TRIPS_FILE = '/home/roman/python/mlops/mlops-zoomcamp/yellow_tripdata_2025-01.parquet'

# Load and preprocess the trips, then preview the first rows.
df = read_dataframe(TRIPS_FILE)
df.head()




Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,duration,day_of_week,hour_of_day,PU_DO
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,...,0.0,1.0,18.0,2.5,0.0,0.0,8.35,2,0,229_237
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,...,0.0,1.0,12.12,2.5,0.0,0.0,2.55,2,0,236_237
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,...,0.0,1.0,12.1,2.5,0.0,0.0,1.95,2,0,141_141
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,...,0.0,1.0,9.7,0.0,0.0,0.0,5.566667,2,0,244_244
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,...,0.0,1.0,8.3,0.0,0.0,0.0,3.533333,2,0,244_116


In [16]:
# Shuffle once with a fixed seed, then cut 80% / 10% / 10% into
# train / validation / test. Positional slicing replaces np.split, which
# goes through the deprecated DataFrame.swapaxes and emits a FutureWarning.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled))
val_end = int(.9 * len(shuffled))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

  return bound(*args, **kwds)


In [17]:
# Row counts of the training and validation splits.
df_train.shape[0], df_val.shape[0]

(2292953, 286619)

In [20]:
# Columns fed to the vectorizer: string-valued categories and numeric fields.
categorical = ['PU_DO', 'day_of_week', 'hour_of_day']
numerical = ['trip_distance', 'fare_amount', 'tip_amount', 'total_amount']
feature_columns = categorical + numerical

# One dict per row for each split.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [21]:
# Regression target: the `duration` column of each split as a NumPy array.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [22]:
# Fit an ordinary least-squares model on the training matrix.
lr = LinearRegression().fit(X_train, y_train)

# Score it on the validation split; the cell displays the RMSE.
y_pred = lr.predict(X_val)
root_mean_squared_error(y_val, y_pred)

5.1813509870013625

In [23]:
# Persist the fitted vectorizer together with the model as one (dv, lr) tuple.
with open('lin_reg.bin', mode='wb') as model_file:
    pickle.dump((dv, lr), model_file)

In [15]:
# Same experiment with an L1-regularised model (alpha = 0.01).
# Note: this rebinds `lr` and `y_pred`, so any cell that runs afterwards and
# reads `lr` gets the Lasso model, not the LinearRegression one.
lr = Lasso(alpha=0.01).fit(X_train, y_train)

# Score it on the validation split; the cell displays the RMSE.
y_pred = lr.predict(X_val)
root_mean_squared_error(y_val, y_pred)

9.218025148606207