In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_selector
from sklearn.model_selection import cross_validate, PredefinedSplit
from sklearn.metrics import make_scorer

from scipy import sparse

import pickle

import mlflow

## Setting up the MLflow tracking experiment

In [2]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/Users/HZB/code/vickoru/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1718035375041, experiment_id='1', last_update_time=1718035375041, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## Manual preprocessing and modeling

In [2]:
raw_data = pd.read_parquet('../raw_data/yellow_tripdata_2023-01.parquet')

In [3]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [4]:
deltime = datetime.timedelta(minutes=1)

In [5]:
deltime

datetime.timedelta(seconds=60)

In [6]:
(raw_data.tpep_dropoff_datetime[0] - raw_data.tpep_pickup_datetime[0])

Timedelta('0 days 00:08:26')

In [7]:
(raw_data.tpep_dropoff_datetime[0] - raw_data.tpep_pickup_datetime[0])/deltime

8.433333333333334

In [8]:
data = raw_data.copy()
data.loc[:, 'duration_minutes'] = (
    raw_data.tpep_dropoff_datetime - raw_data.tpep_pickup_datetime)/deltime

In [9]:
data.duration_minutes.std()

42.59435124195458

In [10]:
data_filtered = data[np.logical_and(
    data.duration_minutes >= 1, data.duration_minutes <= 60)].copy()

In [11]:
data_filtered.shape[0] / data.shape[0]

0.9812202822125979

In [12]:
ohe = OneHotEncoder(sparse_output=True)

In [13]:
ohe.fit(data_filtered[['PULocationID']])

In [14]:
pickup_loc = ohe.transform(data_filtered[['PULocationID']])

In [15]:
pickup_loc.shape

(3009173, 255)

In [16]:
ohe_do = OneHotEncoder(sparse_output=True)

In [17]:
ohe_do.fit(data_filtered[['DOLocationID']])

In [18]:
dropoff_loc = ohe_do.transform(data_filtered[['DOLocationID']])
dropoff_loc.shape

(3009173, 260)

In [19]:
X = sparse.hstack([pickup_loc, dropoff_loc])

In [20]:
X.shape

(3009173, 515)

In [21]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3009173 entries, 0 to 3066765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee            floa

In [22]:
y = data_filtered.duration_minutes

In [23]:
simple_lin = LinearRegression()

In [24]:
simple_lin.fit(X, y)

In [25]:
y_pred = simple_lin.predict(X)

In [26]:
rmse = np.sqrt(np.mean((y - y_pred)**2))

In [27]:
rmse

7.649261948684693

## Pipelines and functions for preprocessing and modeling

In [3]:
def build_y_target(pickup, dropoff, minutes=1):
    """Return trip durations expressed in units of ``minutes`` minutes.

    Parameters
    ----------
    pickup, dropoff : datetime-like scalar or pandas Series
        Trip start and end timestamps; ``dropoff - pickup`` must yield a
        timedelta (or a Series of timedeltas).
    minutes : int or float, default 1
        Length of one duration unit, in minutes. With the default the result
        is the duration in minutes.

    Returns
    -------
    float or pandas Series of float
        ``(dropoff - pickup)`` divided by a ``timedelta`` of ``minutes`` minutes.
    """
    deltime = datetime.timedelta(minutes=minutes)
    # timedelta / timedelta is a plain float ratio. The earlier version
    # prefixed this with `1./deltime *`, which raises TypeError because a
    # float cannot be divided by a timedelta.
    duration_mins = (dropoff - pickup) / deltime

    return duration_mins

def filtering_times(data, min_minutes=1, max_minutes=60):
    """Keep only rows whose ``duration_minutes`` lies in a closed interval.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``duration_minutes`` column.
    min_minutes, max_minutes : int or float, default 1 and 60
        Inclusive lower and upper bounds on ``duration_minutes``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows inside the interval; ``data`` itself
        is not modified. Rows with a missing duration are dropped.
    """
    # `between` is inclusive on both ends, matching `>= min` and `<= max`.
    in_range = data.duration_minutes.between(min_minutes, max_minutes)

    return data[in_range].copy()

def process_df(raw_data, minutes=1):
    """Build the modelling frame (locations + trip duration) from raw trip data.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw trips with ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns. It is not modified.
    minutes : int or float, default 1
        Length of one duration unit, in minutes. ``duration_minutes`` is the
        trip duration divided by this unit, so it is only in actual minutes
        when ``minutes=1``; the row filter is applied to the scaled value.

    Returns
    -------
    pandas.DataFrame
        Columns ``PULocationID``, ``DOLocationID``, ``duration_minutes`` for
        the rows kept by ``filtering_times``.
    """
    deltime = datetime.timedelta(minutes=minutes)
    duration = (
        raw_data.tpep_dropoff_datetime - raw_data.tpep_pickup_datetime) / deltime

    # Only the two location columns are carried forward, so select them before
    # attaching the target instead of copying every raw column first.
    trips = raw_data[['PULocationID', 'DOLocationID']].assign(
        duration_minutes=duration)

    return filtering_times(trips)
    
    
def calc_rmse(y, y_pred):
    """Root-mean-squared error between targets ``y`` and predictions ``y_pred``."""
    squared_errors = np.square(y - y_pred)
    return np.sqrt(np.mean(squared_errors))


In [4]:
raw_data = pd.read_parquet('../raw_data/yellow_tripdata_2023-01.parquet')

In [5]:
raw_data = pd.read_parquet('../raw_data/yellow_tripdata_2023-01.parquet')

In [6]:
data_processed = process_df(raw_data, minutes=1) 

In [7]:
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3009173 entries, 0 to 3066765
Data columns (total 3 columns):
 #   Column            Dtype  
---  ------            -----  
 0   PULocationID      int64  
 1   DOLocationID      int64  
 2   duration_minutes  float64
dtypes: float64(1), int64(2)
memory usage: 91.8 MB


In [8]:
data_processed.duration_minutes.std()

9.939385620151036

In [9]:
data_processed.shape[0] / raw_data.shape[0]

0.9812202822125979

In [10]:
X = data_processed.drop(columns='duration_minutes')
y = data_processed.duration_minutes

In [11]:
raw_data_val = pd.read_parquet('../raw_data/yellow_tripdata_2024-02.parquet')

In [12]:
data_val = process_df(raw_data_val, minutes=1)

In [13]:
data_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2938060 entries, 0 to 3007525
Data columns (total 3 columns):
 #   Column            Dtype  
---  ------            -----  
 0   PULocationID      int32  
 1   DOLocationID      int32  
 2   duration_minutes  float64
dtypes: float64(1), int32(2)
memory usage: 67.2 MB


In [14]:
X_val = data_val.drop(columns='duration_minutes')
y_val = data_val.duration_minutes

### Simple linear regression

In [38]:
model_ = LinearRegression()
pipeline = make_pipeline(OneHotEncoder(sparse_output=True, handle_unknown='ignore'), 
                             model_)
pipeline

In [39]:
simple_lin = pipeline.fit(X, y)

In [40]:
y_pred = simple_lin.predict(X)

In [41]:
rmse = np.sqrt(np.mean((y - y_pred)**2))
rmse

7.649261948684693

#### RMSE on validation data 

In [42]:
y_pred_val = simple_lin.predict(X_val)

In [44]:
rmse_val = calc_rmse(y_val, y_pred_val)
rmse_val

8.17406205918777

### Ridge

In [45]:
estimator = Ridge(alpha=0.01)
pipeline = make_pipeline(OneHotEncoder(sparse_output=True, handle_unknown='ignore'), 
                             estimator)
pipeline

In [46]:
# RMSE is a loss (lower is better), so the scorer is built with
# greater_is_better=False; scikit-learn then negates the metric so that
# "higher is better" holds for any model-selection utility reusing it.
score_rmse = make_scorer(calc_rmse, greater_is_better=False)

cv = cross_validate(pipeline, X, y, cv=5, scoring=score_rmse)
# Undo the scorer's sign flip so 'test_score' holds plain, positive RMSE values.
cv['test_score'] = -cv['test_score']

In [47]:
cv['test_score']

array([7.62414851, 7.51872833, 7.72708651, 7.58238999, 7.87087418])

In [48]:
model = pipeline.fit(X, y)

In [49]:
y_pred = model.predict(X)

In [50]:
rmse = np.sqrt(np.mean((y - y_pred)**2))
rmse

7.649274275150644

#### RMSE on validation data 

In [51]:
y_pred_val = model.predict(X_val)

In [52]:
y_pred_val

array([12.27538813, 20.33953777, 37.41663051, ..., 11.79123202,
       10.60189672, 11.23474214])

In [53]:
rmse_val = calc_rmse(y_val, y_pred_val)
rmse_val

8.174095480198279

### Lasso

In [54]:
estimator = Lasso(alpha=0.01)
pipeline = make_pipeline(OneHotEncoder(sparse_output=True, handle_unknown='ignore'), 
                             estimator)
pipeline

In [55]:
# RMSE is a loss (lower is better), so the scorer is built with
# greater_is_better=False; scikit-learn then negates the metric so that
# "higher is better" holds for any model-selection utility reusing it.
score_rmse = make_scorer(calc_rmse, greater_is_better=False)

cv = cross_validate(pipeline, X, y, cv=5, scoring=score_rmse)
# Undo the scorer's sign flip so 'test_score' holds plain, positive RMSE values.
cv['test_score'] = -cv['test_score']

In [56]:
cv['test_score']

array([7.97241533, 7.90590057, 8.09116294, 7.95941311, 8.1859279 ])

In [57]:
model = pipeline.fit(X, y)

In [58]:
y_pred = model.predict(X)

In [59]:
rmse = np.sqrt(np.mean((y - y_pred)**2))
rmse

8.015530016620962

#### RMSE on validation data 

In [60]:
y_pred_val = model.predict(X_val)

In [61]:
y_pred_val

array([11.89603718, 15.30238301, 34.42515092, ..., 12.57509556,
       11.45089177, 12.03498133])

In [63]:
rmse_val = calc_rmse(y_val, y_pred_val)
rmse_val

8.499002680819043

#### Pickle file of Lasso model

In [66]:
with open('../models/lasso_reg.bin', 'wb') as f_out:
    pickle.dump(model, f_out) 
    

## MLflow experiments

### Lasso

In [19]:
# Track one Lasso training run in MLflow: tags, data paths, alpha and the
# RMSE on the validation set. X, y, X_val and y_val come from earlier cells.
with mlflow.start_run():
    
    model_tag = 'lasso'
    
    mlflow.set_tag('developer', 'victor')
    mlflow.set_tag('model', model_tag)
    
    # The paths are only recorded as run parameters here; nothing is read from
    # them in this cell, so they must be kept in sync with the loading cells by hand.
    mlflow.log_param('train_data_path', '../raw_data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('val_data_path', '../raw_data/yellow_tripdata_2024-02.parquet')
    
    alpha = 0.5
    mlflow.log_param('alpha', alpha)
    
    estimator = Lasso(alpha=alpha)
    
    # One-hot encode the feature columns, then fit the regressor on the sparse
    # result. handle_unknown='ignore' lets transform accept category values
    # that were absent from the training data.
    pipeline = make_pipeline(OneHotEncoder(sparse_output=True, handle_unknown='ignore'), 
                             estimator)
    
    pipeline.fit(X, y)
    
    # Predictions are made on the validation features, so the logged 'rmse'
    # is a validation error, not a training error.
    y_pred = pipeline.predict(X_val)
    
    rmse = calc_rmse(y_val, y_pred) 
    mlflow.log_metric('rmse', rmse)
        

### Ridge

In [18]:
# Track one Ridge training run in MLflow: tags, data paths, alpha and the
# RMSE on the validation set. X, y, X_val and y_val come from earlier cells.
with mlflow.start_run():
    
    model_tag = 'ridge'

    mlflow.set_tag('developer', 'victor')
    mlflow.set_tag('model', model_tag)
    
    # The paths are only recorded as run parameters here; nothing is read from
    # them in this cell, so they must be kept in sync with the loading cells by hand.
    mlflow.log_param('train_data_path', '../raw_data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('val_data_path', '../raw_data/yellow_tripdata_2024-02.parquet')
    
    alpha = 0.1
    mlflow.log_param('alpha', alpha)
    
    estimator = Ridge(alpha=alpha)
    
    # One-hot encode the feature columns, then fit the regressor on the sparse
    # result. handle_unknown='ignore' lets transform accept category values
    # that were absent from the training data.
    pipeline = make_pipeline(OneHotEncoder(sparse_output=True, handle_unknown='ignore'), 
                             estimator)
    
    pipeline.fit(X, y)
    
    # Predictions are made on the validation features, so the logged 'rmse'
    # is a validation error, not a training error.
    y_pred = pipeline.predict(X_val)
    
    rmse = calc_rmse(y_val, y_pred) 
    mlflow.log_metric('rmse', rmse)