In [20]:
import pandas as pd
import pickle 

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error as rmse

In [2]:
import mlflow

# Point MLflow at a SQLite backend file and select the experiment that
# the runs in this notebook are recorded under.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "nyc-tracking-experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

2025/07/19 10:59:07 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/19 10:59:07 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/07/19 10:59:08 INFO mlflow.tracking.fluent: Experiment with name 'nyc-tracking-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/MLOps_NYC_Taxi_data/02-experiment tracking/mlruns/1', creation_time=1752922748049, experiment_id='1', last_update_time=1752922748049, lifecycle_stage='active', name='nyc-tracking-experiment', tags={}>

In [7]:
# Load the January 2023 trips used for training. The path is relative to
# the notebook's working directory (the same location that is logged to
# MLflow as "train_data_path") so the notebook is not tied to one machine's
# absolute workspace layout.
train_data_path = './data/green_tripdata_2023-01.parquet'
df = pd.read_parquet(train_data_path)
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,0.5,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,0.5,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0


In [8]:
# Keep only the columns used for modelling. `.copy()` makes `df` an
# independent frame, so adding a column below is not an assignment on a
# slice of the raw frame (the pattern behind SettingWithCopyWarning).
df = df[['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']].copy()

# Trip duration in minutes, computed with the vectorised `.dt` accessor
# instead of calling a Python lambda once per row.
df['duration'] = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,duration
0,2023-01-01 00:26:10,2023-01-01 00:37:11,166,143,2.58,11.016667
1,2023-01-01 00:51:03,2023-01-01 00:57:49,24,43,1.81,6.766667
2,2023-01-01 00:35:12,2023-01-01 00:41:32,223,179,0.0,6.333333
3,2023-01-01 00:13:14,2023-01-01 00:19:03,41,238,1.3,5.816667
4,2023-01-01 00:33:04,2023-01-01 00:39:02,41,74,1.1,5.966667


In [9]:
# Summary statistics for every column of `df`, including the derived
# `duration`; displayed as the cell's rich output.
df.describe()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,duration
count,68211,68211,68211.0,68211.0,68211.0,68211.0
mean,2023-01-16 20:10:55.679523,2023-01-16 20:29:01.515767,98.549735,138.429901,8.114852,18.097271
min,2009-01-01 20:21:27,2009-01-02 11:07:31,1.0,1.0,0.0,0.0
25%,2023-01-09 11:59:47.500000,2023-01-09 12:16:37.500000,74.0,74.0,1.11,7.216667
50%,2023-01-17 08:40:42,2023-01-17 08:56:38,75.0,138.0,1.85,11.4
75%,2023-01-24 15:52:30,2023-01-24 16:06:56,129.0,219.0,3.21,17.483333
max,2023-02-01 03:10:05,2023-02-01 17:27:05,265.0,265.0,120098.84,1439.883333
std,,,61.244314,76.761311,585.105955,74.925631


In [10]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes `new_df` an independent frame, so later column assignments
# on it do not raise SettingWithCopyWarning.
new_df = df[df['duration'].between(1, 60)].copy()
new_df.shape

(65946, 6)

In [11]:
# Share of the original rows (in percent) that survived the duration filter.
rows_kept = new_df.shape[0]
rows_total = df.shape[0]
percentage = rows_kept / rows_total * 100
print(percentage)

96.6794212077231


In [12]:
# Feature groups used when building the model inputs.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the location IDs to strings so they are treated as labels rather than
# numbers. `astype` with a column mapping returns a new frame, which avoids
# assigning into a possibly-sliced DataFrame (SettingWithCopyWarning).
new_df = new_df.astype({column: str for column in categorical})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[categorical] = new_df[categorical].astype(str)


In [13]:
def read_dataframe(filename):
    """Load a trip file and prepare it for trip-duration modelling.

    Parameters
    ----------
    filename : str
        Path ending in ``.csv`` or ``.parquet``. The file must provide the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID``, ``DOLocationID`` and ``trip_distance``.

    Returns
    -------
    pandas.DataFrame
        The five columns above plus ``duration`` (drop-off minus pick-up, in
        minutes), restricted to rows with 1 <= duration <= 60, with both
        location-ID columns cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends in neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV timestamps are parsed explicitly before they can be subtracted.
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        # NOTE(review): assumes the parquet file already stores both
        # timestamp columns with a datetime dtype - confirm for new sources.
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    columns = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']
    categorical = ['PULocationID', 'DOLocationID']

    # `assign` and `astype` each return a new frame, so no step assigns into a
    # slice of another DataFrame (which would raise SettingWithCopyWarning).
    trips = df[columns].assign(
        duration=lambda frame: (
            frame.lpep_dropoff_datetime - frame.lpep_pickup_datetime
        ).dt.total_seconds() / 60
    )

    # Keep trips between 1 and 60 minutes, both bounds inclusive.
    trips = trips[(trips.duration >= 1) & (trips.duration <= 60)]

    return trips.astype({column: str for column in categorical})

In [14]:
# February 2023 trips serve as the validation set. The path is relative to
# the notebook's working directory (the same location that is logged to
# MLflow as "val_data_path") rather than an absolute workspace path.
df_val = read_dataframe('./data/green_tripdata_2023-02.parquet')

In [15]:
# Name of the column the models predict.
target = 'duration'
y_train = new_df[target].values

# Turn each training row into a {feature: value} record and fit the
# vectorizer on those records to obtain the training feature matrix.
feature_columns = categorical + numerical
train_dicts = new_df[feature_columns].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [16]:
# Baseline model: ordinary least squares on the vectorized training features.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Error measured on the same rows the model was fitted on.
y_pred = lr.predict(X_train)
train_rmse = rmse(y_train, y_pred)
print("RMSE:", train_rmse)

RMSE: 7.060351536580523


In [17]:
# Validation targets.
target = 'duration'
y_val = df_val[target].values

# Encode the validation rows with the vectorizer fitted on the training data
# (transform only - the vectorizer is not refitted here).
val_feature_columns = categorical + numerical
val_dicts = df_val[val_feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [18]:
# Score the fitted linear model on the validation matrix; the RMSE is the
# cell's displayed value.
y_pred = lr.predict(X_val)
val_rmse = rmse(y_val, y_pred)
val_rmse

7.355376878970508

In [23]:
with mlflow.start_run():
    # Regularisation strength used for this run.
    alpha = 0.1

    # Run metadata: who ran it, which data files it refers to, and the
    # hyperparameter value.
    mlflow.set_tag("developer", 'umang')
    mlflow.log_param("train_data_path", "./data/green_tripdata_2023-01.parquet")
    mlflow.log_param("val_data_path", "./data/green_tripdata_2023-02.parquet")
    mlflow.log_param("alpha", alpha)

    # Fit a Lasso model on the training matrix (`fit` returns the estimator).
    lasso = Lasso(alpha).fit(X_train, y_train)

    # Evaluate on the validation matrix and record the error with the run.
    y_pred = lasso.predict(X_val)
    rme = rmse(y_val, y_pred)
    print("RMSE:", rme)
    mlflow.log_metric("rmse", rme)

RMSE: 8.876795398908774
