In [18]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error

In [2]:
import mlflow


In [3]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('my_new_exp')

2025/12/03 16:43:56 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/03 16:43:56 INFO mlflow.store.db.utils: Updating database tables
2025-12-03 16:43:56 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-12-03 16:43:56 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025-12-03 16:43:56 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-12-03 16:43:56 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/mlops-project/03-tracking/mlruns/1', creation_time=1764778521062, experiment_id='1', last_update_time=1764778521062, lifecycle_stage='active', name='my_new_exp', tags={'mlflow.experimentKind': 'custom_model_development'}>

# Q1 


In [4]:
df = pd.read_parquet('yellow_tripdata_2023-01.parquet')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [5]:
len(df.columns)

19

answer : 19

## Q2

In [6]:
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
df.duration = df.duration.apply(lambda x : x.total_seconds() / 60)

In [7]:
df.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

answer : 42.59

# Q3

In [8]:
# removing outliers

df_clean = df[(df.duration >= 1) & (df.duration <= 60)]
outliers = len(df) - len(df_clean)

In [9]:
1 - outliers / len(df)

0.9812202822125979

# Q4

In [10]:
df_clean.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [11]:
categorical = ['PULocationID','DOLocationID']
df_clean[categorical] = df_clean[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[categorical] = df_clean[categorical].astype(str)


In [12]:
inputs = categorical + ['trip_distance']

In [13]:
dict_df = df_clean[categorical].to_dict(orient="records")

In [14]:
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(dict_df)
y_train = df_clean.duration.values


lr = LinearRegression()
lr.fit(X_train,y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_pred,y_train)

7.649262060255514

# test

In [15]:
df_test = pd.read_parquet('yellow_tripdata_2023-02.parquet')
df_test.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


In [16]:
df_test['duration'] = (df_test['tpep_dropoff_datetime'] - df_test['tpep_pickup_datetime'])
df_test.duration = df_test.duration.apply(lambda x : x.total_seconds() / 60)

# removing outliers

df_clean_test = df_test[(df_test.duration >= 1) & (df_test.duration <= 60)]
df_clean_test[categorical] = df_clean_test[categorical].astype(str)
dict_df_test = df_clean_test[categorical].to_dict(orient="records")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean_test[categorical] = df_clean_test[categorical].astype(str)


In [17]:

X_test= vectorizer.transform(dict_df_test)
y_test= df_clean_test.duration.values

y_pred = lr.predict(X_test)

root_mean_squared_error(y_pred,y_test)

7.811816183354732

# mlflow

In [19]:
# lasso

with mlflow.start_run():
    mlflow.set_tag('developer', "nn")

    mlflow.log_param('train-path','yellow_tripdata_2023-01.parquet')
    mlflow.log_param('valid-path', 'yellow_tripdata_2023-02.parquet')

    alpha = 0.01

    mlflow.log_param("alpha", alpha)
    lr2 = Lasso(alpha)
    lr2.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    rmse = root_mean_squared_error(y_pred,y_test)
    mlflow.log_metric("rmse", rmse)
