In [46]:
import os
import pickle
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler



In [47]:
import mlflow

# Tracking backend and experiment used by every run in this notebook.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/roman/python/mlops/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1751624403419, experiment_id='2', last_update_time=1751624403419, lifecycle_stage='active', name='nyc-taxi', tags={}>

In [48]:
def read_dataframe(filename):
    """Load taxi trip records and derive the columns used for modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing at least the
        columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID``, ``DOLocationID``, ``congestion_surcharge``,
        ``fare_amount``, ``tip_amount`` and ``total_amount``.

    Returns
    -------
    pandas.DataFrame
        Trips lasting between 1 and 60 minutes (inclusive) with no missing
        amount fields, plus the derived columns ``duration`` (minutes),
        ``day_of_week``, ``hour_of_day`` and ``PU_DO``. The location IDs and
        the two calendar columns are cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` does not end in ``.csv`` or ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV columns arrive as text and need an explicit datetime conversion.
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}; expected '.csv' or '.parquet'"
        )

    # Vectorised timedelta -> minutes (replaces a per-row Python lambda).
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    amount_fields = ['congestion_surcharge', 'fare_amount', 'tip_amount', 'total_amount']

    # Apply every row filter up front, then copy once so the column assignments
    # below operate on an independent frame rather than on a slice.
    in_range = (df.duration >= 1) & (df.duration <= 60)
    df = df[in_range].dropna(subset=amount_fields).copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Day of week and hour of day of the pickup, stored as strings so they can
    # be treated as categorical features.
    df['day_of_week'] = df.tpep_pickup_datetime.dt.dayofweek.astype(str)
    df['hour_of_day'] = df.tpep_pickup_datetime.dt.hour.astype(str)

    df[amount_fields] = df[amount_fields].astype(float)

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [None]:

# Root directory holding the sampled dataset and its train/val/test splits.
dir_path = '/home/roman/python/mlops/mlops-zoomcamp'

# One parquet file per split, all stored next to the sample file.
train_path = f'{dir_path}/yellow_tripdata_2025-01_sample_train.parquet'
val_path = f'{dir_path}/yellow_tripdata_2025-01_sample_val.parquet'
test_path = f'{dir_path}/yellow_tripdata_2025-01_sample_test.parquet'


## Create dataset 

In [15]:
# Load and clean the sampled trips (the sample file must already exist on disk).
df = read_dataframe(dir_path + '/yellow_tripdata_2025-01_sample.parquet')

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% by position.
# Plain iloc slices replace np.split, which is deprecated for DataFrames.
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

# save the datasets to parquet files
df_train.to_parquet(train_path)
df_val.to_parquet(val_path)
df_test.to_parquet(test_path)

# check the length of the datasets
len(df_train), len(df_val), len(df_test)

  return bound(*args, **kwds)


(229295, 28662, 28662)

In [59]:
# Reload the persisted splits so later cells do not depend on the
# dataset-creation cell having been run in this kernel session.
df_train, df_val, df_test = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

df_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee,duration,day_of_week,hour_of_day,PU_DO
2802332,2,2025-01-30 19:36:26,2025-01-30 19:46:49,1.0,1.31,1.0,N,161,48,1,...,0.0,1.0,21.54,2.5,0.0,0.75,10.383333,3,19,161_48
144232,2,2025-01-02 20:17:42,2025-01-02 20:26:15,1.0,1.43,1.0,N,162,137,1,...,0.0,1.0,15.0,2.5,0.0,0.0,8.55,3,20,162_137
71470,2,2025-01-01 22:06:11,2025-01-01 22:09:16,1.0,0.79,1.0,N,141,262,1,...,0.0,1.0,13.8,2.5,0.0,0.0,3.083333,2,22,141_262
994426,2,2025-01-12 02:58:59,2025-01-12 03:01:53,1.0,0.5,1.0,N,79,79,1,...,0.0,1.0,13.02,2.5,0.0,0.75,2.9,6,2,79_79
2434245,1,2025-01-26 22:09:03,2025-01-26 22:25:30,1.0,2.8,1.0,N,230,140,1,...,0.0,1.0,24.8,2.5,0.0,0.75,16.45,6,22,230_140


In [9]:
# save 10 percent of data to a parquet file
#
# NOTE(review): this cell expects `df` to already be defined in the kernel —
# presumably the full month of trips — and nothing visible in this notebook
# loads it. Confirm the source file and load it before running this cell.
# The file written here is the one the "Create dataset" cell reads.

sample_dataset_path = '/home/roman/python/mlops/mlops-zoomcamp/yellow_tripdata_2025-01_sample.parquet'
# Fixed seed so the sample, and every split derived from it, is reproducible.
df.sample(frac=0.1, random_state=42).to_parquet(sample_dataset_path)

NameError: name 'df' is not defined

In [60]:
# Feature selection. Names left in the trailing comments are candidates that
# are currently switched off.
categorical = ['day_of_week', 'hour_of_day']  # 'PULocationID', 'DOLocationID', 'PU_DO',
numerical = ['trip_distance', 'congestion_surcharge']  # 'fare_amount', 'tip_amount', 'total_amount'
model_columns = categorical + numerical

# One-hot encode the categoricals (categories unseen at fit time are ignored)
# and standardise the numeric columns.
ohe = OneHotEncoder(handle_unknown='ignore')
full_pipeline = ColumnTransformer(
    [
        ('ohe', ohe, categorical),
        ('scaler', StandardScaler(), numerical),
    ],
    verbose_feature_names_out=False,  # keep short feature names
    n_jobs=-1,
)

# Fit on the training split only; the validation split is transformed with
# the statistics learned from training data.
X_train = full_pipeline.fit_transform(df_train[model_columns])
X_val = full_pipeline.transform(df_val[model_columns])

feature_names = list(full_pipeline.get_feature_names_out())

print(feature_names)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

['day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'hour_of_day_0', 'hour_of_day_1', 'hour_of_day_10', 'hour_of_day_11', 'hour_of_day_12', 'hour_of_day_13', 'hour_of_day_14', 'hour_of_day_15', 'hour_of_day_16', 'hour_of_day_17', 'hour_of_day_18', 'hour_of_day_19', 'hour_of_day_2', 'hour_of_day_20', 'hour_of_day_21', 'hour_of_day_22', 'hour_of_day_23', 'hour_of_day_3', 'hour_of_day_4', 'hour_of_day_5', 'hour_of_day_6', 'hour_of_day_7', 'hour_of_day_8', 'hour_of_day_9', 'trip_distance', 'congestion_surcharge']


In [61]:
# Regression target: the `duration` column, pulled out as plain arrays.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

In [62]:
# Wrap the transformed features and targets in DMatrix objects, attaching the
# pipeline's feature names to both.
train, valid = (
    xgb.DMatrix(features, label=labels, feature_names=feature_names)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [10]:
def objective(params):
    """Train one XGBoost model for hyperopt and record it as an MLflow run.

    Relies on the notebook-level names ``train``, ``valid``, ``y_val``,
    ``categorical``, ``numerical``, ``train_path`` and ``val_path``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        mlflow.log_param('categorical_features', categorical)
        mlflow.log_param('numerical_features', numerical)

        mlflow.log_param('train_dataset_path', train_path)
        mlflow.log_param('val_dataset_path', val_path)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        # xgb.train returns the model from the last round, not the best one.
        # Restrict prediction to the rounds up to the best iteration so the
        # logged RMSE is the score early stopping selected.
        best_rounds = booster.best_iteration + 1
        y_pred = booster.predict(valid, iteration_range=(0, best_rounds))
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("best_iteration", booster.best_iteration)

    return {'loss': rmse, 'status': STATUS_OK}

In [34]:
# Hyperparameter search space for XGBoost. The loguniform bounds are given
# in log space.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias of the squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep the Trials object in a variable so the evaluated points can be
# inspected after the search.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=trials
)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:9.02459                          
[1]	validation-rmse:8.11913                          
[2]	validation-rmse:7.32049                          
[3]	validation-rmse:6.62028                          
[4]	validation-rmse:6.01117                          
[5]	validation-rmse:5.47901                          
[6]	validation-rmse:5.02124                          
[7]	validation-rmse:4.63048                          
[8]	validation-rmse:4.29830                          
[9]	validation-rmse:4.01887                          
[10]	validation-rmse:3.78582                         
[11]	validation-rmse:3.59273                         
[12]	validation-rmse:3.43407                         
[13]	validation-rmse:3.30466                         
[14]	validation-rmse:3.20145                         
[15]	validation-rmse:3.11836                         
[16]	validation-rmse:3.05365                         
[17]	validation-rmse:3.00293                         
[18]	validation-rmse:2.96411

KeyboardInterrupt: 

In [None]:
# Close any run that is still active before opening a new one.
mlflow.end_run()

with mlflow.start_run():

    mlflow.set_tag('developer', 'khabarov')

    # ElasticNet hyperparameters for this baseline run.
    l1_ratio = 0.3
    alpha = 0.01

    # Log every parameter in one pass; order matches the original cell.
    for param_name, param_value in (
        ('model', 'ElasticNet'),
        ('categorical_features', categorical),
        ('numerical_features', numerical),
        ('train_dataset_path', train_path),
        ('val_dataset_path', val_path),
        ('alpha', alpha),
        ('l1_ratio', l1_ratio),
    ):
        mlflow.log_param(param_name, param_value)

    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X_train, y_train)

    # Score on the validation split.
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric('rmse', rmse)

    print(f'RMSE: {rmse}')

Exception: Run with UUID d6ab6442b8ee4b479de98a1b66936a7f is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [72]:
import json
import matplotlib.pyplot as plt
import seaborn as sns


def comprehensive_feature_importance_analysis(model, feature_names=None):
    """Log feature-importance plots and JSON files to the active MLflow run.

    For each of the importance types ``weight``, ``gain``, ``cover`` and
    ``total_gain`` the scores are read with ``model.get_score``. Types that
    return no scores are skipped. For the others, a bar chart of the ten
    highest-scoring features and a JSON dump of all scores are logged as
    ``feature_importance_<type>.png`` / ``.json``.

    Parameters
    ----------
    model : object
        Anything exposing ``get_score(importance_type=...)`` that returns a
        ``{feature: score}`` mapping (e.g. an ``xgboost.Booster``).
    feature_names : sequence of str, optional
        When given, score keys of the form ``f<N>`` that are not themselves
        listed in ``feature_names`` are replaced by ``feature_names[N]``.
        Keys that already carry real names are left untouched.

    Returns
    -------
    None
    """
    importance_types = ["weight", "gain", "cover", "total_gain"]

    def _display_name(key):
        # Translate a positional "f<N>" key into its real name when possible.
        if (
            feature_names is not None
            and key not in feature_names
            and key[:1] == "f"
            and key[1:].isdecimal()
        ):
            position = int(key[1:])
            if position < len(feature_names):
                return feature_names[position]
        return key

    # Files are staged in a temporary directory so repeated runs do not leave
    # PNG/JSON files behind in the working directory.
    with tempfile.TemporaryDirectory() as staging_dir:
        for imp_type in importance_types:
            raw_scores = model.get_score(importance_type=imp_type)

            if not raw_scores:
                continue

            importance = {_display_name(k): v for k, v in raw_scores.items()}

            # Highest scores first; plot only the top ten.
            ranked = sorted(importance.items(), key=lambda kv: kv[1], reverse=True)
            features, scores = zip(*ranked[:10])

            fig, ax = plt.subplots(figsize=(10, 8))
            try:
                sns.barplot(x=list(scores), y=list(features), ax=ax)
                ax.set_title(f"Top 10 Feature Importance ({imp_type.title()})")
                ax.set_xlabel("Importance Score")
                fig.tight_layout()

                plot_path = os.path.join(staging_dir, f"feature_importance_{imp_type}.png")
                fig.savefig(plot_path, bbox_inches="tight")
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)
            mlflow.log_artifact(plot_path)

            # Log the full score table as a JSON artifact.
            json_path = os.path.join(staging_dir, f"feature_importance_{imp_type}.json")
            with open(json_path, "w") as f:
                json.dump(importance, f, indent=2)
            mlflow.log_artifact(json_path)

In [74]:
import mlflow.xgboost


# Close any run that is still active before opening a new one.
mlflow.end_run()

with mlflow.start_run():

    # Hyperparameters selected from the earlier search.
    best_params = {
        'learning_rate': 0.06836426267409443,
        'max_depth': 10,
        'min_child_weight': 14.354365207007865,
        # 'reg:linear' is a deprecated alias of the squared-error objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.2042301820266,
        'reg_lambda': 0.11861308163,
        'seed': 42,
    }

    mlflow.log_params(best_params)
    mlflow.log_param('model', 'XGBoost')
    mlflow.log_param('categorical_features', categorical)
    mlflow.log_param('numerical_features', numerical)
    # Logged for consistency with the other runs in this notebook.
    mlflow.log_param('train_dataset_path', train_path)
    mlflow.log_param('val_dataset_path', val_path)

    booster = xgb.train(best_params, dtrain=train,
                num_boost_round=1000,
                evals=[(valid, 'validation')],
                early_stopping_rounds=50
            )

    # xgb.train returns the last-round model, so score only the rounds up to
    # the best iteration found by early stopping.
    # NOTE(review): the booster logged below still contains the rounds trained
    # after the best iteration; slice it before logging if served predictions
    # must match this RMSE exactly.
    y_pred = booster.predict(valid, iteration_range=(0, booster.best_iteration + 1))
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    comprehensive_feature_importance_analysis(booster, feature_names=feature_names)

    # Create the output directory on a fresh checkout instead of failing in open().
    os.makedirs('models', exist_ok=True)
    with open('models/preprocessor.b', 'wb') as f_out:
        pickle.dump(full_pipeline, f_out)

    mlflow.log_artifact('models/preprocessor.b', artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="model")
    
    

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:9.53690
[1]	validation-rmse:9.06132
[2]	validation-rmse:8.62692
[3]	validation-rmse:8.23074
[4]	validation-rmse:7.87073
[5]	validation-rmse:7.54336
[6]	validation-rmse:7.24711
[7]	validation-rmse:6.97871
[8]	validation-rmse:6.73638
[9]	validation-rmse:6.51651
[10]	validation-rmse:6.31927
[11]	validation-rmse:6.14209
[12]	validation-rmse:5.98366
[13]	validation-rmse:5.84258
[14]	validation-rmse:5.71658
[15]	validation-rmse:5.60474
[16]	validation-rmse:5.50436
[17]	validation-rmse:5.41499
[18]	validation-rmse:5.33564
[19]	validation-rmse:5.26541
[20]	validation-rmse:5.20260
[21]	validation-rmse:5.14694
[22]	validation-rmse:5.09801
[23]	validation-rmse:5.05558
[24]	validation-rmse:5.01505
[25]	validation-rmse:4.98026
[26]	validation-rmse:4.94914
[27]	validation-rmse:4.92184
[28]	validation-rmse:4.89661
[29]	validation-rmse:4.87506
[30]	validation-rmse:4.85568
[31]	validation-rmse:4.83911
[32]	validation-rmse:4.82385
[33]	validation-rmse:4.80935
[34]	validation-rmse:4.7

  xgb_model.save_model(model_data_path)


In [None]:
# Persist the fitted preprocessing pipeline together with the linear model.
# Create the output directory first so this works on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/lr_model.bin', 'wb') as f_out:
    pickle.dump((full_pipeline, model), f_out)