In [1]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
import os

In [2]:
mlflow.set_tracking_uri('sqlite:///mlflow_lab2.db')
mlflow.set_experiment('lab2-experiment')

2025/03/24 23:37:27 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/03/24 23:37:27 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/Users/ynaung/MSDS/Spring2/mlops/lab/lab2/mlruns/1', creation_time=1742884648057, experiment_id='1', last_update_time=1742884648057, lifecycle_stage='active', name='lab2-experiment', tags={}>

In [3]:
column_names = ['class', 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 
                'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
                'proanthocyanins', 'color_intensity', 'hue', 
                'od280/od315_of_diluted_wines', 'proline']

df = pd.read_csv('data/wine/wine.data', header=None, names=column_names)

df.head()

Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
df.describe()

Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [5]:
y = df['class']
X = df.drop('class', axis=1)

In [6]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, shuffle=True)

In [7]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns,index=X_train.index)

X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns,index=X_val.index)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [8]:
os.makedirs('save_data_lab2', exist_ok=True)

X_train_scaled.to_parquet('save_data_lab2/x_train.parquet')
X_val_scaled.to_parquet('save_data_lab2/x_val.parquet')
X_test_scaled.to_parquet('save_data_lab2/x_test.parquet')

pd.DataFrame(y_train, columns=['class']).to_parquet('save_data_lab2/y_train.parquet')
pd.DataFrame(y_val, columns=['class']).to_parquet('save_data_lab2/y_val.parquet')
pd.DataFrame(y_test, columns=['class']).to_parquet('save_data_lab2/y_test.parquet')

In [9]:
def objective(params):
    with mlflow.start_run():
        classifier_type = params['type']
        del params['type']
        
        if classifier_type == 'dt':
            clf = DecisionTreeClassifier(**params)
            model_name = "decision_tree_wine"
        elif classifier_type == 'rf':
            clf = RandomForestClassifier(**params)
            model_name = "random_forest_wine"
        elif classifier_type == 'lr':
            clf = LogisticRegression(**params, max_iter=2000)
            model_name = "logistic_regression_wine"
        else:
            return 0
        
        acc = cross_val_score(clf, X_train_scaled, y_train, cv=5).mean()
        
        mlflow.set_tag("Model", classifier_type)
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        
        mlflow.sklearn.log_model(
            clf, 
            artifact_path='model',
            registered_model_name=model_name
        )
        
        return {'loss': -acc, 'status': STATUS_OK}

In [10]:
search_space = hp.choice('classifier_type', [
    {
        'type': 'dt',
        'criterion': hp.choice('dtree_criterion', ['gini', 'entropy']),
        'max_depth': hp.choice('dtree_max_depth', [None] + list(range(1, 10))),
        'min_samples_split': hp.randint('dtree_min_samples_split', 2, 10),
        'random_state': 24
    },
    {
        'type': 'rf',
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        'max_features': hp.randint('rf_max_features', 2, 9),
        'criterion': hp.choice('rf_criterion', ['gini', 'entropy']),
        'random_state': 24
    },
    {
        'type': 'lr',
        'C': hp.loguniform('lr_C', np.log(0.01), np.log(10.0)),
        'random_state': 24
    }
])

In [11]:
algo = tpe.suggest
trials = Trials()
best_result = fmin(
        fn=objective, 
        space=search_space,
        algo=algo,
        max_evals=32,
        trials=trials)

  0%|          | 0/32 [00:00<?, ?trial/s, best loss=?]

Successfully registered model 'decision_tree_wine'.
Created version '1' of model 'decision_tree_wine'.



  3%|▎         | 1/32 [00:01<00:37,  1.21s/trial, best loss: -0.8138339920948617]

Successfully registered model 'random_forest_wine'.
Created version '1' of model 'random_forest_wine'.



  6%|▋         | 2/32 [00:02<00:37,  1.25s/trial, best loss: -0.9648221343873518]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '2' of model 'random_forest_wine'.



  9%|▉         | 3/32 [00:03<00:37,  1.31s/trial, best loss: -0.9648221343873518]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '3' of model 'random_forest_wine'.



 12%|█▎        | 4/32 [00:05<00:36,  1.29s/trial, best loss: -0.9648221343873518]

Registered model 'decision_tree_wine' already exists. Creating a new version of this model...
Created version '2' of model 'decision_tree_wine'.



 16%|█▌        | 5/32 [00:05<00:30,  1.12s/trial, best loss: -0.9648221343873518]

Successfully registered model 'logistic_regression_wine'.
Created version '1' of model 'logistic_regression_wine'.



 19%|█▉        | 6/32 [00:06<00:26,  1.01s/trial, best loss: -0.9731225296442687]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '2' of model 'logistic_regression_wine'.



 22%|██▏       | 7/32 [00:07<00:23,  1.06trial/s, best loss: -0.9731225296442687]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '4' of model 'random_forest_wine'.



 25%|██▌       | 8/32 [00:08<00:22,  1.08trial/s, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '5' of model 'random_forest_wine'.



 28%|██▊       | 9/32 [00:09<00:25,  1.12s/trial, best loss: -0.9735177865612649]

Registered model 'decision_tree_wine' already exists. Creating a new version of this model...
Created version '3' of model 'decision_tree_wine'.



 31%|███▏      | 10/32 [00:10<00:22,  1.02s/trial, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '3' of model 'logistic_regression_wine'.



 34%|███▍      | 11/32 [00:11<00:21,  1.00s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '6' of model 'random_forest_wine'.



 38%|███▊      | 12/32 [00:13<00:23,  1.17s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '7' of model 'random_forest_wine'.



 41%|████      | 13/32 [00:14<00:21,  1.13s/trial, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '4' of model 'logistic_regression_wine'.



 44%|████▍     | 14/32 [00:15<00:18,  1.04s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '8' of model 'random_forest_wine'.



 47%|████▋     | 15/32 [00:16<00:19,  1.17s/trial, best loss: -0.9735177865612649]

Registered model 'decision_tree_wine' already exists. Creating a new version of this model...
Created version '4' of model 'decision_tree_wine'.



 50%|█████     | 16/32 [00:17<00:17,  1.07s/trial, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '5' of model 'logistic_regression_wine'.



 53%|█████▎    | 17/32 [00:18<00:14,  1.00trial/s, best loss: -0.9735177865612649]

Registered model 'decision_tree_wine' already exists. Creating a new version of this model...
Created version '5' of model 'decision_tree_wine'.



 56%|█████▋    | 18/32 [00:19<00:13,  1.05trial/s, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '6' of model 'logistic_regression_wine'.



 59%|█████▉    | 19/32 [00:19<00:11,  1.10trial/s, best loss: -0.9735177865612649]

Registered model 'decision_tree_wine' already exists. Creating a new version of this model...
Created version '6' of model 'decision_tree_wine'.



 62%|██████▎   | 20/32 [00:20<00:10,  1.13trial/s, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '7' of model 'logistic_regression_wine'.



 66%|██████▌   | 21/32 [00:21<00:09,  1.15trial/s, best loss: -0.9735177865612649]

Registered model 'logistic_regression_wine' already exists. Creating a new version of this model...
Created version '8' of model 'logistic_regression_wine'.



 69%|██████▉   | 22/32 [00:22<00:09,  1.09trial/s, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '9' of model 'random_forest_wine'.



 72%|███████▏  | 23/32 [00:24<00:09,  1.06s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '10' of model 'random_forest_wine'.



 75%|███████▌  | 24/32 [00:25<00:08,  1.11s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '11' of model 'random_forest_wine'.



 78%|███████▊  | 25/32 [00:26<00:08,  1.25s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '12' of model 'random_forest_wine'.



 81%|████████▏ | 26/32 [00:28<00:07,  1.24s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '13' of model 'random_forest_wine'.



 84%|████████▍ | 27/32 [00:29<00:06,  1.33s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '14' of model 'random_forest_wine'.



 88%|████████▊ | 28/32 [00:30<00:05,  1.31s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '15' of model 'random_forest_wine'.



 91%|█████████ | 29/32 [00:32<00:04,  1.45s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '16' of model 'random_forest_wine'.



 94%|█████████▍| 30/32 [00:34<00:02,  1.45s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '17' of model 'random_forest_wine'.



 97%|█████████▋| 31/32 [00:35<00:01,  1.46s/trial, best loss: -0.9735177865612649]

Registered model 'random_forest_wine' already exists. Creating a new version of this model...
Created version '18' of model 'random_forest_wine'.



100%|██████████| 32/32 [00:37<00:00,  1.17s/trial, best loss: -0.9735177865612649]


In [12]:
with mlflow.start_run(run_name="feature_selection"):
    rf_selector = RandomForestClassifier(n_estimators=100, random_state=24)
    rf_selector.fit(X_train_scaled, y_train)
    
    feature_importance = pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance': rf_selector.feature_importances_
    }).sort_values('importance', ascending=False)
    
    top_features = feature_importance.head(7)['feature'].tolist()

    X_train_selected = X_train_scaled[top_features]
    X_val_selected = X_val_scaled[top_features]
    X_test_selected = X_test_scaled[top_features]

    X_train_selected.to_parquet('save_data_lab2/x_train_selected.parquet')
    X_val_selected.to_parquet('save_data_lab2/x_val_selected.parquet')
    X_test_selected.to_parquet('save_data_lab2/x_test_selected.parquet')
    
    rf_selected = RandomForestClassifier(n_estimators=100, random_state=24)
    rf_selected.fit(X_train_selected, y_train)
    acc_selected = cross_val_score(rf_selected, X_train_selected, y_train, cv=5).mean()
    mlflow.log_metric('cv_accuracy_selected_features', acc_selected)
    
    mlflow.sklearn.log_model(
        rf_selector,
        artifact_path='feature_selector',
        registered_model_name='wine_feature_selector'
    )

    mlflow.sklearn.log_model(
        rf_selected,
        artifact_path='selected_features_model',
        registered_model_name='wine_classifier_selected_features'
    )
    
    mlflow.end_run()

Successfully registered model 'wine_feature_selector'.
Created version '1' of model 'wine_feature_selector'.
Successfully registered model 'wine_classifier_selected_features'.
Created version '1' of model 'wine_classifier_selected_features'.


Top 3 models are random forest with max_features = 5 and n_estimators = 429, random forest model with max_features = 2 and n_estimators = 290, and random forest with max_features = 2 and n_estimators = 342.

In [14]:
with mlflow.start_run():
    mlflow.set_tags({
        "Model": "random-forest",
        "Data": "final-comprehensive-logging"
    })
    
    mlflow.log_params({
        "n_estimators": 429,
        "max_features": 5
    })
    
    rf = RandomForestClassifier(n_estimators=429, max_features=5, random_state=24)
    rf.fit(X_train_scaled, y_train)

    mlflow.log_artifact('save_data_lab2/x_train.parquet', 'training_data')
    mlflow.log_artifact('save_data_lab2/y_train.parquet', 'training_data')
    
    mlflow.log_artifact('save_data_lab2/x_val.parquet', 'validation_data')
    mlflow.log_artifact('save_data_lab2/y_val.parquet', 'validation_data')
    
    mlflow.log_artifact('save_data_lab2/x_test.parquet', 'test_data')
    mlflow.log_artifact('save_data_lab2/y_test.parquet', 'test_data')
    
    train_acc = accuracy_score(y_train, rf.predict(X_train_scaled))
    val_acc = accuracy_score(y_val, rf.predict(X_val_scaled))
    test_acc = accuracy_score(y_test, rf.predict(X_test_scaled))
    
    mlflow.log_metrics({
        'train_accuracy': train_acc,
        'validation_accuracy': val_acc,
        'test_accuracy': test_acc
    })
    
    mlflow.sklearn.log_model(
        rf, 
        artifact_path="random_forest_model",
        registered_model_name="wine_classifier_final"
    )
    
    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")

mlflow.end_run()

Successfully registered model 'wine_classifier_final'.
Created version '1' of model 'wine_classifier_final'.


Training Accuracy: 1.0000
Validation Accuracy: 1.0000
Test Accuracy: 1.0000
