In [22]:
import mlflow
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_curve, auc, average_precision_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from mlflow.tracking import MlflowClient

In [2]:
mlflow.set_experiment("Benchmark_Models_Experiment")

<Experiment: artifact_location='file:///Users/ujandasgupta/Desktop/Applied%20ML/assignment-2/mlruns/631866937464807196', creation_time=1708345645962, experiment_id='631866937464807196', last_update_time=1708345645962, lifecycle_stage='active', name='Benchmark_Models_Experiment', tags={}>

## Load prepared data

In [3]:
# Load the datasets
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('validation.csv')
# Assuming test data is for final evaluation and not used in this training notebook
X_train = train_data['text']
y_train = train_data['spam']
X_val = val_data['text']
y_val = val_data['spam']

In [4]:
tfidf_vectorizer = TfidfVectorizer()

## Training and Evaluating a model and version controlling using MLflow

In [5]:
def evaluate_and_log_model(model, model_name):
    with mlflow.start_run(run_name=model_name):
        # Create a pipeline with TF-IDF and the model
        pipeline = make_pipeline(tfidf_vectorizer, model)
        
        # Fit the pipeline on the training data
        pipeline.fit(X_train, y_train)
        
        # Predict on validation set
        y_scores = pipeline.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else pipeline.decision_function(X_val)
        
        # Calculate AUCPR
        precision, recall, _ = precision_recall_curve(y_val, y_scores)
        aucpr = auc(recall, precision)
        
        # Log parameters, metrics, and model
        mlflow.log_params(model.get_params())
        mlflow.log_metric("AUCPR", aucpr)
        mlflow.sklearn.log_model(pipeline, "model")
        
        # Print AUCPR
        print(f"{model_name} AUCPR: {aucpr}")

        # Register the model
        mlflow.register_model(
            mlflow.get_artifact_uri("model"),
            model_name
        )

In [6]:
evaluate_and_log_model(LogisticRegression(), "Logistic_Regression")

Logistic_Regression AUCPR: 0.9965459454541511


Registered model 'Logistic_Regression' already exists. Creating a new version of this model...
Created version '4' of model 'Logistic_Regression'.


In [8]:
evaluate_and_log_model(MultinomialNB(), "Multinomial_NB")

Multinomial_NB AUCPR: 0.9880050585973483


Registered model 'Multinomial_NB' already exists. Creating a new version of this model...
Created version '4' of model 'Multinomial_NB'.


In [10]:
evaluate_and_log_model(RandomForestClassifier(), "Random_Forest")

Random_Forest AUCPR: 0.9971620370435094


Registered model 'Random_Forest' already exists. Creating a new version of this model...
Created version '4' of model 'Random_Forest'.


In [12]:
evaluate_and_log_model(SGDClassifier(loss='log'), "SGD_Classifier")



SGD_Classifier AUCPR: 0.9973448504146537


Registered model 'SGD_Classifier' already exists. Creating a new version of this model...
Created version '4' of model 'SGD_Classifier'.


In [13]:
client = MlflowClient()
for model_name in ["Logistic_Regression", "Multinomial_NB", "Random_Forest", "SGD_Classifier"]:
    runs = client.search_runs(
        experiment_ids=[mlflow.get_experiment_by_name("Benchmark_Models_Experiment").experiment_id],
        filter_string=f"tags.mlflow.runName = '{model_name}'"
    )
    for run in runs:
        # Safely access the AUCPR metric
        aucpr = run.data.metrics.get('AUCPR')
        if aucpr is not None:
            print(f"{model_name} AUCPR: {aucpr}")

Logistic_Regression AUCPR: 0.9965459454541511
Logistic_Regression AUCPR: 0.9965459454541511
Logistic_Regression AUCPR: 0.9965459454541511
Logistic_Regression AUCPR: 0.9965459454541511
Multinomial_NB AUCPR: 0.9880050585973483
Multinomial_NB AUCPR: 0.9880050585973483
Multinomial_NB AUCPR: 0.9880050585973483
Multinomial_NB AUCPR: 0.9880050585973483
Random_Forest AUCPR: 0.9971620370435094
Random_Forest AUCPR: 0.9963395532100702
Random_Forest AUCPR: 0.9966080608179311
Random_Forest AUCPR: 0.9975970670266049
SGD_Classifier AUCPR: 0.9973448504146537
SGD_Classifier AUCPR: 0.9973432748155386
SGD_Classifier AUCPR: 0.9973487588623465
SGD_Classifier AUCPR: 0.9973691787575261


In [14]:
! mlflow ui

[2024-02-19 19:37:24 +0530] [50879] [INFO] Starting gunicorn 21.2.0
[2024-02-19 19:37:24 +0530] [50879] [INFO] Listening at: http://127.0.0.1:5000 (50879)
[2024-02-19 19:37:24 +0530] [50879] [INFO] Using worker: sync
[2024-02-19 19:37:24 +0530] [50880] [INFO] Booting worker with pid: 50880
[2024-02-19 19:37:24 +0530] [50881] [INFO] Booting worker with pid: 50881
[2024-02-19 19:37:24 +0530] [50882] [INFO] Booting worker with pid: 50882
[2024-02-19 19:37:24 +0530] [50883] [INFO] Booting worker with pid: 50883
^C
[2024-02-19 19:46:29 +0530] [50879] [INFO] Handling signal: int
[2024-02-19 19:46:29 +0530] [50883] [INFO] Worker exiting (pid: 50883)
[2024-02-19 19:46:29 +0530] [50882] [INFO] Worker exiting (pid: 50882)
[2024-02-19 19:46:29 +0530] [50881] [INFO] Worker exiting (pid: 50881)
[2024-02-19 19:46:29 +0530] [50880] [INFO] Worker exiting (pid: 50880)


# Using the Best version of the SGD Classifier model to get "AUCPR" on test data

In [15]:
model_name = "SGD_Classifier"  
model_version = "4"  

In [16]:
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri)

In [18]:
test_data = pd.read_csv('test.csv')

In [19]:
X_test = test_data['text']
y_test = test_data['spam']

In [20]:
y_pred = model.predict(X_test)

In [26]:
# Calculate AUCPR
precision, recall, _ = precision_recall_curve(y_pred, y_test)
aucpr = auc(recall, precision)

In [29]:
print(aucpr)

1.0
