# Assignment 2

### Ayush Yadav (MDS202315)

**Model version control and experiment tracking** <br>

In `train.ipynb`, track the experiments and model versions using MLflow:
1) Build, track, and register 3 benchmark models using MLflow
2) Check out and print the model selection metric AUCPR for each of the three benchmark models

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from scipy import sparse
from scipy.sparse import load_npz

import pickle
import warnings

import mlflow
import mlflow.sklearn

warnings.filterwarnings('ignore')

#### Importing the Data

In [28]:
# Load the three pre-split datasets, dropping every row that contains a missing value.
TRAIN = pd.read_csv('./TRAIN.csv').dropna()
VAL = pd.read_csv('./VALIDATION.csv').dropna()
TEST = pd.read_csv('./TEST.csv').dropna()

# Sparse matrix stored next to the pickled object below; the TF-IDF transformer is fitted on it later.
bow_msgs = load_npz('./sms+spam+collection/bag_of_words.npz')

# NOTE(security): pickle.load can execute arbitrary code embedded in the file —
# only load a pickle that was produced locally / comes from a trusted source.
# NOTE(review): presumably a fitted CountVectorizer (only its .transform is used later) — confirm.
with open('./sms+spam+collection/bag_of_words.pkl','rb') as f:
    bag_of_words = pickle.load(f)

In [29]:
# Separate the target column ('label') from the remaining columns of each split.
train_x, train_y = TRAIN.drop('label', axis=1), TRAIN['label']

val_x, val_y = VAL.drop('label', axis=1), VAL['label']

test_x, test_y = TEST.drop('label', axis=1), TEST['label']

In [30]:
# Replace each split's frame with the matrix the loaded bag-of-words object
# produces from its 'preprocessed' column (same order: train, validation, test).
train_x, val_x, test_x = [
    bag_of_words.transform(frame['preprocessed'])
    for frame in (train_x, val_x, test_x)
]

In [31]:
### Vectorizing using the TF-IDF transformer

# The TF-IDF weighting is fitted once on the pre-computed bag-of-words matrix
# and then applied, unchanged, to every split.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bow_msgs)

X_train = tfidf_transformer.transform(train_x)
X_val = tfidf_transformer.transform(val_x)
X_test = tfidf_transformer.transform(test_x)

# Report the resulting matrix shapes (rows, features) for each split.
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (3784, 7947)
Validation data shape: (946, 7947)
Testing data shape: (836, 7947)


##### Naive Bayes

In [54]:
def naive_bayes(X_train, y_train, X_val, y_val, X_test, y_test):
    """Grid-search a Multinomial Naive Bayes model and score the best estimator.

    Args:
        X_train, y_train: Features and labels the 10-fold grid search is fitted on.
        X_val, y_val: Split used for the returned accuracy and F1 score.
        X_test, y_test: Split used for the returned AUCPR (average precision).

    Returns:
        list: ``[grid, val_accuracy, val_f1, test_aucpr]`` where ``grid`` is the
        fitted ``GridSearchCV`` (exposes ``best_estimator_`` / ``best_params_``).
    """
    pipeline = Pipeline([('clf', MultinomialNB(fit_prior=True, class_prior=None))])

    # Smoothing strengths to compare; candidates are ranked by cross-validated F1.
    parameters = {'clf__alpha': (0, 1)}

    grid = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1', cv=10)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    val_preds = best.predict(X_val)
    # Average precision needs a continuous score: take the probability column at index 1.
    y_test_prob = best.predict_proba(X_test)[:, 1]

    # NOTE(review): AUCPR is computed on the test split while accuracy/F1 use the
    # validation split — confirm this is intended for a model-selection metric.
    return [
        grid,
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds),
        average_precision_score(y_test, y_test_prob),
    ]

##### Logistic Regression

In [52]:
def logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test, *, random_state=None):
    """Grid-search a class-balanced Logistic Regression and score the best estimator.

    Args:
        X_train, y_train: Features and labels the 10-fold grid search is fitted on.
        X_val, y_val: Split used for the returned accuracy and F1 score.
        X_test, y_test: Split used for the returned AUCPR (average precision).
        random_state: Optional seed forwarded to ``LogisticRegression`` so that
            repeated runs are reproducible. ``None`` (default) keeps the
            estimator's own default.

    Returns:
        list: ``[grid, val_accuracy, val_f1, test_aucpr]`` where ``grid`` is the
        fitted ``GridSearchCV`` (exposes ``best_estimator_`` / ``best_params_``).
    """
    estimator = LogisticRegression(
        class_weight='balanced', n_jobs=-1, random_state=random_state
    )
    pipeline = Pipeline([('clf', estimator)])

    # Only the solver is tuned; candidates are ranked by cross-validated F1.
    parameters = {'clf__solver': ('newton-cg', 'lbfgs', 'liblinear', 'saga')}

    grid = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1', cv=10)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    val_preds = best.predict(X_val)
    # Average precision needs a continuous score: take the probability column at index 1.
    y_test_prob = best.predict_proba(X_test)[:, 1]

    # NOTE(review): AUCPR is computed on the test split while accuracy/F1 use the
    # validation split — confirm this is intended for a model-selection metric.
    return [
        grid,
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds),
        average_precision_score(y_test, y_test_prob),
    ]

##### Random Forest

In [51]:
def random_forest(X_train, y_train, X_val, y_val, X_test, y_test, *, random_state=None):
    """Grid-search a class-balanced Random Forest and score the best estimator.

    Args:
        X_train, y_train: Features and labels the 10-fold grid search is fitted on.
        X_val, y_val: Split used for the returned accuracy and F1 score.
        X_test, y_test: Split used for the returned AUCPR (average precision).
        random_state: Optional seed forwarded to ``RandomForestClassifier`` so
            that repeated runs are reproducible. ``None`` (default) keeps the
            estimator's own default.

    Returns:
        list: ``[grid, val_accuracy, val_f1, test_aucpr]`` where ``grid`` is the
        fitted ``GridSearchCV`` (exposes ``best_estimator_`` / ``best_params_``).
    """
    # max_depth=10 is only a starting value: the grid below sets clf__max_depth itself.
    estimator = RandomForestClassifier(
        class_weight='balanced', max_depth=10, random_state=random_state
    )
    pipeline = Pipeline([('clf', estimator)])

    # Candidates are ranked by cross-validated F1.
    parameters = {
        'clf__criterion': ('gini', 'entropy'),
        'clf__max_features': ('sqrt', 'log2'),
        'clf__n_estimators': (10, 30, 50, 100, 200),
        'clf__max_depth': (10, 20),
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1', cv=10)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    val_preds = best.predict(X_val)
    # Average precision needs a continuous score: take the probability column at index 1.
    y_test_prob = best.predict_proba(X_test)[:, 1]

    # NOTE(review): AUCPR is computed on the test split while accuracy/F1 use the
    # validation split — confirm this is intended for a model-selection metric.
    return [
        grid,
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds),
        average_precision_score(y_test, y_test_prob),
    ]

##### Gradient Boosting

In [50]:
def grad_boost(X_train, y_train, X_val, y_val, X_test, y_test, *, random_state=None):
    """Grid-search a Gradient Boosting classifier and score the best estimator.

    Args:
        X_train, y_train: Features and labels the 10-fold grid search is fitted on.
        X_val, y_val: Split used for the returned accuracy and F1 score.
        X_test, y_test: Split used for the returned AUCPR (average precision).
        random_state: Optional seed forwarded to ``GradientBoostingClassifier``
            so that repeated runs are reproducible. ``None`` (default) keeps the
            estimator's own default.

    Returns:
        list: ``[grid, val_accuracy, val_f1, test_aucpr]`` where ``grid`` is the
        fitted ``GridSearchCV`` (exposes ``best_estimator_`` / ``best_params_``).
    """
    estimator = GradientBoostingClassifier(random_state=random_state)
    pipeline = Pipeline([('clf', estimator)])

    # Candidates are ranked by cross-validated F1.
    parameters = {
        'clf__loss': ('log_loss', 'exponential'),
        'clf__criterion': ('friedman_mse', 'squared_error'),
        'clf__max_features': ('sqrt', 'log2'),
        'clf__n_estimators': (50, 100, 200),
        'clf__max_depth': (5, 10),
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1', cv=10)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    val_preds = best.predict(X_val)
    # Average precision needs a continuous score: take the probability column at index 1.
    y_test_prob = best.predict_proba(X_test)[:, 1]

    # NOTE(review): AUCPR is computed on the test split while accuracy/F1 use the
    # validation split — confirm this is intended for a model-selection metric.
    return [
        grid,
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds),
        average_precision_score(y_test, y_test_prob),
    ]

##### Support Vector Machine

In [49]:
def SVM(X_train, y_train, X_val, y_val, X_test, y_test, *, random_state=None):
    """Grid-search a class-balanced SVC and score the best estimator.

    Args:
        X_train, y_train: Features and labels the 10-fold grid search is fitted on.
        X_val, y_val: Split used for the returned accuracy and F1 score.
        X_test, y_test: Split used for the returned AUCPR (average precision).
        random_state: Optional seed forwarded to ``SVC`` so that repeated runs
            are reproducible. ``None`` (default) keeps the estimator's own
            default.

    Returns:
        list: ``[grid, val_accuracy, val_f1, test_aucpr]`` where ``grid`` is the
        fitted ``GridSearchCV`` (exposes ``best_estimator_`` / ``best_params_``).
    """
    # probability=True is required for the predict_proba call below.
    estimator = SVC(class_weight='balanced', probability=True, random_state=random_state)
    pipeline = Pipeline([('clf', estimator)])

    # Candidates are ranked by cross-validated F1.
    parameters = {
        'clf__C': (0.1, 0.5, 1, 2, 10, 50, 100),
        'clf__kernel': ('linear', 'rbf', 'poly'),
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1', cv=10)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_

    val_preds = best.predict(X_val)
    # Average precision needs a continuous score: take the probability column at index 1.
    y_test_prob = best.predict_proba(X_test)[:, 1]

    # NOTE(review): AUCPR is computed on the test split while accuracy/F1 use the
    # validation split — confirm this is intended for a model-selection metric.
    return [
        grid,
        accuracy_score(y_val, val_preds),
        f1_score(y_val, val_preds),
        average_precision_score(y_test, y_test_prob),
    ]

#### Model Tracking using MLFlow

In [55]:
# Every benchmark is trained on the same six arrays, in the order listed here.
# Each value is the trainer's result list: [grid, accuracy, f1, aucpr].
_split_args = (X_train, train_y, X_val, val_y, X_test, test_y)
_trainers = (
    ("Naive Bayes", naive_bayes),
    ("Logistic Regression", logistic_regression),
    ("Random Forest", random_forest),
    ("Gradient Boosting", grad_boost),
    ("SVM", SVM),
)
MODELS = {label: train(*_split_args) for label, train in _trainers}

In [57]:
# Select the MLflow experiment that the runs below are logged under.
mlflow.set_experiment("Benchmark_models")
# A single test row, converted to a dense array, passed to log_model as the input example.
input_example = X_test[:1].toarray() 
# Maps model name -> best estimator; filled in by the logging loop.
best_models = {}

2025/03/04 20:30:21 INFO mlflow.tracking.fluent: Experiment with name 'Benchmark_models' does not exist. Creating a new experiment.


In [58]:
# Log one MLflow run per benchmark: best hyper-parameters, the three metrics,
# the fitted estimator as an artifact, and a registered-model version.
# Each MODELS value is [grid, accuracy, f1, aucpr], unpacked directly here.
for model_name, (grid, model_accuracy, model_f1_score, model_aucpr) in MODELS.items():
    with mlflow.start_run(run_name=model_name) as run:
        print(f"----------------------{model_name.upper()}---------------------")

        best_model = grid.best_estimator_
        best_models[model_name] = best_model

        model_best_params = grid.best_params_

        print(f"Best Params: {model_best_params}")
        print(f"Validation Accuracy: {model_accuracy:.4f}")
        print(f"F1 Score: {model_f1_score:.4f}")
        print(f"AUCPR Score: {model_aucpr:.4f}")

        mlflow.log_params(model_best_params)
        mlflow.log_metric("Accuracy", model_accuracy)
        mlflow.log_metric("F-Score", model_f1_score)
        mlflow.log_metric("AUCPR", model_aucpr)

        # The artifact path doubles as the last segment of the model URI used below.
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path=model_name,
            input_example=input_example,
        )

        # Register the artifact just logged in this run under the model's own name.
        mlflow.register_model(f"runs:/{run.info.run_id}/{model_name}", model_name)

        print(f"{model_name.upper()} logged and registered in MLflow.\n\n\n")

print("LOGGING AND REGISTERING COMPLETE !!!")

----------------------NAIVE BAYES---------------------
Best Params: {'clf__alpha': 1}
Validation Accuracy: 0.9630
F1 Score: 0.8458
AUCPR Score: 0.9429


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Naive Bayes' already exists. Creating a new version of this model...
Created version '2' of model 'Naive Bayes'.


NAIVE BAYES logged and registered in MLflow.



----------------------LOGISTIC REGRESSION---------------------
Best Params: {'clf__solver': 'liblinear'}
Validation Accuracy: 0.9736
F1 Score: 0.9035
AUCPR Score: 0.9590


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Logistic Regression' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic Regression'.


LOGISTIC REGRESSION logged and registered in MLflow.



----------------------RANDOM FOREST---------------------
Best Params: {'clf__criterion': 'gini', 'clf__max_depth': 20, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
Validation Accuracy: 0.9778
F1 Score: 0.9150
AUCPR Score: 0.9636


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Random Forest' already exists. Creating a new version of this model...
Created version '2' of model 'Random Forest'.


RANDOM FOREST logged and registered in MLflow.



----------------------GRADIENT BOOSTING---------------------
Best Params: {'clf__criterion': 'squared_error', 'clf__loss': 'log_loss', 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
Validation Accuracy: 0.9746
F1 Score: 0.8992
AUCPR Score: 0.9489


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Gradient Boosting' already exists. Creating a new version of this model...
Created version '2' of model 'Gradient Boosting'.


GRADIENT BOOSTING logged and registered in MLflow.



----------------------SVM---------------------
Best Params: {'clf__C': 1, 'clf__kernel': 'linear'}
Validation Accuracy: 0.9767
F1 Score: 0.9134
AUCPR Score: 0.9746


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

SVM logged and registered in MLflow.



LOGGING AND REGISTERING COMPLETE !!!


Registered model 'SVM' already exists. Creating a new version of this model...
Created version '2' of model 'SVM'.
