In [None]:
import mlflow
import mlflow.sklearn
from mlflow.data.numpy_dataset import from_numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
import os

%matplotlib inline
sns.set_style("whitegrid")

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score



### Globals

In [2]:
# Plot styling shared by matplotlib and seaborn.
COLOR_MAP = "viridis"  # alternative: "Greens"
plt.rcParams["image.cmap"] = COLOR_MAP
sns.set_theme(style="whitegrid", palette=COLOR_MAP)

# Project root is the parent of the notebook's working directory.
current_dir = Path.cwd().resolve().parent
RAW_DATA_PATH = current_dir / "data" / "SMSSpamCollection.csv"

# Local file-based MLflow store kept next to the data folder.
MLFLOW_TRACKING_URI = str(current_dir / "mlruns")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# All subsequent runs are stored under this experiment.
mlflow.set_experiment("spam_filter_basline")

<Experiment: artifact_location='/home/tiggi/Documents/IU_projects/model_engineering/spam/mlruns/378166278007602301', creation_time=1759241664366, experiment_id='378166278007602301', last_update_time=1759241664366, lifecycle_stage='active', name='spam_filter_basline', tags={}>

### Model Setting

In [3]:
# Shared text-cleaning resources, created once and reused for every message.
# NOTE(review): these calls presumably need the NLTK "wordnet" and "stopwords" data
# to be available locally; no nltk.download(...) call is visible in this notebook — confirm.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in the cleaner is cheap.
stop_words = set(stopwords.words("english"))

def clean_special_chars(text):
    """Normalize one message: lowercase, tokenize, drop stop words, lemmatize verbs.

    Parameters
    ----------
    text : object
        The raw message. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN, which is how pandas
        represents an empty cell) are treated as an empty message.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; ``""`` for
        missing input.
    """
    # NaN is the only float that is not equal to itself; without this check a
    # missing cell would be stringified to the bogus token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    tokens = word_tokenize(lowered)  # punctuation and numbers stay as separate tokens
    kept = (tok for tok in tokens if tok not in stop_words)
    # pos="v" lemmatizes every token as a verb.
    return " ".join(lemmatizer.lemmatize(tok, pos="v") for tok in kept)

class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that applies ``clean_special_chars`` to every document.

    Accepts a pandas Series, a single-column DataFrame, or any one-dimensional
    sequence/array of documents, so the fitted pipeline can also be called with
    plain lists or with the DataFrame-shaped inputs used when serving the model.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline chaining."""
        return self

    def transform(self, X):
        """Return a pandas Series holding the cleaned version of each document.

        Raises
        ------
        ValueError
            If ``X`` is a DataFrame with more than one column, because it is
            then ambiguous which column holds the text.
        """
        if isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError(
                    f"TextCleaner expects a single text column, got {X.shape[1]} columns"
                )
            # DataFrame.apply would hand whole columns to the cleaner, so pull
            # out the one text column and clean it row by row.
            X = X.iloc[:, 0]
        elif not isinstance(X, pd.Series):
            values = np.asarray(X, dtype=object)
            if values.ndim == 2 and values.shape[1] == 1:
                values = values.ravel()  # column vector -> flat list of documents
            X = pd.Series(values)
        return X.apply(clean_special_chars)


In [4]:
# Search space for the manual grid search: every combination of the first five
# entries is cross-validated; "scores" lists the metric names that are evaluated.

param_grid = {
    "vectorizers" : [CountVectorizer, TfidfVectorizer],   # vectorizer classes, instantiated per combination
    "ngram_ranges" : [(1,1), (1,2)],                      # unigrams only, or unigrams plus bigrams
    "max_dfs" : [0.95, 0.9],                              # ignore terms found in more than this share of documents
    "min_dfs" : [1, 5],                                   # ignore terms found in fewer than this many documents
    "alphas" : [1.0, 0.1],                                # MultinomialNB additive smoothing strength
    "scores": ["accuracy", "f1_macro", "precision_macro", "recall_macro", "roc_auc"]   # scikit-learn scorer names
}

In [None]:
# Baseline pipeline: clean text -> vectorize -> Naive Bayes.
# The "vectorizer" step and the classifier's alpha are placeholders that the
# search loop replaces through set_params.
pipe = Pipeline(
    steps=[
        ("cleaner", TextCleaner()),
        ("vectorizer", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

In [65]:
# NOTE(review): scratch cell that only demonstrates dict.values(); `params` does not
# appear to be used by any other cell visible in this notebook — candidate for removal.
params = {"a": "b", "c":'d', 'f':'g'}

list(params.values())

['b', 'd', 'g']

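StratifiedKFold keeps roughly the same ham/spam ratio in every fold as in the full dataset.
Cross-validation is used to make better use of the limited training data: every training sample is used for fitting in some folds and for evaluation in another, which gives a more reliable score than a single split.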

In [6]:
# Fixed cross-validation scheme: 5 stratified folds, shuffled with a fixed seed.
# The same splitter object is passed to every cross_validate call in the search.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=90)

In [7]:
# set up GridSearchCV to explore the parameters with cross-validation (not used; a manual search is run instead):
#from sklearn.model_selection import train_test_split, GridSearchCV
#grid = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)


### Load data

In [7]:
# The raw file is tab-separated and has no header row, so column names are supplied here.
data = pd.read_csv(
    RAW_DATA_PATH,
    sep="\t",
    header=None,
    names=["label", "text"],
    encoding="utf-8",
)


In [69]:
# Peek at the third message in the file as a sanity check.
data["text"].iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [8]:
# Stage 1: hold out 25% of the data, keeping the label ratio intact.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data["text"],
    data["label"],
    test_size=0.25,
    random_state=42,
    stratify=data["label"],
)

# Stage 2: split the holdout 80/20 into validation and test
# (i.e. about 20% and 5% of the full dataset).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.2,
    random_state=42,
    stratify=y_holdout,
)

### train and find the best parameters

In [None]:
# Manual grid search: cross-validate every vectorizer / Naive Bayes combination on
# the training split and record each combination as its own MLflow run.

# Dataset descriptors for MLflow. from_numpy works on arrays, so the pandas
# Series are converted explicitly.
train_input = from_numpy(X_train.to_numpy(), name="train_data", targets=y_train.to_numpy())
val_input = from_numpy(X_val.to_numpy(), name="val_data", targets=y_val.to_numpy())
test_input = from_numpy(X_test.to_numpy(), name="test_data", targets=y_test.to_numpy())

# Metric names evaluated in every fold (kept in one place in param_grid).
scoring = param_grid["scores"]

# Every combination of the tunable settings, in nested-loop order.
search_space = [
    (vect_class, ngram, max_df, min_df, alpha)
    for vect_class in param_grid["vectorizers"]
    for ngram in param_grid["ngram_ranges"]
    for max_df in param_grid["max_dfs"]
    for min_df in param_grid["min_dfs"]
    for alpha in param_grid["alphas"]
]

# The datasets are logged once on a parent run. Calling mlflow.log_input with no
# active run would implicitly start one and leave it open, which makes the
# per-combination start_run calls fail; the combinations are therefore recorded
# as nested child runs of this parent.
with mlflow.start_run(run_name="grid_search"):
    mlflow.log_input(train_input, context="training")
    mlflow.log_input(val_input, context="validation")
    mlflow.log_input(test_input, context="testing")

    for vect_class, ngram, max_df, min_df, alpha in search_space:
        # Configure the shared pipeline for this combination; cross_validate
        # fits clones, so the folds do not leak state into each other.
        vect = vect_class(ngram_range=ngram, max_df=max_df, min_df=min_df)
        pipe.set_params(vectorizer=vect, clf__alpha=alpha)

        # The vectorizer name is part of the run name so that Count and Tfidf
        # runs with otherwise equal settings can be told apart.
        run_name = (
            f"params {vect_class.__name__}, ngram={ngram}, "
            f"maxdf={max_df}, mindf={min_df}, alpha={alpha}"
        )
        with mlflow.start_run(run_name=run_name, nested=True):
            scores = cross_validate(
                pipe,
                X_train,
                y_train,
                cv=skf,
                scoring=scoring,
                return_train_score=True,
            )

            mlflow.log_params(
                {
                    "vectorizer": vect_class.__name__,
                    "ngram_range": ngram,
                    "max_df": max_df,
                    "min_df": min_df,
                    "alpha": alpha,
                }
            )

            # Mean over the folds of every train_* / test_* score.
            mean_metrics = {
                f"{split}_{metric}": float(np.mean(scores[f"{split}_{metric}"]))
                for split in ("train", "test")
                for metric in scoring
            }
            mlflow.log_metrics(mean_metrics)

            mean_f1 = mean_metrics["test_f1_macro"]
            print(
                f"{vect_class.__name__}, ngram={ngram}, max_df={max_df}, "
                f"min_df={min_df}, alpha={alpha}, F1={mean_f1:.3f}"
            )
 

In [30]:
# Rank all runs of the active experiment: macro precision first, macro F1 as tie-breaker.
best_run = mlflow.search_runs(
    order_by=[
        "metrics.test_precision_macro DESC",
        "metrics.test_f1_macro DESC",
    ]
)
# The first row of the returned frame is the top-ranked run.
best_run_id = best_run.loc[0, "run_id"]
best_run_id

'74412e79045745ca987a981d871eba2a'

In [35]:
# Fetch the full run record for the winning run id.
client = mlflow.tracking.MlflowClient()
best_run = client.get_run(best_run_id)

# Logged parameters come back as strings (see the output below), so they
# must be converted before being passed to scikit-learn.
best_params = best_run.data.params
best_params 

{'vectorizer': 'TfidfVectorizer',
 'max_df': '0.9',
 'ngram_range': '(1, 2)',
 'min_df': '1',
 'alpha': '0.1'}

In [38]:
# Inspection only: the grid stores vectorizer classes, while MLflow stores just their names.
param_grid['vectorizers']

[sklearn.feature_extraction.text.CountVectorizer,
 sklearn.feature_extraction.text.TfidfVectorizer]

In [49]:
def retrain_and_log_best_model(best_run_id, pipe, X_train, y_train, X_val, y_val):
    """Refit the pipeline with the best run's hyper-parameters and log it to MLflow.

    The parameters stored on ``best_run_id`` are read back from MLflow,
    converted from their string form, and applied to ``pipe``. The pipeline is
    then fitted on the training data inside a new run named ``"best_model"``,
    scored on the validation data, and logged as an MLflow sklearn model.

    Parameters
    ----------
    best_run_id : str
        MLflow run id whose logged params ("vectorizer", "ngram_range",
        "max_df", "min_df", "alpha") describe the configuration to rebuild.
    pipe : sklearn.pipeline.Pipeline
        Pipeline with "vectorizer" and "clf" steps; it is modified and fitted
        in place.
    X_train, y_train
        Training documents and labels.
    X_val, y_val
        Validation documents (a pandas Series) and labels.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The same ``pipe`` object, fitted on the training data.

    Raises
    ------
    KeyError
        If the run lacks one of the expected params or names an unknown
        vectorizer.
    """
    import ast  # only needed here, to parse the stringified ngram tuple

    best_params = mlflow.tracking.MlflowClient().get_run(best_run_id).data.params

    # MLflow stores only the class name, so map it back to the class.
    vectorizer_map = {
        "CountVectorizer": CountVectorizer,
        "TfidfVectorizer": TfidfVectorizer,
    }
    vect_class = vectorizer_map[best_params["vectorizer"]]
    vect = vect_class(
        # literal_eval parses "(1, 2)" without executing arbitrary code,
        # unlike eval on a value read from the tracking store.
        ngram_range=ast.literal_eval(best_params["ngram_range"]),
        max_df=float(best_params["max_df"]),
        min_df=int(best_params["min_df"]),
    )
    pipe.set_params(vectorizer=vect, clf__alpha=float(best_params["alpha"]))

    with mlflow.start_run(run_name="best_model"):
        # Single fit, inside the run so training belongs to the logged run.
        pipe.fit(X_train, y_train)

        y_val_pred = pipe.predict(X_val)
        # Column 1 holds the probability of the second (sorted) class label.
        y_val_proba = pipe.predict_proba(X_val)[:, 1]

        precision = precision_score(y_val, y_val_pred, average="macro")
        f1 = f1_score(y_val, y_val_pred, average="macro")
        recall = recall_score(y_val, y_val_pred, average="macro")
        roc_auc = roc_auc_score(y_val, y_val_proba)

        mlflow.log_metric("precision_macro", precision)
        mlflow.log_metric("f1_macro", f1)
        mlflow.log_metric("recall_macro", recall)
        mlflow.log_metric("roc_auc", roc_auc)

        mlflow.sklearn.log_model(pipe, name="model", input_example=X_val[:5].to_frame(name="text"))

        print(f"Final model trained and logged. Precision: {precision:.3f}, F1: {f1:.3f}, ROC-AUC: {roc_auc}")

    return pipe

In [50]:
# Close any MLflow run that is still active so the "best_model" run below can be started.
# NOTE(review): presumably needed because an earlier cell left a run open — confirm.
mlflow.end_run()

In [51]:
# Refit the winning configuration on the training split and score it on the validation split.
trained_pipeline = retrain_and_log_best_model(
    best_run_id, pipe, X_train, y_train, X_val, y_val
)



Final model trained and logged. Precision: 0.988, F1: 0.966, ROC-AUC: 0.9933195020746888
