In [None]:
import yaml 
import importlib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import pickle

In [47]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin, BaseEstimator

In [48]:

# Module-level NLTK helpers, built once and shared by every call to
# TextPreprocessor.clean_text (which reads both names as globals).
stop_words = set(stopwords.words('english'))  # set => O(1) membership tests
lemmatizer = WordNetLemmatizer()

# Custom transformer for text cleaning
class TextPreprocessor(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that normalises raw message text.

    The transformer learns nothing in ``fit``; ``transform`` applies
    ``clean_text`` to every document. It relies on the module-level
    ``stop_words`` set and ``lemmatizer`` instance.

    Mixins are listed before ``BaseEstimator`` as recommended by the
    scikit-learn developer guide.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or other 1-D iterable of documents
            A Series is processed as-is (its index is preserved). Any other
            input (list, tuple, NumPy array, ...) is wrapped in a Series
            first, because the cleaning is applied with ``Series.apply``.
        y : ignored

        Returns
        -------
        pandas.Series
            The cleaned documents, one string per input document.
        """
        if not isinstance(X, pd.Series):
            # Plain lists / arrays have no ``.apply``; wrap them so callers
            # such as ``model.predict(["some text"])` do not crash.
            X = pd.Series(X)
        return X.apply(self.clean_text)

    def clean_text(self, text):
        """Return the normalised form of a single document.

        Steps, in order: lower-case; replace newlines with spaces; remove
        runs of digits; remove every character that is not a word character,
        whitespace, ``$`` or ``!``; split on whitespace; drop tokens found in
        ``stop_words``; lemmatize the remaining tokens; re-join with single
        spaces.

        Missing values (``None`` / NaN / ``pd.NA``) are treated as an empty
        document and other non-string values are converted with ``str``.
        """
        if not isinstance(text, str):
            # A float NaN from a blank CSV cell has no ``.lower``; map missing
            # scalars to "" and stringify anything else.
            is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
            text = "" if is_missing else str(text)

        text = text.lower()
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s$!]', '', text)
        # Stop-word filtering happens before lemmatization, i.e. on the raw
        # (lower-cased) tokens.
        words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
        return " ".join(words)


In [49]:


class DenseTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that converts a sparse feature matrix to a dense array.

    Inherits ``BaseEstimator`` so the step exposes ``get_params`` /
    ``set_params`` like any other scikit-learn estimator. The transformer has
    no parameters and learns nothing.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns ``self``. Extra ``fit_params`` are ignored."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()`` when ``X`` provides it, otherwise ``X`` unchanged.

        Input without a ``toarray`` method is assumed to be dense already and
        is passed through instead of raising ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return X

In [50]:
# Names of the CSV columns holding the message text and the class label.
TEXT_COLUMN ="Message"
TARGET_COLUMN ="label"

# Path is relative to the directory the notebook kernel is started from.
df = pd.read_csv("notebooks/spamham.csv")

# Fail fast with a clear message if the file does not have the expected
# schema, instead of a KeyError in a later cell.
missing_columns = {TEXT_COLUMN, TARGET_COLUMN} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"notebooks/spamham.csv is missing expected column(s): {sorted(missing_columns)}"
    )

In [51]:
# Hold out 27% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COLUMN],
    df[TARGET_COLUMN],
    test_size=0.27,
    random_state=42,
)

# Load model.yaml
# A forward slash resolves on Windows, Linux and macOS, whereas the previous
# r"config\model.yaml" only worked on Windows. The encoding is pinned so the
# result does not depend on the platform default.
with open("config/model.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [52]:
# Sections of the YAML config: GridSearchCV settings and the candidate models.
grid_config = config['grid_search']
model_config = config['model_selection']

# Trackers for the best pipeline found by the model-selection loop.
best_model = None
# Start at -inf (not 0) so the first finished search is always recorded, even
# when its best score is zero or negative.
best_score = float("-inf")
best_model_name =""

In [53]:
# Classifiers that are fed a dense array (a DenseTransformer step is inserted
# before them) rather than the sparse TF-IDF matrix.
DENSE_ONLY_CLASSIFIERS = {"GaussianNB"}


def _build_pipeline(classifier, requires_dense):
    """Assemble preprocess -> tfidf -> (optional to_dense) -> classifier."""
    steps = [
        ("preprocess", TextPreprocessor()),
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, sublinear_tf=True)),
    ]
    if requires_dense:
        steps.append(("to_dense", DenseTransformer()))
    steps.append(("classifier", classifier))
    return Pipeline(steps)


# Loop through the defined models. Only the definitions are needed; the
# config keys themselves are not used.
for model_def in model_config.values():
    module_name = model_def['module']
    class_name = model_def["class"]
    # A model entry without (or with a null) search grid is searched with an
    # empty grid, i.e. a single candidate using the classifier's defaults.
    params_grid = model_def.get("search_param_grid") or {}

    # Dynamically import the module and class
    model_class = getattr(importlib.import_module(module_name), class_name)
    print(f"Training model: {class_name}")

    pipeline = _build_pipeline(
        classifier=model_class(),
        requires_dense=class_name in DENSE_ONLY_CLASSIFIERS,
    )

    grid_search = GridSearchCV(
        estimator=pipeline,
        # Prefix each hyper-parameter with the pipeline step name so it is
        # routed to the "classifier" step.
        param_grid={f"classifier__{key}": value for key, value in params_grid.items()},
        cv=grid_config['params']['cv'],
        n_jobs=-1,
        verbose=grid_config['params']['verbose'],
    )

    print("Starting training...")

    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {class_name}: {grid_search.best_params_}")
    print(f"Best cross-validation score for {class_name}: {grid_search.best_score_}")

    # Keep the refit pipeline of whichever model has the highest mean CV score
    # so far (strictly greater: ties keep the earlier model).
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_model_name = class_name


Training model: MultinomialNB
Starting training...
Fitting 6 folds for each of 5 candidates, totalling 30 fits
Best parameters for MultinomialNB: {'classifier__alpha': 0.01}
Best cross-validation score for MultinomialNB: 0.9618692049206244
Training model: GaussianNB
Starting training...
Fitting 6 folds for each of 3 candidates, totalling 18 fits
Best parameters for GaussianNB: {'classifier__var_smoothing': 0.059096}
Best cross-validation score for GaussianNB: 0.914742128294196
Training model: SVC
Starting training...
Fitting 6 folds for each of 5 candidates, totalling 30 fits
Best parameters for SVC: {'classifier__C': 1.22, 'classifier__kernel': 'linear'}
Best cross-validation score for SVC: 0.9740872878390542


In [54]:
from sklearn.metrics import f1_score, classification_report, accuracy_score, r2_score

# Evaluate the selected pipeline on the held-out test split.
# Guard against running this cell before (or after a failed) model selection,
# which would otherwise surface as an opaque AttributeError on None.
if best_model is None:
    raise RuntimeError(
        "No model has been selected yet; run the model-selection cell before evaluating."
    )

y_pred = best_model.predict(X_test)
# f1_score is called with its defaults.
# NOTE(review): that presumably means the label column is encoded 0/1 with 1 = spam - confirm.
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best model: {best_model_name} with F1 score: {f1} and accuracy: {accuracy}")


Best model: SVC with F1 score: 0.9387069689336692 and accuracy: 0.973502722323049


In [56]:
import joblib
from pathlib import Path

# Persist the winning pipeline (preprocessing + vectorizer + classifier).
MODEL_PATH = "artifact/best_model.pkl"
# joblib.dump does not create missing directories; make sure "artifact/"
# exists so a fresh checkout does not fail here.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the pipeline contains TextPreprocessor, a class defined in this
# notebook - presumably the loading code must have the same class importable.
joblib.dump(best_model, MODEL_PATH)
print(f"Best model saved: {best_model_name}")


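# Pasted log from a separate run that used 5-fold CV (the training cell output above reports 6 folds):
# Training model: MultinomialNB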
# Starting training...

# Training model: MultinomialNB
# Fitting 5 folds for each of 5 candidates, totalling 25 fits
# Best parameters for MultinomialNB: {'classifier__alpha': 0.05}
# Best cross-validation score for MultinomialNB: 0.9664341186598817
# Training model: GaussianNB
# Starting training...

# Training model: GaussianNB
# Fitting 5 folds for each of 3 candidates, totalling 15 fits
# Best parameters for GaussianNB: {'classifier__var_smoothing': 0.059096}
# Best cross-validation score for GaussianNB: 0.9219919679438927
# Training model: SVC
# Starting training...

# Training model: SVC
# Fitting 5 folds for each of 5 candidates, totalling 25 fits
# Best parameters for SVC: {'classifier__C': 1.22, 'classifier__kernel': 'linear'}
# Best cross-validation score for SVC: 0.977712802159911

Best model saved: SVC
