In [4]:
#imports
import joblib
import pandas as pd
import numpy as np
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [None]:
df = pd.read_csv('../../data/analysis/emails_augmented.csv') 
assert 'body_no_stopwords' in df.columns and 'label' in df.columns, "Missing required columns."
X = df['body_no_stopwords']
y = df['label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('classifier', MultinomialNB()) 
])

param_grid = {
    'classifier__alpha': [0.01, 0.1, 0.5, 1.0],  
    'classifier__fit_prior': [True, False]
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipe,  
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train) 

best_pipeline = grid_search.best_estimator_

In [10]:
y_pred = best_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      7815
           1       0.98      0.89      0.93      8613

    accuracy                           0.93     16428
   macro avg       0.93      0.93      0.93     16428
weighted avg       0.94      0.93      0.93     16428



In [11]:
joblib.dump(best_pipeline, '../../output/models/MultinomialNB.joblib')

['../../output/models/MultinomialNB.joblib']