In [None]:
#imports
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the augmented e-mail dataset from disk.
df = pd.read_csv('../../data/analysis/emails_augmented.csv')

# Fail fast if either column needed below is absent from the CSV.
assert {'body_no_stopwords', 'label'}.issubset(df.columns), "Missing required columns."

# X: the 'body_no_stopwords' column (model input); y: the 'label' column (target).
X, y = df['body_no_stopwords'], df['label']

In [5]:
# Accumulator list; not used in the cells visible here -- presumably filled by
# later cells (TODO confirm).
results = []

# Hold out 20% of the rows for the final evaluation. The seed is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two-step pipeline: TF-IDF features over unigrams and bigrams
# (max_features=5000) followed by a logistic-regression classifier.
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    # Seed the classifier: the 'liblinear' and 'saga' solvers shuffle the data
    # internally, so without a fixed random_state the fitted model can differ
    # between runs even though the split and the CV folds are already seeded.
    ('classifier', LogisticRegression(random_state=42))
])

# Two sub-grids because the solvers are searched with different penalties:
# 'liblinear' with both L1 and L2, 'saga' with L2 only.
# 8 + 4 = 12 candidates in total.
param_grid = [
    {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear'],
        'classifier__max_iter': [500]
    },
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['saga'],
        'classifier__max_iter': [1000]
    }
]

# 10-fold shuffled cross-validation with a fixed seed, so every candidate is
# scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over param_grid, ranking candidates by mean CV accuracy.
# NOTE(review): n_jobs=16 is a machine-specific worker count -- adjust to the
# host's core count if needed.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=16
)
grid_search.fit(X_train, y_train)

# Best-scoring pipeline; with GridSearchCV's default refit it has been
# re-fitted on the whole training split.
best_pipeline = grid_search.best_estimator_

In [7]:
# Predict labels for the held-out split with the tuned pipeline, then print
# the per-class precision / recall / F1 report.
y_pred = best_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7815
           1       0.98      0.98      0.98      8613

    accuracy                           0.98     16428
   macro avg       0.98      0.98      0.98     16428
weighted avg       0.98      0.98      0.98     16428



In [8]:
# Persist the tuned pipeline to disk. The dump call is kept as the cell's final
# expression so the list of written file names is echoed as the cell output.
joblib.dump(
    value=best_pipeline,
    filename='../../output/models/LogisticRegression.joblib',
)

['../../output/models/LogisticRegression.joblib']