In [12]:
#imports
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin

In [22]:
# Read the augmented e-mail dataset and keep only its first 5000 rows.
df = pd.read_csv('../../data/analysis/emails_augmented.csv').head(5000)

# Fail fast if the text column or the label column is absent.
required_columns = {'body_no_stopwords', 'label'}
assert required_columns.issubset(df.columns), "Missing required columns."

# Features are the stop-word-free bodies; the target is the label column.
X, y = df['body_no_stopwords'], df['label']

In [23]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Sanity check: number of missing training texts, then a preview of them.
print(X_train.isna().sum(), X_train.head(), sep='\n')

0
4227    greetings ok time think got things right clean...
4676    dear advertiser unable process payment ads sus...
800     ihpitjoocyoozmkiyweaxdymadriihpitjoocyoozmkiyw...
3671    feb pm anton moiseev wrote dgrag drop plasmoid...
4193    deliver intense incredible pleasure woman righ...
Name: body_no_stopwords, dtype: object


In [24]:
# Custom transformer, accepting model_name for flexibility
class SBERTTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds texts with a SentenceTransformer.

    Parameters
    ----------
    model_name : str, default='all-MiniLM-L6-v2'
        Name handed to ``SentenceTransformer`` when the encoder is loaded in
        ``fit``.
    batch_size : int, default=32
        Batch size forwarded to ``SentenceTransformer.encode`` in ``transform``.

    Attributes
    ----------
    model : SentenceTransformer or None
        The loaded encoder; stays ``None`` until ``fit`` has been called.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', batch_size=32):
        self.model_name = model_name
        self.batch_size = batch_size
        self.model = None  # populated by fit()

    def fit(self, X, y=None):
        """Load the encoder named by ``model_name``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and are
        not used. Returns ``self``.
        """
        # Imported here so sentence_transformers is only needed once fit() runs.
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(self.model_name)
        return self

    def transform(self, X):
        """Encode the items of ``X`` and return the result of ``encode``.

        ``X`` is materialised with ``list(X)`` before being passed to the
        encoder, and the progress bar is disabled.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.model is None:
            # NotFittedError subclasses both ValueError and AttributeError, so
            # callers that caught the former bare AttributeError still work.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This SBERTTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return self.model.encode(
            list(X),
            batch_size=self.batch_size,
            show_progress_bar=False,
        )

In [25]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder once; both splits are embedded with it.
model = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is fed plain Python lists built from the pandas Series.
X_train_list, X_test_list = X_train.tolist(), X_test.tolist()

# Same encoding settings for both splits.
encode_options = {'batch_size': 64, 'show_progress_bar': True}
X_train_emb = model.encode(X_train_list, **encode_options)
X_test_emb = model.encode(X_test_list, **encode_options)

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 63/63 [11:06<00:00, 10.58s/it]
Batches: 100%|██████████| 16/16 [02:57<00:00, 11.07s/it]


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

# Single-step pipeline: it is fitted on the precomputed embeddings below, so
# only the classifier is tuned.
# random_state is fixed because the 'liblinear' and 'saga' solvers searched
# below shuffle the data; without a seed the search is not reproducible.
pipe = Pipeline([
    ('classifier', LogisticRegression(random_state=42))
])

# Two sub-grids, one per solver, each listing only the penalties tried with it.
param_grid = [
    {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear'],
        'classifier__max_iter': [500]
    },
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['saga'],
        'classifier__max_iter': [1000]
    }
]

# 10-fold cross-validation with a seeded shuffle.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1          # use every available core instead of a hard-coded count
)
grid_search.fit(X_train_emb, y_train)

# Best parameter combination, refitted on the whole training set (refit=True is the default).
best_pipeline = grid_search.best_estimator_

In [27]:
# Score the tuned pipeline on the held-out embeddings.
y_pred = best_pipeline.predict(X_test_emb)

# Per-class precision / recall / F1 for the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       426
           1       0.98      0.99      0.99       574

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [6]:
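# NOTE(review): this dump call is commented out, yet the output recorded below
# shows it ran at some point. Confirm whether the tuned pipeline should still be saved.
#joblib.dump(best_pipeline, '../../output/models/LogisticRegression.joblib')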

['../../output/models/LogisticRegression.joblib']