In [1]:
#imports
import joblib
import pandas as pd
import numpy as np
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [2]:
def clean_email(text: str) -> str:
    """Normalize a raw email body into lowercase plain text.

    Steps, in order: strip HTML tags, strip URLs, strip digit runs, strip
    ASCII punctuation, collapse whitespace, lowercase and trim.

    Args:
        text: Raw email body.

    Returns:
        The cleaned text; an empty string if nothing survives cleaning.
    """
    # Replace tags with a space (not nothing) so words separated only by
    # markup, e.g. "Hello<br>World", do not fuse into a single token.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Case-insensitive: lowercasing happens last, so "HTTP://..." would
    # otherwise slip through and be mangled into a junk token.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    # TODO: add a count of URLs to email data
    text = re.sub(r'\d+', '', text)  # remove numerical text
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # The removals above leave runs of blanks behind; squeeze them.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``clean_email`` column with the cleaned text of ``body``.

    Missing bodies are treated as empty strings; casting NaN straight to
    ``str`` would yield the literal text "nan", which would then show up
    as a bogus token for every email without a body.

    Args:
        df: Frame containing a ``body`` column.

    Returns:
        The same frame object, modified in place, with ``clean_email`` added.
    """
    bodies = df['body'].fillna('').astype(str)
    df['clean_email'] = bodies.apply(clean_email)
    return df

In [3]:
# Load the email dataset, verify the columns this notebook relies on, and
# derive the cleaned-text feature column.
df = pd.read_csv('../../data/analysis/emails_augmented.csv')
assert {'body', 'label'}.issubset(df.columns), "Missing required columns."
df = preprocess(df)

# Features are the cleaned email text; targets are the labels.
X, y = df['clean_email'], df['label']

In [4]:
# Hold out 20% of the rows for final evaluation; the seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# NOTE(review): `results` is not referenced by any cell visible here.
results = []

In [5]:
# Text-classification pipeline: TF-IDF over unigrams and bigrams (vocabulary
# capped at 5000 terms) feeding a random forest.
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    # Seed the forest: without it the bootstrap samples and feature draws
    # differ on every run, so the selected hyperparameters and the reported
    # metrics would not be reproducible even though the split and CV are seeded.
    ('classifier', RandomForestClassifier(random_state=42))
])

# Exhaustive search space: 2 * 3 * 2 * 2 = 24 candidate configurations.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__min_samples_leaf': [1, 2]
}

# Shuffled 10-fold CV with a fixed seed, so every candidate is scored on the
# same folds (24 candidates x 10 folds = 240 fits, plus the final refit).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    # NOTE(review): worker count is hard-coded; adjust to the cores actually
    # allocated on the machine running this notebook.
    n_jobs=16
)
grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True this is the best configuration
# refit on the whole training split.
best_pipeline = grid_search.best_estimator_

Exception ignored in: <function ResourceTracker.__del__ at 0x14acec003920>
Traceback (most recent call last):
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x151bfa72f920>
Traceback (most recent call last):
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/res

In [6]:
# Score the tuned pipeline on the held-out split and show per-class
# precision / recall / F1.
y_pred = best_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7815
           1       0.98      0.97      0.98      8613

    accuracy                           0.98     16428
   macro avg       0.98      0.98      0.98     16428
weighted avg       0.98      0.98      0.98     16428



In [7]:
# Persist the fitted pipeline (vectorizer + classifier) so it can be reloaded
# for inference; the call returns the list of files written.
joblib.dump(
    value=best_pipeline,
    filename='../../output/models/RandomForest.joblib',
)

['../../output/models/RandomForest.joblib']

Exception ignored in: <function ResourceTracker.__del__ at 0x14d08ddb7920>
Traceback (most recent call last):
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x14cb6ccbb920>
Traceback (most recent call last):
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/usr4/spclpgm/andrewhl/miniconda3/envs/project/lib/python3.12/multiprocessing/res