In [None]:
## Using TF-IDF
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Parallel, delayed
# Binary target: 1 for rows whose 'Email Type' is 'Phishing Email', 0 otherwise.
data['Label'] = (data['Email Type'] == 'Phishing Email').astype('int64')

# Candidate classifiers, keyed by the name reported in the results table.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    MultinomialNB=MultinomialNB(),
)

# TF-IDF text features with the vocabulary capped at 3000 terms, converted
# from the sparse result to a dense NumPy array.
# NOTE(review): the vectorizer is fitted on every row of `data`, i.e. before
# any train/test split is made - confirm this is acceptable for the metrics.
vectorizer = TfidfVectorizer(max_features=3000)
X_vec = vectorizer.fit_transform(data['Processed_Text']).toarray()

y = data['Label']
def evaluate_model(X, y, model, test_size=0.2, random_state=42, stratify=False):
    """Fit ``model`` on a train split of ``(X, y)`` and score it on the held-out rows.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels aligned with the rows of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    stratify : bool, default=False
        When True the split is made with ``stratify=y``. The default keeps the
        plain (unstratified) split.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
# Display label -> DataFrame column name for each structure feature.
# Every column name is its label with spaces replaced by underscores.
structure_theories = {
    label: label.replace(' ', '_')
    for label in ('Basic Structure', 'SVO Structure', 'POS Complexity')
}
# Placeholder; reassigned once the evaluations have run.
results = list()
def process_combination(model_name, structure_name, structure_col):
    """Score one classifier on the text features plus a single structure column.

    Reads the module-level ``models``, ``X_vec``, ``data`` and ``y``.

    Parameters
    ----------
    model_name : str
        Key into ``models`` selecting the estimator to fit.
    structure_name : str
        Label copied into the 'Structure Theory' field of the result.
    structure_col : str
        Column of ``data`` appended to ``X_vec`` as one extra feature.

    Returns
    -------
    dict
        Keys 'Model', 'Structure Theory', 'Accuracy', 'Precision', 'Recall'
        and 'F1 Score'.
    """
    estimator = models[model_name]
    extra_feature = data[structure_col].values
    # column_stack appends the 1-D structure values as a trailing column.
    features = np.column_stack((X_vec, extra_feature))
    acc, prec, rec, f1_value = evaluate_model(features, y, estimator)
    field_names = ('Model', 'Structure Theory', 'Accuracy', 'Precision', 'Recall', 'F1 Score')
    field_values = (model_name, structure_name, acc, prec, rec, f1_value)
    return dict(zip(field_names, field_values))
# Every classifier paired with every structure feature.
combinations = [
    (clf_name, theory_label, theory_col)
    for clf_name in models
    for theory_label, theory_col in structure_theories.items()
]

# Hand the pairings to joblib; n_jobs=-1 asks for all available CPUs.
results = Parallel(n_jobs=-1)(
    delayed(process_combination)(*combo) for combo in combinations
)

# Rank by F1 score, best first, and renumber the rows from zero.
result_df = pd.DataFrame(results).sort_values(
    'F1 Score', ascending=False, ignore_index=True
)
print("\n🔍 Model Evaluation Results with Structure Theories:", result_df, sep="\n")

In [None]:
## Using CountVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from joblib import Parallel, delayed
# Binary target: 1 for rows whose 'Email Type' is 'Phishing Email', 0 otherwise.
data['Label'] = (data['Email Type'] == 'Phishing Email').astype('int64')

# Candidate classifiers, keyed by the name reported in the results table.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    MultinomialNB=MultinomialNB(),
)

# Raw term-count features with the vocabulary capped at 3000 terms, converted
# from the sparse result to a dense NumPy array.
# NOTE(review): the vectorizer is fitted on every row of `data`, i.e. before
# any train/test split is made - confirm this is acceptable for the metrics.
vectorizer = CountVectorizer(max_features=3000)
X_vec = vectorizer.fit_transform(data['Processed_Text']).toarray()

y = data['Label']
def evaluate_model(X, y, model, test_size=0.2, random_state=42, stratify=False):
    """Fit ``model`` on a train split of ``(X, y)`` and score it on the held-out rows.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels aligned with the rows of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    stratify : bool, default=False
        When True the split is made with ``stratify=y``. The default keeps the
        plain (unstratified) split.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )
# Display label -> DataFrame column name for each structure feature.
# Every column name is its label with spaces replaced by underscores.
structure_theories = {
    label: label.replace(' ', '_')
    for label in ('Basic Structure', 'SVO Structure', 'POS Complexity')
}
# Placeholder; reassigned once the evaluations have run.
results = list()
def process_combination(model_name, structure_name, structure_col):
    """Score one classifier on the text features plus a single structure column.

    Reads the module-level ``models``, ``X_vec``, ``data`` and ``y``.

    Parameters
    ----------
    model_name : str
        Key into ``models`` selecting the estimator to fit.
    structure_name : str
        Label copied into the 'Structure Theory' field of the result.
    structure_col : str
        Column of ``data`` appended to ``X_vec`` as one extra feature.

    Returns
    -------
    dict
        Keys 'Model', 'Structure Theory', 'Accuracy', 'Precision', 'Recall'
        and 'F1 Score'.
    """
    model = models[model_name]
    # column_stack appends the 1-D structure values as a trailing column.
    # The closing parenthesis of this call was missing, which made the whole
    # cell fail with a SyntaxError.
    X = np.column_stack((X_vec, data[structure_col].values))
    accuracy, precision, recall, f1 = evaluate_model(X, y, model)
    return {
        'Model': model_name,
        'Structure Theory': structure_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
# Every classifier paired with every structure feature.
combinations = [
    (clf_name, theory_label, theory_col)
    for clf_name in models
    for theory_label, theory_col in structure_theories.items()
]

# Hand the pairings to joblib; n_jobs=-1 asks for all available CPUs.
results = Parallel(n_jobs=-1)(
    delayed(process_combination)(*combo) for combo in combinations
)

# Rank by F1 score, best first, and renumber the rows from zero.
result_df = pd.DataFrame(results).sort_values(
    'F1 Score', ascending=False, ignore_index=True
)
print("\n🔍 Model Evaluation Results Using `CountVectorizer` with Structure Theories:", result_df, sep="\n")