In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

In [2]:
df = pd.read_csv('meta-classifier-dataset.csv')

In [3]:
df.head()

Unnamed: 0,email_text_predictions,url_predictions,num_urls,email_length,label
0,0,0,0,1030,0
1,0,0,0,479,0
2,0,0,0,1245,0
3,1,0,1,688,1
4,1,0,0,441,1


In [11]:
features = df[['email_text_predictions', 'url_predictions', 'num_urls', 'email_length']]
target = df['label']

In [14]:
scaler = StandardScaler()

# Normalize the 'text_length' feature
features['email_length'] = scaler.fit_transform(features[['email_length']])
features['num_urls'] = scaler.fit_transform(features[['num_urls']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['email_length'] = scaler.fit_transform(features[['email_length']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['num_urls'] = scaler.fit_transform(features[['num_urls']])


In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [16]:
meta_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [17]:
meta_classifier.fit(X_train, y_train)

In [18]:
y_pred = meta_classifier.predict(X_test)

In [19]:
# Evaluate the meta-classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8135229407029783
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      2209
           1       0.77      0.77      0.77      1518

    accuracy                           0.81      3727
   macro avg       0.81      0.81      0.81      3727
weighted avg       0.81      0.81      0.81      3727



In [22]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC()
}

In [24]:
# Evaluate models using cross-validation
for name, model in models.items():
    scores = cross_val_score(model, features, target, cv=5, scoring='accuracy')
    print(f'{name} Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})')

Random Forest Accuracy: 0.8154 (+/- 0.0101)
Logistic Regression Accuracy: 0.8495 (+/- 0.0125)
SVM Accuracy: 0.8496 (+/- 0.0124)


In [25]:
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8494767909847062
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      2209
           1       0.84      0.78      0.81      1518

    accuracy                           0.85      3727
   macro avg       0.85      0.84      0.84      3727
weighted avg       0.85      0.85      0.85      3727



In [26]:
joblib.dump(logistic_regression_model, '../models/best_meta_classifier_model.pkl')

['../models/best_meta_classifier_model.pkl']

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam




In [28]:
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])




In [29]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [31]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.852159914140059
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      2209
           1       0.84      0.78      0.81      1518

    accuracy                           0.85      3727
   macro avg       0.85      0.84      0.84      3727
weighted avg       0.85      0.85      0.85      3727



In [32]:
model.save('../models/neural_network_meta_classifier.h5')

  saving_api.save_model(


: 