In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [22]:
merged_data = pd.read_csv('merged_data_with_predictions.csv')

In [23]:
X = pd.get_dummies(merged_data[['predicted_class_encoded_email', 'predicted_class_url', 'url_len', '@', '?', '-', '=', '.', '#', '%', '+', '$', '!', '*', ',', '//', 'abnormal_url', 'https', 'digits', 'letters', 'Shortining_Service', 'having_ip_address']])

y = merged_data['label_email']

In [25]:
X.head()

Unnamed: 0,predicted_class_encoded_email,predicted_class_url,url_len,@,?,-,=,.,#,%,...,!,*,",",//,abnormal_url,https,digits,letters,Shortining_Service,having_ip_address
0,1,1,28,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0,24,0,0
1,1,1,20,0,0,1,0,1,0,0,...,0,0,0,1,1,1,0,15,0,0
2,1,1,26,0,0,0,0,2,0,0,...,0,0,0,1,1,1,0,21,0,0
3,1,1,23,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0,19,0,0
4,1,1,30,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0,26,0,0


In [26]:
X.columns

Index(['predicted_class_encoded_email', 'predicted_class_url', 'url_len', '@',
       '?', '-', '=', '.', '#', '%', '+', '$', '!', '*', ',', '//',
       'abnormal_url', 'https', 'digits', 'letters', 'Shortining_Service',
       'having_ip_address'],
      dtype='object')

In [29]:
scaler = MinMaxScaler()
X[['url_len', 'digits', 'letters']] = scaler.fit_transform(X[['url_len', 'digits', 'letters']])

In [30]:
X.head()

Unnamed: 0,predicted_class_encoded_email,predicted_class_url,url_len,@,?,-,=,.,#,%,...,!,*,",",//,abnormal_url,https,digits,letters,Shortining_Service,having_ip_address
0,1,1,0.034783,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0.0,0.065359,0,0
1,1,1,0.017391,0,0,1,0,1,0,0,...,0,0,0,1,1,1,0.0,0.035948,0,0
2,1,1,0.030435,0,0,0,0,2,0,0,...,0,0,0,1,1,1,0.0,0.055556,0,0
3,1,1,0.023913,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0.0,0.04902,0,0
4,1,1,0.03913,0,0,0,0,1,0,0,...,0,0,0,1,1,1,0.0,0.071895,0,0


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
y_train.value_counts()

label_email
0    561
1    249
Name: count, dtype: int64

In [33]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC()
}

In [34]:
# Evaluate models using cross-validation
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'{name} Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})')

Random Forest Accuracy: 0.9950 (+/- 0.0153)
Logistic Regression Accuracy: 0.9950 (+/- 0.0153)
SVM Accuracy: 0.9950 (+/- 0.0153)


In [35]:
# Choosen Logistic Regression based on cross-validation results
best_model = LogisticRegression()  
best_model.fit(X_train, y_train)

# Make predictions and evaluate the best model
preds = best_model.predict(X_test)
acc = accuracy_score(y_test, preds)
print('Best Model Accuracy:', acc)

# Print classification report
print('Classification report:')
print(classification_report(y_test, preds))

# Print confusion matrix
print('Confusion matrix:')
print(confusion_matrix(y_test, preds))

Best Model Accuracy: 1.0
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       149
           1       1.00      1.00      1.00        54

    accuracy                           1.00       203
   macro avg       1.00      1.00      1.00       203
weighted avg       1.00      1.00      1.00       203

Confusion matrix:
[[149   0]
 [  0  54]]


In [36]:
import joblib 
# Save the best model using joblib
joblib.dump(best_model, '../models/best_meta_classifier_model.pkl')

['../models/best_meta_classifier_model.pkl']