In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score


In [None]:
# Notebook display defaults: never truncate columns, and use a wide figure size.
plt.rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', None)

# Read the dataset and expose the target column under the name 'labels'.
data = pd.read_csv("Phishing_Legitimate_full.csv").rename(columns={'CLASS_LABEL': 'labels'})

# Feature matrix: every column except the 'id' column and the target.
X = data.drop(columns=['id', 'labels'])
# Target vector.
y = data['labels']

In [None]:
# Rank every feature by its mutual information (MI) with the label.
#
# MI is estimated on the training rows only, so the held-out rows cannot
# influence which features get selected. The split arguments mirror the
# train_test_split call used for modelling further down (same test_size,
# shuffle and random_state over the same number of rows), which makes both
# calls hold out the same rows. Keep the two calls in sync.
X_mi, _, y_mi, _ = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# `dtypes == int` only matches the platform's default integer width, so integer
# columns stored with any other width would be treated as continuous. Test for
# "any integer dtype" instead.
discrete_features = X_mi.dtypes.map(pd.api.types.is_integer_dtype).astype(bool)

# random_state fixes the small random noise the estimator adds to continuous
# features, so the ranking is repeatable across runs.
mi_scores = mutual_info_classif(X_mi, y_mi, discrete_features=discrete_features, random_state=42)
mi_scores = pd.Series(mi_scores, name='MI Scores', index=X_mi.columns)
# Highest-scoring features first.
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# Keep the first 32 entries of the MI ranking and narrow the feature matrix
# down to those columns; the target is re-read from the same frame.
top_n_features = list(mi_scores.index[:32])
y = data['labels']
X = data[top_n_features]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Base estimators that the voting ensembles are built from.

# Logistic regression with a generous iteration cap.
log_reg = LogisticRegression(max_iter=10000)

# probability=True makes the SVM expose predict_proba, which soft voting needs.
# Those probability estimates involve internally shuffling the data, so the
# estimator is seeded to make results repeatable.
svc_clf = SVC(probability=True, random_state=42)

# Decision trees randomly permute the candidate features at each split, so an
# unseeded tree can differ from run to run; seed it as well.
dt_clf = DecisionTreeClassifier(max_depth=32, random_state=42)

In [None]:
# Soft-voting ensemble over the three base estimators, fitted on the training
# split. The fit call stays the cell's last expression so the fitted estimator
# is displayed.
voting_clf_soft = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', log_reg),
        ('svc', svc_clf),
        ('dt', dt_clf),
    ],
)
voting_clf_soft.fit(X_train, y_train)

In [None]:
# Score the soft-voting ensemble on the held-out split.
y_pred_soft = voting_clf_soft.predict(X_test)

# Each scalar metric takes (y_true, y_pred), so evaluate them in one pass.
accuracy_soft, precision_soft, recall_soft, f1_soft = (
    metric(y_test, y_pred_soft)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class text summary of the same predictions.
report_soft = classification_report(y_test, y_pred_soft)


In [None]:
# Print the soft-voting metrics, one per line.
print(
    "Soft Voting Classifier Results:",
    f'Accuracy: {accuracy_soft}',
    f'Precision: {precision_soft}',
    f'Recall: {recall_soft}',
    f'F1 Score: {f1_soft}',
    f'Classification Report:\n{report_soft}',
    sep='\n',
)

Soft Voting Classifier Results:
Accuracy: 0.963
Precision: 0.9634387351778656
Recall: 0.9634387351778656
F1 Score: 0.9634387351778656
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       988
           1       0.96      0.96      0.96      1012

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [None]:
# Hard-voting ensemble over the same three base estimators, fitted on the
# training split. The fit call stays the cell's last expression so the fitted
# estimator is displayed.
voting_clf_hard = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_reg),
        ('svc', svc_clf),
        ('dt', dt_clf),
    ],
)
voting_clf_hard.fit(X_train, y_train)

In [None]:
# Score the hard-voting ensemble on the held-out split.
y_pred_hard = voting_clf_hard.predict(X_test)

# Each scalar metric takes (y_true, y_pred), so evaluate them in one pass.
accuracy_hard, precision_hard, recall_hard, f1_hard = (
    metric(y_test, y_pred_hard)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class text summary of the same predictions.
report_hard = classification_report(y_test, y_pred_hard)

In [None]:
# Print the hard-voting metrics, one per line (the header starts with a blank line).
print(
    "\nHard Voting Classifier Results:",
    f'Accuracy: {accuracy_hard}',
    f'Precision: {precision_hard}',
    f'Recall: {recall_hard}',
    f'F1 Score: {f1_hard}',
    f'Classification Report:\n{report_hard}',
    sep='\n',
)


Hard Voting Classifier Results:
Accuracy: 0.949
Precision: 0.9426070038910506
Recall: 0.9575098814229249
F1 Score: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       988
           1       0.94      0.96      0.95      1012

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

