In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
# Read the email dataset from 'emails.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='emails.csv')

In [14]:
# Print the first five rows as plain text to sanity-check how the file parsed.
print(data.head(n=5))

  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]


In [15]:
# Remove the 'Email No.' column when it is present. errors='ignore' turns a
# missing column into a no-op instead of a KeyError, so re-running this cell
# after the column has already been dropped is harmless.
data = data.drop(labels=['Email No.'], axis='columns', errors='ignore')

In [16]:
# Target vector: the 'Prediction' column.
y = data['Prediction']
# Feature matrix: every column except the target.
X = data.drop(labels='Prediction', axis=1)

In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible from run to run.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class ratio must match between the two sets (this would change the results).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both sets so the test data never influences the scaling.
# X_train and X_test are rebound to their standardized versions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# 1. K-Nearest Neighbors (KNN) Classifier
# Fit a 5-neighbour classifier on the training set (fit returns the estimator
# itself), then predict labels for the held-out test set.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [20]:
# Report KNN results on the test set: overall accuracy, the per-class
# classification report, and the confusion matrix.
print("K-Nearest Neighbors (KNN) Performance:")

knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)

print("Accuracy:", knn_accuracy)
print("Classification Report:\n", knn_report)
print("Confusion Matrix:\n", knn_confusion)

K-Nearest Neighbors (KNN) Performance:
Accuracy: 0.8454106280193237
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88       739
           1       0.66      0.95      0.78       296

    accuracy                           0.85      1035
   macro avg       0.82      0.88      0.83      1035
weighted avg       0.89      0.85      0.85      1035

Confusion Matrix:
 [[593 146]
 [ 14 282]]


In [21]:
# 2. Support Vector Machine (SVM) Classifier
# Fit a linear-kernel SVC on the training set (fit returns the estimator
# itself), then predict labels for the held-out test set.
# NOTE(review): scikit-learn documents SVC's random_state as affecting only
# probability estimates (probability=True), so it is probably inert here — confirm.
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [22]:
# Report SVM results on the test set: overall accuracy, the per-class
# classification report, and the confusion matrix.
print("Support Vector Machine (SVM) Performance:")

svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print("Accuracy:", svm_accuracy)
print("Classification Report:\n", svm_report)
print("Confusion Matrix:\n", svm_confusion)

Support Vector Machine (SVM) Performance:
Accuracy: 0.9468599033816425
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96       739
           1       0.90      0.92      0.91       296

    accuracy                           0.95      1035
   macro avg       0.93      0.94      0.94      1035
weighted avg       0.95      0.95      0.95      1035

Confusion Matrix:
 [[707  32]
 [ 23 273]]


Explanation
Data Preprocessing:
We load the dataset and inspect it.
The 'Email No.' column is dropped because it is only a row identifier and carries no predictive information.
We separate features (X) and target variable (y).
Data Splitting:
We split the data into training and testing sets, reserving 20% of the data for testing.
Feature Scaling:
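We standardize the features using StandardScaler, fitting it on the training set only and applying the same transformation to the test set, because both KNN and SVM are sensitive to feature scale.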
Model Training and Evaluation:
K-Nearest Neighbors:
We initialize and train a KNN classifier.
We make predictions and evaluate its performance with accuracy, classification report, and confusion matrix.
Support Vector Machine:
We initialize and train an SVM classifier with a linear kernel.
We make predictions and evaluate its performance similarly.