In [None]:
# Classify the email using the binary classification method. Email Spam
# detection has two states: 
# a) Normal State – Not Spam, 
# b) Abnormal State – Spam. 
# Use K-Nearest Neighbors and Support Vector Machine for classification. Analyze their performance.

In [4]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load the dataset.
# The CSV location can be overridden with the EMAILS_CSV environment variable
# so the notebook is not tied to one machine's directory layout; when the
# variable is unset, the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "EMAILS_CSV",
    "C:/Users/yashr/OneDrive/Desktop/Sem 7 LA's/ML Lab Datasets/emails.csv",
)
df = pd.read_csv(DATA_PATH)

# Check the structure of the dataframe
print(df.head())
print(df.columns)

# Feature matrix: every column between the first one and the last one.
# Cells that cannot be parsed as numbers are coerced to NaN, and any row that
# ends up containing a NaN is discarded.
X = (
    df.iloc[:, 1:-1]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Target: the last column, restricted to the rows that survived the cleaning.
y = df.iloc[:, -1].loc[X.index]

# Renumber both objects from 0 so they stay positionally aligned.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training rows only, then the
# same fitted transformation is applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a function to evaluate model performance
def evaluate_model(model, X_test, y_test, average='weighted'):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used.
    X_test
        Feature rows passed to ``model.predict``.
    y_test
        True labels for ``X_test``.
    average : str, default 'weighted'
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. The default reproduces the original behaviour. Note that
        scikit-learn documents weighted recall as equal to accuracy, so pass
        ``average='binary'`` to score only the positive class
        (NOTE(review): this relies on scikit-learn's default ``pos_label=1``
        matching the spam label - confirm against the target column).

    Returns
    -------
    dict
        Keys ``'confusion_matrix'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1_score'``.
    """
    y_pred = model.predict(X_test)

    return {
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average=average),
        'recall': recall_score(y_test, y_pred, average=average),
        'f1_score': f1_score(y_test, y_pred, average=average),
    }

# K-Nearest Neighbors with k=5: fit on the scaled training rows, then score on
# the scaled test rows (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_results = evaluate_model(knn, X_test_scaled, y_test)

# Support Vector Machine with an RBF kernel, trained and scored the same way.
svm = SVC(kernel='rbf').fit(X_train_scaled, y_train)
svm_results = evaluate_model(svm, X_test_scaled, y_test)

# Print results: one identical report per model, emitted in a single loop.
print()
for heading, scores in (("KNN Results:", knn_results), ("\nSVM Results:", svm_results)):
    print(heading)
    print(f"Confusion Matrix:\n{scores['confusion_matrix']}")
    print(f"Accuracy: {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall: {scores['recall']:.4f}")
    print(f"F1 Score: {scores['f1_score']:.4f}")

# Compare the two algorithms metric by metric.
print("\nComparison:")
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
for metric in metrics:
    knn_value = knn_results[metric]
    svm_value = svm_results[metric]
    if knn_value > svm_value:
        better_model = "KNN"
    elif svm_value > knn_value:
        better_model = "SVM"
    else:
        # Equal scores: do not declare a winner (previously a tie was
        # reported as "SVM performs better").
        better_model = None

    if better_model is None:
        print(f"{metric.upper()}: KNN and SVM perform equally")
    else:
        print(f"{metric.upper()}: {better_model} performs better")

# Pull the confusion matrix out of each result dict and show them one after
# the other for a side-by-side read.
knn_cm, svm_cm = (
    scores['confusion_matrix'] for scores in (knn_results, svm_results)
)

print(f"\nKNN Confusion Matrix:\n{knn_cm}")
print(f"SVM Confusion Matrix:\n{svm_cm}")

  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]
Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a