In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Load the email dataset from the CSV file in the working directory.
# Later cells read the 'content' and 'priority' columns from this frame.
data = pd.read_csv("emails.csv")
# Bare last expression: the notebook renders the preview as a rich table
# instead of the plain-text dump that print() produces.
data.head()

In [None]:
# Pull out the model input (the 'content' column) and the target
# (the 'priority' column) in a single unpacking assignment.
X, y = data['content'], data['priority']

In [None]:
# Hold out 20% of the emails for validation and train on the remaining 80%.
# random_state is pinned so the same rows land in each split on every run.
# NOTE: no separate test set is carved out here. An 80/10/10
# train/validation/test layout can be obtained by splitting off 20% first and
# then halving that held-out part with a second train_test_split call.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Turn the raw email text into TF-IDF feature matrices.
#
# `vectorizer` is the one that feeds the classifier: it learns its vocabulary
# and IDF weights from the training split only, and the validation split is
# then projected onto that same vocabulary with transform().
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# A second, independent vectorizer is fit on the full corpus to produce
# X_vectorized (consumed by the PCA plot). Keeping it separate means
# `vectorizer` is never fit on validation text, regardless of the order in
# which these statements run.
full_corpus_vectorizer = TfidfVectorizer()
X_vectorized = full_corpus_vectorizer.fit_transform(X)

In [None]:
# Project the full-corpus TF-IDF matrix onto its first two principal
# components and draw a scatter plot coloured by priority.
pca = PCA(n_components=2)
# .toarray() densifies the TF-IDF matrix before it is handed to PCA; this
# holds the whole document-term matrix in memory at once.
X_pca = pca.fit_transform(X_vectorized.toarray())

# scatter()'s `c` argument needs numbers. Numeric priorities are passed
# through untouched; any other label type is encoded as sorted integer codes
# purely for colouring, and the colorbar ticks are relabelled with the
# original class names further down.
if pd.api.types.is_numeric_dtype(y):
    point_colors, class_names = y, None
else:
    point_colors, class_names = pd.factorize(y, sort=True)

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap='viridis')
ax.set_title('Plot of emails using PCA')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
colorbar = fig.colorbar(scatter, ax=ax, label='Priority')
if class_names is not None:
    # One tick per encoded class, labelled with the original priority value.
    colorbar.set_ticks(np.arange(len(class_names)))
    colorbar.set_ticklabels(list(class_names))
plt.show()

In [None]:
# Build a 3-nearest-neighbours classifier and train it on the vectorized
# training emails. fit() hands back the estimator itself, so construction and
# training are chained into one expression.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train_vectorized, y_train)
model  # display the fitted estimator as the cell output

In [None]:
# Ask the trained classifier for a priority label for every validation email.
y_pred = model.predict(X=X_val_vectorized)

In [None]:
# Evaluate the classifier on the validation split: overall accuracy, a
# confusion-matrix plot, and the per-class classification report.
#
# The confusion matrix is built over an explicit label list -- every class the
# model was trained on plus any class that appears only in y_val -- and the
# same list is used for the tick labels. Without `labels=`, confusion_matrix
# covers only the classes present in y_val / y_pred, which can be fewer than
# model.classes_ and would leave the matrix and its tick labels mismatched.
eval_labels = np.union1d(model.classes_, y_val)

accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred, labels=eval_labels)
class_report = classification_report(y_val, y_pred)

print("Accuracy: ", accuracy)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=eval_labels)
disp.plot()
plt.show()
print("Classification Report: \n", class_report)