In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split   
#train_test_split is a function used to split datasets into two subsets: a training set and a testing set.

from sklearn.svm import SVC    
#SVC is a supervised machine learning algorithm used for classification tasks. It aims to find a hyperplane that best separates classes in a high-dimensional space.

from sklearn.neighbors import KNeighborsClassifier   
#KNeighborsClassifier is a supervised machine learning algorithm used for classification based on the k-nearest neighbors principle.
#It can handle both classification and regression problems.

from sklearn import metrics
#It includes functions to calculate metrics such as accuracy, precision, recall, F1-score, confusion matrix, etc.

In [10]:
df = pd.read_csv(r'C:\Users\HP\Downloads\emails.csv') 
df

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [11]:
 df.shape

(5172, 3002)

In [12]:
 df.isnull().any()

Email No.     False
the           False
to            False
ect           False
and           False
              ...  
military      False
allowing      False
ff            False
dry           False
Prediction    False
Length: 3002, dtype: bool

In [13]:
df.drop(columns='Email No.', inplace=True)
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


In [14]:
df.columns

Index(['the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou', 'in',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3001)

In [15]:
df.Prediction.unique()   #used to retrieve the unique values present in the 'Prediction' column of a Pandas DataFrame df.

array([0, 1], dtype=int64)

In [16]:
df['Prediction'] = df['Prediction'].replace({0:'Not spam', 1:'Spam'})
df

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,Not spam
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,Not spam
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,Not spam
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,Not spam
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,Not spam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,Not spam
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,Not spam
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,Spam
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,Spam


In [17]:
X = df.drop(columns='Prediction',axis = 1)
Y = df['Prediction']

In [18]:
 X.columns

Index(['the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou', 'in',
       ...
       'enhancements', 'connevey', 'jay', 'valued', 'lay', 'infrastructure',
       'military', 'allowing', 'ff', 'dry'],
      dtype='object', length=3000)

In [19]:
Y.head()

0    Not spam
1    Not spam
2    Not spam
3    Not spam
4    Not spam
Name: Prediction, dtype: object

In [20]:
 x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=1)


In [21]:
KN = KNeighborsClassifier
knn = KN(n_neighbors=7)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [22]:
print("Prediction: \n")
print(y_pred)


Prediction: 

['Not spam' 'Spam' 'Not spam' ... 'Not spam' 'Not spam' 'Not spam']


In [23]:
M = metrics.accuracy_score(y_test,y_pred)
print("KNN accuracy: ", M)


KNN accuracy:  0.8714975845410629


In [24]:
C = metrics.confusion_matrix(y_test,y_pred)
print("Confusion matrix: ", C)


Confusion matrix:  [[635  84]
 [ 49 267]]


In [25]:
model = SVC(C = 1) # cost C = 1
model.fit(x_train, y_train)
y_pred = model.predict(x_test) 

In [26]:
kc = metrics.confusion_matrix(y_test, y_pred)
print("SVM accuracy: ", kc)

SVM accuracy:  [[700  19]
 [189 127]]
