In [1]:
import string
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


<h1> Data Preprocessing </h1>

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df['spam'].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [4]:
df['text'].value_counts

<bound method IndexOpsMixin.value_counts of 0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object>

In [5]:
## Removing Stopwords

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
def preprocess(text):
    #remove punctuation and lowercase
    text = "".join([t.lower() for t in text if t not in string.punctuation])

    #tokenize
    tokens = text.split(" ") # 'hello world' == 'hello' , 'world'

    #filter out stopwords
    return " ".join(t for t in tokens if t not in ENGLISH_STOP_WORDS)

df['text'] = df['text'].apply(lambda text: preprocess(text))
df.head()

Unnamed: 0,text,spam
0,subject naturally irresistible corporate ident...,1
1,subject stock trading gunslinger fanny merril...,1
2,subject unbelievable new homes easy im wantin...,1
3,subject 4 color printing special request addi...,1
4,subject money software cds software compati...,1


In [7]:
x = df['text']
y = df['spam']

<h1> Vectorizing the Text </h1>
<i>Using TfidVectorizer </i>:<br>
Converts a collection of raw documents to a matrix of TF-IDF features.
<ul><li> TF- Term Frequency: checks frequency of a term in a message</li>
<li>IDF- Inverse Document Frequency: Checks the whole document </li>

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(x)
x_vectors = tfidf.transform(x)

In [9]:
x_vectors

<5728x37023 sparse matrix of type '<class 'numpy.float64'>'
	with 508390 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_vectors, y, test_size=0.2, random_state=42
)

print(x_train.shape)
print(y_train.shape)

(4582, 37023)
(4582,)


<h1> Training KNN Classifier </h1>

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier()
knn_classifier.fit(x_train, y_train)

In [12]:
y_pred1 = knn_classifier.predict(x_test)
y_pred1

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [14]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

#Accucary, precision and recall scores
print("Accuracy:", accuracy_score(y_test,y_pred1))
print("Precision:", precision_score(y_test,y_pred1))
print("Recall:", recall_score(y_test,y_pred1))


Accuracy: 0.9729493891797557
Precision: 0.9814126394052045
Recall: 0.9103448275862069


<h1> Saving Model to a File </h1>

In [15]:
from joblib import dump

dump(knn_classifier, 'spam_model.joblib')
dump(tfidf, 'tfidf.joblib')

['tfidf.joblib']