In [1]:
#importing useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Preparing Data

In [2]:
# Load the raw SMS dataset; the CSV is decoded as ISO-8859-1 rather than the
# default encoding (presumably the file is not valid UTF-8 -- TODO confirm).
sms = pd.read_csv(
    'spam.csv',
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first rows of the raw frame to see which columns the CSV provides.
sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three 'Unnamed' columns so only v1 and v2 remain.
# `columns=` already identifies the axis, so no `axis` argument is needed.
# Rebinding `sms` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
sms = sms.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [5]:
# Summarize the remaining columns, their non-null counts and dtypes.
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Give the two remaining columns descriptive names.
sms.rename(
    columns={
        'v1': 'label',
        'v2': 'message',
    },
    inplace=True,
)

In [7]:
# Preview again to confirm the columns are now named 'label' and 'message'.
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Print the first 5 messages in full, each followed by a blank line
# (the DataFrame preview truncates long texts).
for text in sms['message'][:5]:
    print(text)
    print()

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

Ok lar... Joking wif u oni...

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

U dun say so early hor... U c already then say...

Nah I don't think he goes to usf, he lives around here though



In [9]:
# Convert 'label' to the numeric target feature: 'spam' -> 1, anything else -> 0,
# then remove the original text label.
# The guard makes the cell safe to re-run: once 'label' has been replaced by
# 'target', a second execution is a no-op instead of raising KeyError. On a
# fresh kernel a missing 'label' column still fails immediately.
if 'target' not in sms.columns:
    sms = (
        sms
        .assign(target=(sms['label'] == 'spam').astype(int))
        .drop(columns='label')
    )

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix

## Text Vector-Representation (Bag of Words)
- In bag of words, we treat every distinct word in the `corpus` as a feature,
- and for every document, we count how many times each of these words appears in the document.

This is done in two steps:

- Learn the vocabulary in the corpus: this is done using the fit method
- Use that vocabulary to produce the vector representation for each document: this is done using the transform method

Scikit learn provides the `fit_transform` method to perform the 2 steps at the same time.

In [11]:
# Toy corpus of four short documents used to illustrate bag of words.
corpus = [
    "This is the first document",
    "This is the second second document",
    "And the third one. Yes, yes, yes this",
    "Is this the first document?",
]

In [12]:
# Step 1: learn the vocabulary of the training data (the toy corpus in this case).
# fit() only learns the vocabulary; the documents are vectorized in the next step.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

In [13]:
# Inspect the learned vocabulary: one feature name per distinct token.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this', 'yes'], dtype=object)

In [14]:
# Step 2: vectorize the documents using the learned vocabulary.
# X_dtm is the document-term matrix (dtm): one row per document, one column per token.
X_dtm = vectorizer.transform(corpus)

In [15]:
# Bag of words representation: token counts per document, shown as a labelled table
doc_labels = ['doc' + str(position) for position in range(1, len(corpus) + 1)]
pd.DataFrame(
    data=X_dtm.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=doc_labels,
)

Unnamed: 0,and,document,first,is,one,second,the,third,this,yes
doc1,0,1,1,1,0,0,1,0,1,0
doc2,0,1,0,1,0,2,1,0,1,0
doc3,1,0,0,0,1,0,1,1,1,3
doc4,0,1,1,1,0,0,1,0,1,0


- Bag of words is the most common way to produce features from text data.

## 2. Bag of words for sms

In [16]:
# Separate the message text (input) from the 0/1 target, then hold out 20% of
# the rows for testing. Stratifying on y keeps the class proportions the same
# in both splits; the fixed random_state makes the split reproducible.
target_name = 'target'
X, y = sms['message'], sms[target_name]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [17]:
# Instantiate the vectorizer
vect = CountVectorizer()

# Learn the vocabulary from the training messages and build their
# document-term matrix in one step (fit + transform).
X_train_dtm = vect.fit_transform(X_train)

# Transform the test messages into a document-term matrix using the vocabulary
# already fitted on X_train. Only transform() is called here (no re-fitting),
# so the test messages do not influence the learned vocabulary.
X_test_dtm = vect.transform(X_test)

In [18]:
# Produce a decent-looking, labelled confusion matrix.
def CMatrix(CM, labels=['ham','spam']):
    """Wrap a raw confusion matrix in a labelled DataFrame with totals.

    Parameters
    ----------
    CM : array-like
        Square matrix of counts with one row and one column per entry
        in ``labels``.
    labels : list of str
        Class names used for both the row index and the column headers.

    Returns
    -------
    pandas.DataFrame
        The counts with the row axis named "TRUE" and the column axis named
        "PREDICTION", extended by a 'Total' row (column sums) and a 'Total'
        column (row sums, including the 'Total' row itself).
    """
    table = pd.DataFrame(CM, index=labels, columns=labels)
    table.index.name, table.columns.name = "TRUE", 'PREDICTION'
    # Add the row of column sums first so the row sums below also cover it,
    # which puts the grand total in the bottom-right corner.
    table.loc['Total'] = table.sum(axis=0)
    table['Total'] = table.sum(axis=1)
    return table

## 3. Building the classifier

- We will use `Multinomial Naive Bayes` because all (or most) of the features are discrete counts.

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Train the classifier on the training document-term matrix.
nb = MultinomialNB().fit(X_train_dtm, y_train)

# Score the held-out test messages.
y_pred_test = nb.predict(X_test_dtm)
precision = precision_score(y_test, y_pred_test)
accuracy = accuracy_score(y_test, y_pred_test)
CM = confusion_matrix(y_test, y_pred_test)

# Report the metrics as percentages and display the labelled confusion matrix.
print('Precision: {:0.2f}%'.format(100*precision))
print('Accuracy: {:0.2f}%'.format(100*accuracy))
CMatrix(CM)

Precision: 97.22%
Accuracy: 98.83%


PREDICTION,ham,spam,Total
TRUE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,962,4,966
spam,9,140,149
Total,971,144,1115


## 4. Let's predict the class for the following sms
1. "Today is your lucky day! claim $100 of free gas now! just text back saying YES."
2. "I have been calling you all day, r u comming back before dinner?"

In [20]:
def spam_filter(text_message):
    """Classify a single text message as spam or ham.

    The message is vectorized with the notebook-level ``vect`` and scored by
    the notebook-level classifier ``nb``. The message is echoed to stdout
    before the result is returned.

    Parameters
    ----------
    text_message : str
        The raw message text to classify.

    Returns
    -------
    str
        "SPAM" when the predicted label is truthy, otherwise "HAM".
    """
    # transform() expects an iterable of documents, hence the one-element list.
    features = vect.transform([text_message])
    predicted_label = nb.predict(features)[0]
    print(text_message)
    return "SPAM" if predicted_label else "HAM"

In [21]:
# Two example messages to run through the spam filter.
sms1, sms2 = (
    "Today is your lucky day! claim $100 of free gas now! just text back saying YES.",
    "I have been calling you all day, r u comming back before dinner?",
)

In [22]:
# Classify the first example message.
spam_filter(sms1)

Today is your lucky day! claim $100 of free gas now! just text back saying YES.


'SPAM'

In [23]:
# Classify the second example message.
spam_filter(sms2)

I have been calling you all day, r u comming back before dinner?


'HAM'