In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
# Draw plot and present chart in cell
%matplotlib inline

In [2]:
# Load the labelled messages from spam.csv (path is relative to the notebook's working directory)
data = pd.read_csv("spam.csv")

In [3]:
# Preview the DataFrame (pandas shows only the first and last rows of a long frame)
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Show the first five rows
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Show the last five rows
data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [6]:
# Shape of the frame as (rows, columns)
data.shape

(5572, 2)

In [7]:
# Per-category summary of Message: count, distinct values, most frequent message and its frequency
data.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [8]:
# Column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
# Number of distinct values in each column
data.nunique()

Category       2
Message     5157
dtype: int64

In [10]:
# Column labels of the frame
data.columns

Index(['Category', 'Message'], dtype='object')

In [11]:
# Count rows that exactly repeat an earlier row (same Category and Message)
data.duplicated().sum()

415

In [12]:
# Number of missing values in each column
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [13]:
# Percentage of missing cells across the whole frame
# data.size is rows * columns; it replaces np.product, which was removed in NumPy 2.0
total = data.size
total_null = data.isnull().sum().sum()
percentage_missing = (total_null/total)*100
percentage_missing

0.0

In [14]:
# Encode the label as a number for the ML model: ham -> 0, spam -> 1
# Vectorised comparison instead of a per-row Python lambda; int64 matches the dtype .apply produced
data['spam'] = data['Category'].eq('spam').astype('int64')

In [15]:
# Display the frame again to confirm the new numeric spam column
data

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


## ML

In [17]:
# Split dataset into train & test
from sklearn.model_selection import train_test_split

# Extract features from data
from sklearn.feature_extraction.text import CountVectorizer

# Multinomial Naive Bayes classifier, trained on the word-count features
from sklearn.naive_bayes import MultinomialNB

#### Naive Bayes

In [19]:
# Split dataset into train & test
# 75% training & 25% testing; features are the raw messages, labels are the 0/1 spam flag
# random_state pins the shuffle so the split and every result below are reproducible on re-run
# NOTE(review): the duplicate check above found 415 repeated rows; identical messages can
# land in both splits and inflate the test score - consider de-duplicating before splitting
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data.Message, data.spam, test_size=0.25, random_state=42)

In [20]:
# Inspect the training messages (index values are the original row labels, shuffled by the split)
x_train

1187     Goodmorning, Today i am late for  &lt;#&gt; min.
124     I am going to sao mu today. Will be done only ...
5312    Here got ur favorite oyster... N got my favori...
25      Just forced myself to eat a slice. I'm really ...
1284                            Yes i thought so. Thanks.
                              ...                        
1498    I'm putting it on now. It should be ready for ...
4539     Dare i ask... Any luck with sorting out the car?
3063                            Fine. Do you remember me.
3224                    Well that must be a pain to catch
5239          Jay wants to work out first, how's 4 sound?
Name: Message, Length: 4179, dtype: object

In [21]:
# Summary of the training messages: count, distinct values and most frequent message
x_train.describe()

count                       4179
unique                      3933
top       Sorry, I'll call later
freq                          25
Name: Message, dtype: object

In [22]:
# Build the vocabulary from the training messages and count word occurrences
# fit_transform learns the vocabulary and returns the counts as a sparse matrix
# (one row per message, one column per vocabulary term)
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [23]:
# Sparse document-term matrix: one row per training message, one column per vocabulary term
x_train_count

<4179x7493 sparse matrix of type '<class 'numpy.int64'>'
	with 55704 stored elements in Compressed Sparse Row format>

In [24]:
# Dense view of the first three rows only; slicing the sparse matrix before toarray()
# avoids materialising the whole document-term matrix in memory
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Train Naive Bayes

In [26]:
# Train the classifier
# fit(word-count matrix of the training messages, their 0/1 spam labels)
model = MultinomialNB()
model.fit(x_train_count,y_train)

In [27]:
# Sample messages expected to be classified as ham (0)
emails_ham = [
    "Hi X, how are you?",
    "Buffet promotion 1 for 1 deal at x shopping mall."
]

# Convert the ham samples to word counts using the vocabulary learned from the training set
emails_ham_count = cv.transform(emails_ham)
model.predict(emails_ham_count)

array([0, 0])

Both messages are predicted as 0, i.e. ham (not spam).

In [29]:
# Sample messages expected to be classified as spam (1)
emails_spam = [
    "aaa survey link free vouchers",
    "bbb survey link free rewards"
]

# Convert the spam samples to word counts using the vocabulary learned from the training set
emails_spam_count = cv.transform(emails_spam)
model.predict(emails_spam_count)

array([1, 1])

In [30]:
# Evaluate on the held-out test set
# Vectorise the test messages with the training vocabulary, then report mean accuracy
# NOTE(review): ham is the large majority class (4825 of 5572 rows), so accuracy alone
# can look high; consider precision/recall for the spam class as well
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9842067480258435