### Credit: https://www.youtube.com/watch?v=G4UVJoGFAv0&t=1768s

In [1]:
# Import pandas library and the data set
import pandas as pd
import nltk
import numpy as np

df = pd.read_csv('Spam.csv')

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Check the values of spam and ham
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
# Encode class label to binary
df = pd.get_dummies(data=df, columns=['Category'], drop_first=True)

In [5]:
df.head()

Unnamed: 0,Message,Category_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
X = df.Message.copy()
y = df.drop(columns='Message')

In [7]:
# Use Regex to replace email, url, phone numbers, numbers, and symbols
# Replace email by 'email'
X = X.str.replace(r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', 'email')
# Replace url by 'url'
X = X.str.replace(r'^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$', 'url')
# Replace phone numbers by 'phone#'
X = X.str.replace(r'(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)', 'phone#')
# Replace numners by 'numbers'
X = X.str.replace(r'[0-9]+', 'numbers')
# Replace symbols by 'symnols'
X = X.str.replace(r'[/@/#/$/%/^/&/*/(/)/-/=/+/:/;/</>/`]+', 'symbols')
# Remove punctuations
X = X.str.replace(r'[^\w\d\s]+', ' ')
# Replace whitespaces by a single space
X = X.str.replace(r'\s+', ' ')
# Remove leading and trailing whitespaces
X = X.str.replace(r'^\s+|\s+?$', '')
# Lower case
X = X.str.lower()

In [8]:
# Remove stop words
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
'''
First, split the text into a list of words.
Then, if the words not in stopwords, join them into a string seperated by a single white space
'''
X = X.apply(lambda text: ' '.join(word for word in text.split() if word not in stopwords))

In [9]:
# Verify that stop words were removed
X.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbers wkly comp win fa cup final ...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: Message, dtype: object

In [10]:
# Import PorterStemmer
stemmer = nltk.PorterStemmer()
'''
First, split the text into a list of words.
Then, if the words not in stopwords, join them into a string seperated by a single white space
'''
X = X.apply(lambda text: ' '.join(stemmer.stem(word) for word in text.split()))

In [11]:
'''
First, tokenize words in text into a list of words
Then, add them into a bag of words
Then, find the frequent distribution of words'''
from nltk.tokenize import word_tokenize
bag_of_words = []
for text in X:
    list_of_words = word_tokenize(text)
    for word in list_of_words:
        bag_of_words.append(word)
all_words = nltk.FreqDist(bag_of_words)

In [12]:
# Print the number of words and the 10 most common words
print('Number of words: {}'.format(len(all_words)))
print('10 most common words: {}'.format(all_words.most_common(10)))

Number of words: 7070
10 most common words: [('number', 2816), ('u', 1191), ('call', 674), ('symbol', 563), ('go', 454), ('get', 449), ('ur', 391), ('come', 302), ('ok', 292), ('free', 279)]


In [13]:
# Use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [14]:
'''
Create a function to:
1. Create a list of words from text
2. For every word in the word_features, return a boolean value in a dict such that
3. If the word does exist in the list_of_words, key = word and value = True (if exists)
'''
def find_features(text):
    list_of_words = word_tokenize(text)
    features = {}
    for word in word_features:
        features[word] = (word in list_of_words)
    return features

In [15]:
# Combine texts and labels into messages
messages = list(zip(X, y))

# Prepare to shuffle the messages
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# Call find_features for the whole data set
feature_sets = [(find_features(text), label) for (text, label) in messages]

In [17]:
# Split training and testing sets using sklearn
from sklearn.model_selection import train_test_split
training, testing = train_test_split(feature_sets, test_size=0.2, random_state=seed)

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.