In [2]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
# read the data
spam = pd.read_csv("./SMSSpamCollection.csv", names=["label", "message"])

In [4]:
spam

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# we will create a list of tuples that contain the message, along with whether its spam of ham
data_set = []
for index,row in spam.iterrows():
    data_set.append((row.message,row.label))

In [6]:
data_set

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham'),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
  'spam'),
 ('Even my brother is not like to speak with me. They treat me like aids patent.',
  'ham'),
 ("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
  'ham'),
 ('WINNER!! As a valued network customer you have been selected to receivea £900 prize 

In [7]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocess(document, stem = True):
    document = document.lower()
    words = word_tokenize(document) # split
    words = [word for word in words if word not in stopwords.words("english")] # removed bad ones
    if stem :
        words = [stemmer.stem(word) for word in words]
    else:
        words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    words = [word for word in words if len(word)>=3] # we only put a word in if it has more than three letters
    return words
# this is our preprocessor


In [8]:
message_set = []
for (message, label) in data_set:
    words = preprocess(message)
    message_set.append((words,label))
message_set

[(['jurong',
   'point',
   'crazy..',
   'avail',
   'bugi',
   'great',
   'world',
   'buffet',
   '...',
   'cine',
   'got',
   'amor',
   'wat',
   '...'],
  'ham'),
 (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'),
 (['free',
   'entri',
   'wkli',
   'comp',
   'win',
   'cup',
   'final',
   'tkt',
   '21st',
   'may',
   '2005.',
   'text',
   '87121',
   'receiv',
   'entri',
   'question',
   'std',
   'txt',
   'rate',
   'appli',
   '08452810075over18'],
  'spam'),
 (['dun', 'say', 'earli', 'hor', '...', 'alreadi', 'say', '...'], 'ham'),
 (['nah', "n't", 'think', 'goe', 'usf', 'live', 'around', 'though'], 'ham'),
 (['freemsg',
   'hey',
   'darl',
   'week',
   'word',
   'back',
   'like',
   'fun',
   'still',
   'xxx',
   'std',
   'chg',
   'send',
   '£1.50',
   'rcv'],
  'spam'),
 (['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
  'ham'),
 (['per',
   'request',
   "'mell",
   'mell',
   'oru',
   'minnaminungint',
   'nurungu',
   'vett

Now, we will create a list of all the words in our dataset, without duplicates

In [9]:
# this will give us a list of all the words, with duplicates
def get_words_in_messages(messages):
    all_words=[]
    for (message, label) in messages:
        all_words.extend(message)
    return all_words

In [12]:
# this will take that list, and take the unique words
def make_unique_list(words):
    words = nltk.FreqDist(words)
    word_list = words.keys()
    return word_list

In [31]:
features = make_unique_list(get_words_in_messages(message_set))
print(features)

dict_keys(['message..no', 'responce..what', 'happend', 'ummmmmaah', 'mani', 'happi', 'return', 'day', 'dear', 'sweet', 'heart..', 'birthday', 'back', 'work', '2morro', 'half', 'term', '2nite', 'sexi', 'passion', 'chat', '09099726481', 'luv', 'dena', 'call', '£1/minmobsmorelkpobox177hp51fl', 'cant', 'pick', 'phone', 'right', 'send', 'messag', 'fuck', 'sake', 'like', 'tallahasse', 'nokia', 'lovly..', 'want', 'funk', 'fone', 'weekli', 'new', 'tone', 'repli', 'tones2u', 'text', 'www.ringtones.co.uk', 'origin', 'best', '3gbp', 'network', 'oper', 'rate', 'appli', 'arm', 'feel', 'weak', 'cuz', 'got', 'shot', 'anoth', 'time', 'surli', 'ill', 'give', 'come', 'review', 'dai', 'naal', 'eruku', 'hey', 'realli', 'horni', 'see', 'nake', 'hot', '69698', 'charg', '150pm', 'unsubscrib', 'stop', 'prob', '...', 'food', 'mail', 'xma', '100', 'free', 'video', 'price', 'line', 'rental', '0800', '0721072', 'find', 'msg', 'get', 'gnarl', 'barkley', 'crazi', 'rington', 'total', 'cedar', 'key', 'anyway', 'tho',

<b>Create train and test set</b>

In [32]:
sliceIndex = int((len(message_set)*.8)) #finding length of 80% of the dataset
random.shuffle(message_set) #shuffling the set
train_messages, test_messages = message_set[:sliceIndex], message_set[sliceIndex:] #splitting

<b> We will now create a map that, for each message will tell us if a certain word is present, and if its spam of ham</b>

In [36]:
def extract_features(document):
    document_words = set(document)
    feature = {}
    for words in features:
        feature['contains(%s)' % words] = (words in document)
    return feature

In [37]:
training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)
print(training_set[1])

({'contains(message..no)': False, 'contains(responce..what)': False, 'contains(happend)': False, 'contains(ummmmmaah)': False, 'contains(mani)': False, 'contains(happi)': False, 'contains(return)': False, 'contains(day)': False, 'contains(dear)': False, 'contains(sweet)': False, 'contains(heart..)': False, 'contains(birthday)': False, 'contains(back)': False, 'contains(work)': False, 'contains(2morro)': False, 'contains(half)': False, 'contains(term)': False, 'contains(2nite)': False, 'contains(sexi)': False, 'contains(passion)': False, 'contains(chat)': False, 'contains(09099726481)': False, 'contains(luv)': False, 'contains(dena)': False, 'contains(call)': False, 'contains(£1/minmobsmorelkpobox177hp51fl)': False, 'contains(cant)': False, 'contains(pick)': False, 'contains(phone)': False, 'contains(right)': False, 'contains(send)': False, 'contains(messag)': False, 'contains(fuck)': False, 'contains(sake)': False, 'contains(like)': False, 'contains(tallahasse)': False, 'contains(nokia

<b>Training</b>

In [38]:
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

<b>Evaluate</b>

In [39]:
#accuracy of test_set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9910313901345291


In [40]:
#accuracy of train_set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9925959165357864


In [41]:
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

Classification result :  spam


All in all, pretty neat

Now, we will store our model

In [42]:
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
