# Installation

In [None]:
!pip install keras

In [None]:
# No install needed: `collections` (imported below for Counter) is part of the
# Python standard library. The PyPI package named `collection` is unrelated.

# Import

In [1]:
import os
import numpy as np
import pandas as pd
import string
# import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam


Using TensorFlow backend.


# Implementation

In [2]:
# read data

# Only the first two CSV columns are loaded; header=None keeps the file's own
# header line as the first data row, with the columns labelled 0 and 1.
data = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
    usecols=[0, 1],
    header=None,
)
data

Unnamed: 0,0,1
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [3]:
# rename header

# The CSV was read with header=None, so row 0 holds the file's own column
# names ("v1", "v2") and the columns are labelled 0 and 1. Drop that row and
# give the columns descriptive names.
# A direct rename replaces the chained assignment `data.iloc[0][0] = ...`,
# which writes into a temporary row object and is not guaranteed to reach the
# frame.
# The guard keeps the cell safe to re-run: without it every re-execution would
# silently drop one more message from the top of the frame.
if list(data.columns) == [0, 1]:
    data = data.iloc[1:].rename(columns={0: "tags", 1: "text"})
data

Unnamed: 0,tags,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [4]:
# remove punctuation

def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Characters are dropped, not replaced, so words separated only by
    punctuation end up joined together.
    """
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)



# Punctuation-free copy of every message, stored next to the original text.
data["no_punct"] = data["text"].apply(remove_punct)
data

Unnamed: 0,tags,text,no_punct
1,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...
...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?,Will Ì b going to esplanade fr home
5570,ham,"Pity, * was in mood for that. So...any other s...",Pity was in mood for that Soany other suggest...
5571,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...


In [5]:
# remove stopwords

# The NLTK stopwords file id is lowercase; "English" only resolves on
# case-insensitive filesystems and fails to load elsewhere.
stop_words = set(stopwords.words("english"))

def tokenize(text):
    """Split ``text`` on whitespace, lowercase each piece and drop stop words.

    Membership is checked against the module-level ``stop_words`` collection
    using the lowercased form, and the lowercased form is what is returned.
    """
    lowered_words = (piece.lower() for piece in text.split())
    return [piece for piece in lowered_words if piece not in stop_words]



# Lowercased, stop-word-free token list for every punctuation-free message.
data["tokens"] = data["no_punct"].apply(tokenize)
data

Unnamed: 0,tags,text,no_punct,tokens
1,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, jurong, point, crazy, available, bugis, n..."
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
4,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, early, hor, u, c, already, say]"
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t..."
...,...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å£750, po..."
5569,ham,Will Ì_ b going to esplanade fr home?,Will Ì b going to esplanade fr home,"[ì, b, going, esplanade, fr, home]"
5570,ham,"Pity, * was in mood for that. So...any other s...",Pity was in mood for that Soany other suggest...,"[pity, mood, soany, suggestions]"
5571,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...,"[guy, bitching, acted, like, id, interested, b..."


In [6]:
# labels to int   spam = 1, ham = 0

def label_to_num(label):
    """Map the tag ``"spam"`` to 1 and any other value to 0."""
    if label == "spam":
        return 1
    return 0
# Integer target column derived from the string tags (spam -> 1, otherwise 0).
data["labels"] = data["tags"].apply(label_to_num)

data

Unnamed: 0,tags,text,no_punct,tokens,labels
1,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, jurong, point, crazy, available, bugis, n...",0
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]",0
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",1
4,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, early, hor, u, c, already, say]",0
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...",0
...,...,...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å£750, po...",1
5569,ham,Will Ì_ b going to esplanade fr home?,Will Ì b going to esplanade fr home,"[ì, b, going, esplanade, fr, home]",0
5570,ham,"Pity, * was in mood for that. So...any other s...",Pity was in mood for that Soany other suggest...,"[pity, mood, soany, suggestions]",0
5571,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...,"[guy, bitching, acted, like, id, interested, b...",0


In [7]:
# count unique words

def count_words(textset):
    """Tally token frequencies and find the longest row in ``textset``.

    Args:
        textset: iterable of rows, each row an iterable of hashable tokens.

    Returns:
        A ``(frequencies, longest)`` tuple: a ``Counter`` mapping each token to
        its number of occurrences over all rows, and the largest number of
        tokens seen in a single row (0 when there are no rows).
    """
    frequencies = Counter()
    longest = 0
    for tokens in textset:
        row_length = 0  # stays 0 when the row has no tokens
        for row_length, token in enumerate(tokens, start=1):
            frequencies[token] += 1
        longest = max(longest, row_length)
    return frequencies, longest

# Vocabulary statistics over the whole corpus: per-token counts plus the
# length of the longest tokenised message.
words_statistic, max_token = count_words(data["tokens"])
print(max_token, len(words_statistic), sep="\n")
words_statistic

80
9431


Counter({'go': 278,
         'jurong': 1,
         'point': 13,
         'crazy': 15,
         'available': 16,
         'bugis': 7,
         'n': 143,
         'great': 111,
         'world': 33,
         'la': 7,
         'e': 84,
         'buffet': 2,
         'cine': 7,
         'got': 238,
         'amore': 1,
         'wat': 96,
         'ok': 277,
         'lar': 38,
         'joking': 6,
         'wif': 27,
         'u': 1119,
         'oni': 4,
         'free': 275,
         'entry': 26,
         '2': 478,
         'wkly': 14,
         'comp': 11,
         'win': 71,
         'fa': 4,
         'cup': 8,
         'final': 18,
         'tkts': 4,
         '21st': 3,
         'may': 46,
         '2005': 3,
         'text': 188,
         '87121': 4,
         'receive': 38,
         'questionstd': 2,
         'txt': 163,
         'ratetcs': 2,
         'apply': 31,
         '08452810075over18s': 2,
         'dun': 55,
         'say': 91,
         'early': 33,
         'hor': 2,
   

In [8]:
# training and testing dataset

# 80/20 split, shuffled with a fixed seed and stratified on the label so both
# parts keep the same spam/ham proportions.
X_train_temp, X_test_temp, y_train, y_test = train_test_split(
    data["tokens"],
    data["labels"],
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=data["labels"],
)

In [9]:
# tokenizer

# num_words caps the vocabulary (Keras keeps only the num_words - 1 most
# frequent words), so it must be a vocabulary size, not a sequence length.
# Passing max_token (the longest message, 80 tokens) made texts_to_sequences
# drop every word outside the 79 most common ones.
# len(words_statistic) is also the input_dim given to the Embedding layer, so
# every index the tokenizer emits is a valid embedding row.
tokenizer = Tokenizer(num_words=len(words_statistic))
tokenizer.fit_on_texts(X_train_temp)
word_index = tokenizer.word_index
word_index

{'u': 1,
 'call': 2,
 '2': 3,
 'im': 4,
 'ur': 5,
 'get': 6,
 '4': 7,
 'ltgt': 8,
 'go': 9,
 'free': 10,
 'ok': 11,
 'dont': 12,
 'know': 13,
 'ill': 14,
 'got': 15,
 'like': 16,
 'good': 17,
 'come': 18,
 'time': 19,
 'day': 20,
 'send': 21,
 'want': 22,
 'love': 23,
 'text': 24,
 'one': 25,
 'r': 26,
 'need': 27,
 'going': 28,
 'txt': 29,
 'home': 30,
 'back': 31,
 'lor': 32,
 'still': 33,
 'today': 34,
 'sorry': 35,
 'stop': 36,
 'n': 37,
 'tell': 38,
 'see': 39,
 'reply': 40,
 'mobile': 41,
 'later': 42,
 'hi': 43,
 'new': 44,
 'think': 45,
 'cant': 46,
 'please': 47,
 'da': 48,
 'phone': 49,
 'well': 50,
 'week': 51,
 'ì': 52,
 'happy': 53,
 'take': 54,
 'night': 55,
 'hey': 56,
 'dear': 57,
 'oh': 58,
 'great': 59,
 'pls': 60,
 'claim': 61,
 'much': 62,
 'hope': 63,
 'make': 64,
 'work': 65,
 'give': 66,
 'thats': 67,
 'number': 68,
 'prize': 69,
 'say': 70,
 'way': 71,
 'ask': 72,
 'said': 73,
 'wat': 74,
 'right': 75,
 'already': 76,
 'yes': 77,
 '1': 78,
 'cos': 79,
 'yeah': 8

In [10]:
# train sequences

# Map training tokens to integer ids, then pad/truncate at the end of each
# sequence so every row has exactly max_token entries.
X_train_sequences = tokenizer.texts_to_sequences(X_train_temp)
X_train = pad_sequences(
    X_train_sequences,
    maxlen=max_token,
    padding="post",
    truncating="post",
)
X_train.shape

(4457, 80)

In [11]:
# test sequences

# Same encoding for the test split, using the tokenizer fitted on the
# training tokens and the same fixed length.
X_test_sequences = tokenizer.texts_to_sequences(X_test_temp)
X_test = pad_sequences(
    X_test_sequences,
    maxlen=max_token,
    padding="post",
    truncating="post",
)
X_test.shape

(1115, 80)

In [16]:
# model

# Embedding -> LSTM -> single sigmoid unit, i.e. a binary spam/ham classifier.
model = Sequential([
    # One 32-dimensional vector per token id; the table has one row per
    # distinct token counted in words_statistic.
    Embedding(len(words_statistic), 32, input_length=max_token),
    LSTM(64, dropout=.1),
    Dense(1, activation="sigmoid"),
])

optimizer = Adam(lr=7e-5)

model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 32)            301792    
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 326,689
Trainable params: 326,689
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for 20 epochs; the held-out test split doubles as validation data.
result = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

Train on 4457 samples, validate on 1115 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# index = dict([(value, key) for (key, value) in word_index.items()])
# def get_original_text(text):
#     return "".join([index.get(i, "") for i in text])