# Installation

In [None]:
!pip install keras

In [None]:
# Nothing to install here: `collections` (imported below for `Counter`) is part
# of the Python standard library. The PyPI distribution named `collection` is a
# different third-party package that this notebook does not use.

# Imports

In [21]:
import os
import numpy as np
import pandas as pd
import string
# import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense


# Implementation

In [3]:
# Load the first two columns of the raw CSV. No header row is parsed, so the
# columns are labelled 0 and 1 and the file's own header line arrives as row 0.

data = pd.read_csv(
    "spam.csv",
    header=None,
    usecols=[0, 1],
    encoding="latin-1",
)

In [4]:
# rename header
#
# The CSV was read with header=None, so the file's header line sits in row 0 as
# data and the two columns carry the integer labels 0 and 1. Drop that row and
# give the columns their working names in a single step.
#
# The earlier form `data.iloc[0][0] = "tags"` was a chained assignment: it wrote
# into the temporary row object returned by `data.iloc[0]`, which is not
# guaranteed to propagate back to `data`.

header = {0: "tags", 1: "text"}
data = data.iloc[1:].rename(columns = header)

In [5]:
# remove punctuation

def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept.append(symbol)
    return ''.join(kept)



# Store the punctuation-free version of each message in its own column.
data["no_punct"] = data["text"].apply(remove_punct)

In [6]:
# remove stopwords

# The NLTK stopwords corpus stores this list under the lowercase file id
# "english"; the capitalised spelling only resolves on case-insensitive
# filesystems.
stop_words = set(stopwords.words("english"))

def tokenize(text):
    """Split ``text`` on whitespace, lowercase each piece and drop the pieces
    that appear in the module-level ``stop_words`` set."""
    lowered = (piece.lower() for piece in text.split())
    return [piece for piece in lowered if piece not in stop_words]



# Tokenise the punctuation-free messages into lowercase, stop-word-free lists.
data["tokens"] = data["no_punct"].apply(tokenize)

In [7]:
# labels to int   spam = 1, ham = 0

def label_to_num(label):
    """Map a tag to its integer class: 1 for "spam", 0 for anything else."""
    if label == "spam":
        return 1
    return 0
# Encode every tag as its integer class label.
data["labels"] = data["tags"].apply(label_to_num)



data

Unnamed: 0,tags,text,no_punct,tokens,labels
1,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, jurong, point, crazy, available, bugis, n...",0
2,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]",0
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",1
4,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, early, hor, u, c, already, say]",0
5,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t...",0
...,...,...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...,"[2nd, time, tried, 2, contact, u, u, å£750, po...",1
5569,ham,Will Ì_ b going to esplanade fr home?,Will Ì b going to esplanade fr home,"[ì, b, going, esplanade, fr, home]",0
5570,ham,"Pity, * was in mood for that. So...any other s...",Pity was in mood for that Soany other suggest...,"[pity, mood, soany, suggestions]",0
5571,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...,"[guy, bitching, acted, like, id, interested, b...",0


In [8]:
# count unique words

def count_words(textset):
    """Tally token frequencies over an iterable of token sequences.

    Returns a ``(counter, longest)`` pair: ``counter`` maps each token to the
    number of times it occurs across all rows, and ``longest`` is the largest
    number of tokens found in a single row (0 when ``textset`` is empty).
    """
    frequencies = Counter()
    longest = 0
    for row in textset:
        tokens = list(row)
        frequencies.update(tokens)
        longest = max(longest, len(tokens))
    return frequencies, longest

# Vocabulary statistics over the whole corpus: the longest message (in tokens)
# and the number of distinct words.
words_statistic, max_token = count_words(data["tokens"])
print(max_token, len(words_statistic), sep="\n")
words_statistic

80
9431


Counter({'go': 278,
         'jurong': 1,
         'point': 13,
         'crazy': 15,
         'available': 16,
         'bugis': 7,
         'n': 143,
         'great': 111,
         'world': 33,
         'la': 7,
         'e': 84,
         'buffet': 2,
         'cine': 7,
         'got': 238,
         'amore': 1,
         'wat': 96,
         'ok': 277,
         'lar': 38,
         'joking': 6,
         'wif': 27,
         'u': 1119,
         'oni': 4,
         'free': 275,
         'entry': 26,
         '2': 478,
         'wkly': 14,
         'comp': 11,
         'win': 71,
         'fa': 4,
         'cup': 8,
         'final': 18,
         'tkts': 4,
         '21st': 3,
         'may': 46,
         '2005': 3,
         'text': 188,
         '87121': 4,
         'receive': 38,
         'questionstd': 2,
         'txt': 163,
         'ratetcs': 2,
         'apply': 31,
         '08452810075over18s': 2,
         'dun': 55,
         'say': 91,
         'early': 33,
         'hor': 2,
   

In [9]:
# Stratified 80/20 split of the token lists and their labels, with a fixed seed.

X_train_temp, X_test_temp, y_train, y_test = train_test_split(
    data["tokens"],
    data["labels"],
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=data["labels"],
)

In [10]:
# tokenizer
#
# `num_words` caps the vocabulary, not the sequence length: the Keras Tokenizer
# only emits word indices below it. `max_token` is the length of the longest
# message, so passing it here threw away all but the most frequent few dozen
# words. Cap at the number of distinct words counted in `words_statistic`
# instead; that is also the `input_dim` given to the Embedding layer, so every
# index the tokenizer emits stays inside the embedding table.

tokenizer = Tokenizer(num_words = len(words_statistic))
tokenizer.fit_on_texts(X_train_temp)
word_index = tokenizer.word_index

In [11]:
# Turn the training token lists into integer sequences, then pad/truncate at
# the end so every row has exactly `max_token` entries.

X_train_sequences = tokenizer.texts_to_sequences(X_train_temp)
X_train = pad_sequences(
    X_train_sequences,
    maxlen=max_token,
    padding="post",
    truncating="post",
)
X_train.shape

(4457, 80)

In [12]:
# Encode the held-out token lists with the tokenizer fitted on the training
# split, then pad/truncate at the end to `max_token` entries per row.

X_test_sequences = tokenizer.texts_to_sequences(X_test_temp)
X_test = pad_sequences(
    X_test_sequences,
    maxlen=max_token,
    padding="post",
    truncating="post",
)
X_test.shape

(1115, 80)

In [22]:
# model: embedding -> 1D convolution -> global max pooling -> small dense head
# ending in a single sigmoid unit for the binary spam/ham decision.

model = Sequential([
    Embedding(len(words_statistic), 32, input_length=max_token),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 80, 32)            301792    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 76, 128)           20608     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 323,701
Trainable params: 323,701
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train for 10 epochs in mini-batches of 10, reporting loss/accuracy on the
# held-out split after every epoch.
result = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
)




Train on 4457 samples, validate on 1115 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
