In [1]:
import pandas as pd
import numpy as np
from numpy import asarray
from numpy import zeros
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42)

In [3]:
df_train = pd.read_csv('../data/sms_train.csv', encoding='latin')

In [4]:
df_train['Label'] = df_train['Label'].map({'info': 0, 'ham': 1, 'spam': 2})

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [6]:
# word tokenizer
t = Tokenizer(num_words=1000)
t.fit_on_texts(df_train['Message'])
vocab_size = len(t.word_index) + 1

In [7]:
vocab_size

39319

In [8]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(df_train['Label'])

LabelEncoder()

In [9]:
# split data

In [10]:
X_train, X_val = train_test_split(df_train)

In [11]:
# integer encode the documents
encoded_docs_train = t.texts_to_sequences(X_train['Message'])
encoded_docs_test = t.texts_to_sequences(X_val['Message'])
#print(encoded_docs)
# pad documents to a max length of 20 words
max_length = 20
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
print(padded_docs_train.shape, padded_docs_test.shape)

(22500, 20) (7500, 20)


In [12]:
encoded_Y_train = encoder.transform(X_train['Label'])
encoded_Y_test = encoder.transform(X_val['Label'])
# convert integers to dummy variables (i.e. one hot encoded)
train_labels = np_utils.to_categorical(encoded_Y_train)
test_labels = np_utils.to_categorical(encoded_Y_test)

In [13]:
train_labels.shape, test_labels.shape

((22500, 3), (7500, 3))

In [14]:
output_class = train_labels.shape[1]

In [15]:
output_class

3

In [16]:
model = Sequential()
e = Embedding(vocab_size, 100, input_length=max_length)
model.add(e)
model.add(Flatten())
model.add(Dense(output_class, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           3931900   
_________________________________________________________________
flatten_1 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 6003      
Total params: 3,937,903
Trainable params: 3,937,903
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
# plot_model(model, show_shapes=True, show_layer_names=True)

In [18]:
padded_docs_train.shape, padded_docs_test.shape

((22500, 20), (7500, 20))

In [19]:
# fit the model
model.fit(padded_docs_train, train_labels, epochs=5, verbose=1)
# evaluate the model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11693aeb8>

In [20]:
loss, accuracy = model.evaluate(padded_docs_train, train_labels, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 99.991111


In [21]:
loss, accuracy = model.evaluate(padded_docs_test, test_labels, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 99.626667
