In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [2]:
import chardet
file = "SpamSentiment1.csv"
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

data = pd.read_csv('SpamSentiment1.csv', encoding='Windows-1252')
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print(data.groupby('v1').nunique())

        v2
v1        
ham   4515
spam   653


In [4]:
# Keeping only the neccessary columns
data = data[['v1','v2']]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
print(type(data['v2'][0]))
print(data['v2'][1])
print(data['v2'][2])

<class 'str'>
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [6]:
data['v2'] = data['v2'].apply(lambda x: str(x).lower()) # all string to lowercase Read more about lambda() "https://realpython.com/python-lambda/"
data['v2'] = data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x))) # using regular expression preprocess the text by removing everything that is not [a-zA-z0-9\s]

In [7]:
print(type(data['v2']))

<class 'pandas.core.series.Series'>


In [8]:
print(data['v2'][0])
print(data['v2'][1])
print(data['v2'][2])

go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
ok lar joking wif u oni
free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s


In [9]:
max_fatures = 1000

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['v2'].values)

In [10]:
# Transforms each text in texts to a sequence of integers.
# Only top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.

# Transforms each text in texts to a sequence of integers. 
# So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.
X = tokenizer.texts_to_sequences(data['v2'].values)

In [11]:
# tokenizer.get_config()
# Returns the tokenizer configuration as Python dictionary.
print(tokenizer.get_config().keys())

# word_counts: A dictionary of words and their counts.
print("\ntokenizer.word_counts") 
print(tokenizer.word_counts) 

# document_count:An integer count of the total number of documents that were used to fit the Tokenizer.
print("\ntokenizer.document_count")
print(tokenizer.document_count) 

# word_index: A dictionary of words and their uniquely assigned integers.
print("\ntokenizer.word_index")
print(tokenizer.word_index)

# word_docs: A dictionary of words and how many documents each appeared in.
print("\ntokenizer.word_docs")
print(tokenizer.word_docs)



print('\nLen() of X:', len(X))
print('\n', X[:2])

dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])

tokenizer.word_counts

tokenizer.document_count
5571

tokenizer.word_index

tokenizer.word_docs

Len() of X: 5571

 [[46, 442, 794, 711, 680, 64, 9, 90, 119, 356, 154, 67, 58, 137], [48, 312, 443, 6]]


In [12]:
# Check the sequence of the text, Do we need to Pad ??
for i in range(4):
  print(X[i])
  print('len=', len(X[i]))

[46, 442, 794, 711, 680, 64, 9, 90, 119, 356, 154, 67, 58, 137]
len= 14
[48, 312, 443, 6]
len= 4
[50, 459, 9, 22, 4, 749, 899, 1, 179, 625, 261, 71, 1, 1, 313, 459, 79, 382]
len= 18
[6, 229, 142, 24, 357, 6, 160, 143, 60, 142]
len= 10


In [13]:
X = pad_sequences(X) # Pads sequences to the same length.
print('X.shape = ', X.shape)

X.shape =  (5571, 143)


In [14]:
# Check the sequence after padding, Which padding pre or post??
for i in range(4):
  print(X[i])
  print('len=', len(X[i]))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0  46 442 794 711 680  64   9  90 119 356 154  67  58 137]
len= 143
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0 

In [15]:
embed_dim = 143
lstm_out = 196

def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.summary()
    return model

In [16]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)

X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [17]:
model = createmodel()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 143, 143)          143000    
_________________________________________________________________
lstm (LSTM)                  (None, 196)               266560    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 409,954
Trainable params: 409,954
Non-trainable params: 0
_________________________________________________________________


In [18]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 1)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x231cdc81430>

In [19]:
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
print(score)
print(acc)
print(model.metrics_names)

58/58 - 3s - loss: 0.0939 - accuracy: 0.9777
0.09385243058204651
0.9777052998542786
['loss', 'accuracy']


In [20]:
model.save("modelx.h5")

In [21]:
from keras.models import load_model
 
# load model
modelx = load_model('modelx.h5')

In [22]:
import numpy as np
X=modelx.predict(X_test[0])
print(X_test[0])
print(Y_test[1])
print("Actual Value:",Y_test[1])
print("Predicted Value",np.argmax(X[1]))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 809  85  25  65 142  19   2  63 620
  14   6 141  21   7 183 512   6  97  14  10  29 838  44   4 992 709]
[1. 0.]
Actual Value: [1. 0.]
Predicted Value 0
