1. Create a virtual env.

```
mkdir env
python3 -m venv env/

source env/bin/activate

```
2. Install the packages listed in the requirements file (e.g. `pip install -r requirements.txt`).
3. Have fun!



In [22]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Dense, Input, Flatten

from keras.utils import pad_sequences
from keras.utils.np_utils import to_categorical
import pandas as pd

from tqdm import tqdm
import numpy as np


# Load the labelled e-mail corpus and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/ham-vs-spam.csv')
df.head(5)

Unnamed: 0,IsSpam,Text
0,0,key issues going forwarda year end reviews rep...
1,0,congrats contratulations the execution the cen...
2,0,key issues going forwardall under control set...
3,0,epmi files protest entergy transcoattached our...
4,0,california power please contact kristin walsh ...


In [11]:
# Remove exact duplicate rows, then summarise the remaining columns per class.
df = df.drop_duplicates(keep='first')
df.groupby(by='IsSpam').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
IsSpam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,499,499,key issues going forwarda year end reviews rep...,1
1,500,500,take the reinsbecomeyour employer substantial ...,1


# Using pretrained GloVe word vectors

In [12]:
# https://www.kaggle.com/datasets/danielwillgeorge/glove6b100dtxt

%%time
embeddings_index = {}

path = 'models/'

f = open(path+'glove.6B.100d.txt')

for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print ()
print ('Found %s word vectors.' % len(embeddings_index))

400000it [00:08, 49214.40it/s]


Found 400000 word vectors.
CPU times: user 7.82 s, sys: 653 ms, total: 8.47 s
Wall time: 8.64 s





### This is what the file glove.6B.100d.txt looks like:
```
the -0.038194 -0.24487 0.72812...
of -0.1529 -0.24279 0.89837...
and -0.071953 0.23127 0.023731...
in 0.085703 -0.22201 0.16569...
a -0.27086 0.044006 -0.02026...
for -0.14401 0.32554 0.14257...
```

In [18]:
# Vocabulary cap for the tokenizer and the fixed length of every encoded message.
max_words = 20000
MAX_SEQUENCE_LENGTH = 500

# Keep a handle on the raw text; X is replaced by its encoded form below.
X_text = df['Text']

# Learn the vocabulary from the corpus and encode each message as word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_text)
sequences = tokenizer.texts_to_sequences(X_text)

# Fixed-width integer matrix that is fed to the network.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Encode the IsSpam labels as categorical (one column per class) targets.
y = to_categorical(np.array(df['IsSpam']))

In [19]:
# Word -> integer index mapping learned by the tokenizer; report how many words it holds.
word_index = tokenizer.word_index
vocab_size = len(word_index)
print(vocab_size)

26187


In [20]:
# Build the (vocabulary size + 1) x EMBEDDING_DIM weight matrix for the Embedding layer.
EMBEDDING_DIM = 100

# Every row starts as uniform random values in [0, 1). Rows whose word has a
# pretrained vector are overwritten below; all other rows (including row 0,
# which no word in word_index maps to here) keep their random values.
# NOTE(review): the matrix is unseeded, so it differs between runs. If
# all-zero rows for unknown words were intended, use np.zeros instead - confirm.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Word exists in the pretrained index: copy its vector into row i.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)
print(embedding_matrix[0][:10])

(26188, 100)
[0.92889177 0.16019044 0.17384576 0.28648751 0.66645798 0.67291318
 0.40877759 0.96809338 0.54092357 0.32167323]


In [33]:
# Frozen embedding layer initialised from embedding_matrix (trainable=False
# keeps those weights fixed during training).
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# 1-D CNN classifier on top of the frozen embeddings.
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# NOTE(review): two independent sigmoid outputs need not sum to 1; for two
# mutually exclusive classes a softmax output may be what is wanted - confirm.
model.add(Dense(2, activation='sigmoid'))

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 500, 100)          2618800   
                                                                 
 conv1d_3 (Conv1D)           (None, 496, 128)          64128     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 248, 128)         0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 31744)             0         
                                                                 
 dense_2 (Dense)             (None, 2)                 63490     
                                                                 
Total params: 2,746,418
Trainable params: 127,618
Non-trainable params: 2,618,800
______________________________________

In [35]:
# Configure training, then fit for 20 epochs with 20% of the rows held out for validation.
# NOTE(review): Keras takes the validation rows from the end of X/y before shuffling;
# if the rows are ordered by label the held-out set may be single-class - confirm.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
# Baseline without pretrained vectors: a 32-dimensional embedding learned from
# scratch, flattened into a small dense classifier. (The pretrained
# embedding_layer is not used by this model, so it is not rebuilt here.)
model = Sequential()
model.add(Embedding(max_words, 32, input_length=MAX_SEQUENCE_LENGTH))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 500, 32)           640000    
                                                                 
 flatten_6 (Flatten)         (None, 16000)             0         
                                                                 
 dense_11 (Dense)            (None, 128)               2048128   
                                                                 
 dense_12 (Dense)            (None, 2)                 258       
                                                                 
Total params: 2,688,386
Trainable params: 2,688,386
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Fit the current model for 10 epochs with 20% of the rows held out for validation.
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
# Score a hand-written message that is expected to look like spam.
cleaned_text = 'Why pay more for expensive meds when you can order them online and save $$$?'
print(cleaned_text)

# Encode with the fitted tokenizer and pad to the length the model was trained on.
# pad_sequences is the name imported at the top of the notebook; the bare
# `keras` module is never imported, so `keras.utils...` would raise NameError.
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
model.predict(padded_sequence)

Why pay more for expensive meds when you can order them online and save $$$?


array([[0.14961217, 0.8496825 ]], dtype=float32)

In [77]:
# Score a hand-written message that is expected to look like ordinary mail.
cleaned_text = 'hey jon, I cannot make the meeting tomorrow. can you please send me a time that you can meet on tuesday. thanks, harry'
print(cleaned_text)

# Encode with the fitted tokenizer and pad to the length the model was trained on.
# pad_sequences is the name imported at the top of the notebook; the bare
# `keras` module is never imported, so `keras.utils...` would raise NameError.
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
model.predict(padded_sequence)



hey jon, I cannot make the meeting tomorrow. can you please send me a time that you can meet on tuesday. thanks, harry


array([[0.86722744, 0.18174823]], dtype=float32)

In [78]:
# Score another hand-written message that is expected to look like ordinary mail.
cleaned_text = 'Can you attend a code review on Tuesday? Need to make sure the logic is rock solid.'
print(cleaned_text)

# Encode with the fitted tokenizer and pad to the length the model was trained on.
# pad_sequences is the name imported at the top of the notebook; the bare
# `keras` module is never imported, so `keras.utils...` would raise NameError.
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
model.predict(padded_sequence)

Can you attend a code review on Tuesday? Need to make sure the logic is rock solid.


array([[0.7578366 , 0.30653045]], dtype=float32)

In [79]:
# Score a message taken from the dataset: the fifth row labelled IsSpam == 0.
cleaned_text = df.loc[df['IsSpam'] == 0, 'Text'].iloc[4]
print(cleaned_text)

# Alternative input to try instead of the dataset row:
# cleaned_text = 'important news for usavity customers dear cheapsoft customer name annie kincaid and work cheapsoft llc you are important you spend your money and time cheapsoft and want let you know that have finished update our programs store want remind you that are offering now more than popularsoftware for low price with your personal customer discount please spend few moments yours precious time check our updated softwarestore http www dutyfreesoft all infowith regards customer service department annie kincaid'

# Encode with the fitted tokenizer and pad to the length the model was trained on.
# pad_sequences is the name imported at the top of the notebook; the bare
# `keras` module is never imported, so `keras.utils...` would raise NameError.
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
model.predict(padded_sequence).round(3)

california power please contact kristin walsh robert johnston for further clarification executive summary utility bankruptcy appears increasingly likely next week unless the state can clear three hurdles agreement payback for the bailout rate increases and further short term funding for dwr purchases power disagreement persists between gov davis and democrats the legislature how the state should paid back for its bailout the utilities the split over stock warrant plan versus state ownership utility transmission assets the economics the long term contracts appear show that rate hikes are unavoidable because the need amortize the undercollected rates the utilities during the recent rate freeze period air quality management district regulations are under review but offer limited scope for providing additional generation capacity legislature democrats are feeling intense pressure from the left wing consumer groups and are being forced least slow not stop davis bailout and rate hike plans s

array([[1., 0.]], dtype=float32)