In [35]:
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional
from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split

In [36]:
file1 = "./data/spam.csv"
file2 = "./data/imdb.csv"
file3 = "./data/yelp.csv"

df = pd.read_csv(file1,delimiter=',')


In [37]:
df = df.sample(frac=1)
X = df['content']
Y = df['category']

In [38]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
if Y.dtype == 'int64':
    Y = np.array(Y, dtype='str')

[nltk_data] Downloading package stopwords to /Users/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

In [40]:
vocab_size = 5000 # make the top list of words (common words)
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # OOV = Out of Vocabulary
training_portion = .8
units = 1024

In [41]:

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [42]:
train_sequences = tokenizer.texts_to_sequences(X_train)

In [43]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [44]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [45]:
labels = set()
for l in Y_train:
    labels.add(l)

In [46]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(Y_train))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(Y_test))

In [47]:
from keras.layers import Dense, Embedding, Dropout, GRU
from keras.models import Sequential
from keras import layers


model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(GRU(embedding_dim))
model.add(layers.Dense(5, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)



In [48]:
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs,verbose=2)

Epoch 1/10
148/148 - 7s - loss: 0.5208 - accuracy: 0.8573
Epoch 2/10
148/148 - 5s - loss: 0.4019 - accuracy: 0.8640
Epoch 3/10
148/148 - 6s - loss: 0.4012 - accuracy: 0.8640
Epoch 4/10
148/148 - 5s - loss: 0.4027 - accuracy: 0.8640
Epoch 5/10
148/148 - 5s - loss: 0.3988 - accuracy: 0.8640
Epoch 6/10
148/148 - 5s - loss: 0.4006 - accuracy: 0.8640
Epoch 7/10
148/148 - 5s - loss: 0.3404 - accuracy: 0.8718
Epoch 8/10
148/148 - 6s - loss: 0.0643 - accuracy: 0.9848
Epoch 9/10
148/148 - 5s - loss: 0.0361 - accuracy: 0.9911
Epoch 10/10
148/148 - 5s - loss: 0.0280 - accuracy: 0.9924


In [49]:
pred = model.predict(validation_padded)
pred = pred.argmax(axis=-1)
pred

array([2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1,
       2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,

In [50]:
test = []
for i in validation_label_seq:
    test.append(i[0])
test = np.asarray(test)

In [51]:
metrics.accuracy_score(pred, test)

0.9808612440191388