In [1]:
import os
import numpy as np
import keras
from keras.datasets import reuters, imdb
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, GRU, Dense, Dropout, Activation, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Read the labelled tweets; the train/test partitions are carved out of this
# frame further down.
data = pd.read_csv(
    'tweets.160k.random.csv',
    encoding='utf-8',
)

# Width of each word vector; used as the column count of the embedding matrix
# and as the output dimension of every Embedding layer below.
EMBEDDING_DIM = 50

# Preview the first five rows as the cell's output.
data.head(n=5)



Unnamed: 0,label,id,date,query,user,text
0,4,1985770747,Sun May 31 17:44:25 PDT 2009,NO_QUERY,vozabala,Getting ready for another week of fun and game...
1,0,2322735567,Wed Jun 24 23:10:08 PDT 2009,NO_QUERY,liannecab,"http://twitpic.com/8cp6u - I want it, sooo bad"
2,0,1972997427,Sat May 30 10:16:49 PDT 2009,NO_QUERY,nadhirarchangel,iloveyousincethe1stgradeitsthefirsttimewemet ...
3,0,2230992481,Thu Jun 18 17:53:46 PDT 2009,NO_QUERY,doughamlin,@extendr I can add :skype links but :aim links...
4,4,2053227537,Sat Jun 06 03:46:32 PDT 2009,NO_QUERY,Mariallama,just woke up at to rain. . . on the plus side ...


In [3]:
# Class balance check: number of rows for each distinct value of `label`.
data.loc[:, 'label'].value_counts()

4    80259
0    79741
Name: label, dtype: int64

In [4]:
# Upper bound on the tokenizer vocabulary (passed as num_words).
vocab_size = 20000

# Fit the tokenizer on the raw tweet text and keep its word -> index mapping.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['text'])
word_index = tokenizer.word_index

# Encode every tweet as a list of word indices, then bring all of them to a
# fixed length of 50 with the padding appended at the end ('post').
sequences = tokenizer.texts_to_sequences(data['text'])
tweets = sequence.pad_sequences(sequences, maxlen=50, padding='post')


In [5]:
# The raw labels are 0 and 4; map 4 -> 1 so the classes are the contiguous ids
# 0/1, which is what the one-hot encoding applied later expects.
labels = data['label'].replace(4, 1)

# Hold out 20% of the tweets for testing.
# random_state pins the shuffle so the same partition is produced on every
# Restart & Run All; stratify keeps the 0/1 ratio equal in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')



128000 train sequences
32000 test sequences


In [6]:
# One-hot encode the 0/1 targets to match the 2-unit softmax output of the
# models below.
# The ndim guard keeps this cell idempotent: it overwrites y_train / y_test in
# place, so re-running it unguarded would one-hot encode the already-encoded
# (n, 2) arrays a second time. num_classes is fixed at 2 so the width does not
# depend on which labels happen to be present in a partition.
if np.ndim(y_train) == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes=2)
if np.ndim(y_test) == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes=2)

# Parse the pre-trained GloVe file into a {word: float32 vector} lookup.
# Each non-empty line is a token followed by its whitespace-separated
# coefficients.
embeddings_index = {}
# The context manager closes the handle even if parsing raises, and the
# explicit UTF-8 encoding avoids depending on the platform's default codec.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index (values[0] would raise IndexError).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))



Found 400000 word vectors.


In [7]:
# Build the weight matrix for the Embedding layers: row k holds the pre-trained
# vector of the word whose tokenizer index is k. Rows that are never assigned
# (words absent from the GloVe lookup) keep their all-zero initial value.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros(shape=(vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

# Simple RNN

In [8]:
# Baseline network: frozen pre-trained embeddings -> SimpleRNN(128) -> 2-way
# softmax classifier.
# NOTE(review): the inputs are post-padded by pad_sequences and this Embedding
# does not set mask_zero, so the recurrent layer also steps over the trailing
# padding positions -- worth confirming that this is intended.
model = Sequential([
    Embedding(
        len(word_index) + 1,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    SimpleRNN(128),
    Dense(2),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.build()


In [9]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          6910550   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               22912     
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 6,933,720
Trainable params: 23,170
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [10]:
# Train the SimpleRNN baseline for 3 epochs in batches of 128, with 20% of the
# training data set aside for validation, then score it on the test split.
history = model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score = model.evaluate(x_test, y_test, verbose=1, batch_size=128)

# evaluate() returns the loss followed by the compiled metric (accuracy).
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score):
    print(_caption, _metric)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


# LSTM

- dropout layer (where present) is placed between the embedding and the LSTM
- `lstm1`: dropout rate 0.1
- `lstm2`: dropout rate 0.2
- `lstm3`: dropout rate 0.5
- `lstm4`: dropout rate 0.8
- `lstm5`: no dropout layer (dropout rate 0)

## `lstm1`
dropout rate = 0.1

In [11]:
# LSTM variant with Dropout(0.1) applied to the embedding output, i.e. before
# the LSTM layer. The embedding is initialised from embedding_matrix and frozen.
lstm1 = Sequential()
lstm1.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm1.add(Dropout(0.1))
lstm1.add(LSTM(128))
lstm1.add(Dense(2))
lstm1.add(Activation('softmax'))

lstm1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm1.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout (Dropout)            (None, None, 50)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [12]:
# Fit lstm1 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then compute its loss and accuracy on the test split.
history1 = lstm1.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score1 = lstm1.evaluate(x_test, y_test, verbose=1, batch_size=128)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


In [25]:
# Report the loss and accuracy that evaluate() returned for lstm1.
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score1):
    print(_caption, _metric)

Test score: 0.5283139944076538
Test accuracy: 0.7469062209129333


## `lstm2`
dropout rate = 0.2

In [13]:
# LSTM variant with Dropout(0.2) applied to the embedding output, i.e. before
# the LSTM layer. The embedding is initialised from embedding_matrix and frozen.
lstm2 = Sequential()
lstm2.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm2.add(Dropout(0.2))
lstm2.add(LSTM(128))
lstm2.add(Dense(2))
lstm2.add(Activation('softmax'))

lstm2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm2.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [14]:
# Fit lstm2 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then compute its loss and accuracy on the test split.
history2 = lstm2.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score2 = lstm2.evaluate(x_test, y_test, verbose=1, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


In [26]:
# Report the loss and accuracy that evaluate() returned for lstm2.
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score2):
    print(_caption, _metric)

Test score: 0.5322213768959045
Test accuracy: 0.734499990940094


## `lstm3`
dropout rate = 0.5

In [15]:
# LSTM variant with Dropout(0.5) applied to the embedding output, i.e. before
# the LSTM layer. The embedding is initialised from embedding_matrix and frozen.
lstm3 = Sequential()
lstm3.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm3.add(Dropout(0.5))
lstm3.add(LSTM(128))
lstm3.add(Dense(2))
lstm3.add(Activation('softmax'))

lstm3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm3.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm3.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_3 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [16]:
# Fit lstm3 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then compute its loss and accuracy on the test split.
history3 = lstm3.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score3 = lstm3.evaluate(x_test, y_test, verbose=1, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


In [27]:
# Report the loss and accuracy that evaluate() returned for lstm3.
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score3):
    print(_caption, _metric)

Test score: 0.5443196892738342
Test accuracy: 0.7303125262260437


## `lstm4`
dropout rate = 0.8

In [19]:
# LSTM variant with Dropout(0.8) applied to the embedding output, i.e. before
# the LSTM layer. The embedding is initialised from embedding_matrix and frozen.
lstm4 = Sequential()
lstm4.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm4.add(Dropout(0.8))
lstm4.add(LSTM(128))
lstm4.add(Dense(2))
lstm4.add(Activation('softmax'))

lstm4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm4.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm4.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_5 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [20]:
# Fit lstm4 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then compute its loss and accuracy on the test split.
history4 = lstm4.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score4 = lstm4.evaluate(x_test, y_test, verbose=1, batch_size=128)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


In [28]:
# Report the loss and accuracy that evaluate() returned for lstm4.
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score4):
    print(_caption, _metric)

Test score: 0.6183657646179199
Test accuracy: 0.6611875295639038


## `lstm5`
dropout rate = 0 (no `Dropout` layer is added)

In [23]:
# LSTM variant without any Dropout layer: frozen pre-trained embedding ->
# LSTM(128) -> 2-way softmax.
lstm5 = Sequential()
lstm5.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm5.add(LSTM(128))
lstm5.add(Dense(2))
lstm5.add(Activation('softmax'))

lstm5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm5.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm5.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [24]:
# Fit lstm5 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then compute its loss and accuracy on the test split.
history5 = lstm5.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score5 = lstm5.evaluate(x_test, y_test, verbose=1, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6753479838371277
Test accuracy: 0.5880625247955322


In [29]:
# Report the loss and accuracy that evaluate() returned for lstm5.
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score5):
    print(_caption, _metric)

Test score: 0.504899799823761
Test accuracy: 0.7541875243186951


# LSTM
- add dropout layer after LSTM
- `lstm_1`: dropout rate = 0.1
- `lstm_2`: dropout rate = 0.2
- `lstm_3`: dropout rate = 0.5
- `lstm_4`: dropout rate = 0.8
- `lstm_5`: dropout rate = 1 (listed only; no `lstm_5` model is defined in this section)

## `lstm_1`
dropout rate = 0.1

In [31]:
# LSTM variant with Dropout(0.1) applied to the LSTM output, i.e. between the
# LSTM and the Dense classifier. The embedding is frozen.
lstm_1 = Sequential()
lstm_1.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm_1.add(LSTM(128))
lstm_1.add(Dropout(0.1))
lstm_1.add(Dense(2))
lstm_1.add(Activation('softmax'))

lstm_1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_1.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm_1.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
lstm_6 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_7 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [32]:
# Fit lstm_1 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then score it on the test split.
history_1 = lstm_1.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score_1 = lstm_1.evaluate(x_test, y_test, verbose=1, batch_size=128)

# evaluate() returns the loss followed by the compiled metric (accuracy).
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score_1):
    print(_caption, _metric)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5097532868385315
Test accuracy: 0.7508437633514404


## `lstm_2`
dropout rate = 0.2

In [33]:
# LSTM variant with Dropout(0.2) applied to the LSTM output, i.e. between the
# LSTM and the Dense classifier. The embedding is frozen.
lstm_2 = Sequential()
lstm_2.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm_2.add(LSTM(128))
lstm_2.add(Dropout(0.2))
lstm_2.add(Dense(2))
lstm_2.add(Activation('softmax'))

lstm_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_2.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm_2.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 50)          6910550   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_8 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [34]:
# Fit lstm_2 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then score it on the test split.
history_2 = lstm_2.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score_2 = lstm_2.evaluate(x_test, y_test, verbose=1, batch_size=128)

# evaluate() returns the loss followed by the compiled metric (accuracy).
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score_2):
    print(_caption, _metric)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5117943286895752
Test accuracy: 0.7513437271118164


## `lstm_3`
dropout rate = 0.5

In [35]:
# LSTM variant with Dropout(0.5) applied to the LSTM output, i.e. between the
# LSTM and the Dense classifier. The embedding is frozen.
lstm_3 = Sequential()
lstm_3.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm_3.add(LSTM(128))
lstm_3.add(Dropout(0.5))
lstm_3.add(Dense(2))
lstm_3.add(Activation('softmax'))

lstm_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_3.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm_3.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 50)          6910550   
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_9 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [37]:
# Fit lstm_3 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then score it on the test split.
history_3 = lstm_3.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score_3 = lstm_3.evaluate(x_test, y_test, verbose=1, batch_size=128)

# evaluate() returns the loss followed by the compiled metric (accuracy).
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score_3):
    print(_caption, _metric)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5093563795089722
Test accuracy: 0.7519687414169312


## `lstm_4`
dropout rate = 0.8

In [38]:
# LSTM variant with Dropout(0.8) applied to the LSTM output, i.e. between the
# LSTM and the Dense classifier. The embedding is frozen.
lstm_4 = Sequential()
lstm_4.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm_4.add(LSTM(128))
lstm_4.add(Dropout(0.8))
lstm_4.add(Dense(2))
lstm_4.add(Activation('softmax'))

lstm_4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_4.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm_4.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 50)          6910550   
_________________________________________________________________
lstm_9 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 258       
_________________________________________________________________
activation_10 (Activation)   (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [39]:
# Fit lstm_4 (3 epochs, batches of 128, 20% of the training data held out for
# validation), then score it on the test split.
history_4 = lstm_4.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

score_4 = lstm_4.evaluate(x_test, y_test, verbose=1, batch_size=128)

# evaluate() returns the loss followed by the compiled metric (accuracy).
for _caption, _metric in zip(('Test score:', 'Test accuracy:'), score_4):
    print(_caption, _metric)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.516812264919281
Test accuracy: 0.7491875290870667


# LSTM
- add dropout layer after the Dense layer (before the softmax activation)
- `lstm1_`: dropout rate = 0.1
- `lstm2_`: dropout rate = 0.2
- `lstm3_`: dropout rate = 0.5
- `lstm4_`: dropout rate = 0.8

## `lstm1_`
dropout rate = 0.1

In [40]:
# LSTM variant where Dropout(0.1) is applied to the 2-unit Dense output, i.e.
# directly before the softmax activation. The embedding is frozen.
lstm1_ = Sequential()
lstm1_.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
lstm1_.add(LSTM(128))
lstm1_.add(Dense(2))
lstm1_.add(Dropout(0.1))
lstm1_.add(Activation('softmax'))

lstm1_.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm1_.build()

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
lstm1_.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 50)          6910550   
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               91648     
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 258       
_________________________________________________________________
dropout_11 (Dropout)         (None, 2)                 0         
_________________________________________________________________
activation_11 (Activation)   (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None
