In [32]:
from __future__ import print_function
import numpy as np
import keras
from keras.datasets import reuters, imdb
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, GRU, Dense, Dropout, Activation, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd

In [33]:
# load in training/test set
data = pd.read_csv('tweets.160k.random.csv', encoding='utf-8')
data.head()

Unnamed: 0,label,id,date,query,user,text
0,4,1985770747,Sun May 31 17:44:25 PDT 2009,NO_QUERY,vozabala,Getting ready for another week of fun and game...
1,0,2322735567,Wed Jun 24 23:10:08 PDT 2009,NO_QUERY,liannecab,"http://twitpic.com/8cp6u - I want it, sooo bad"
2,0,1972997427,Sat May 30 10:16:49 PDT 2009,NO_QUERY,nadhirarchangel,iloveyousincethe1stgradeitsthefirsttimewemet ...
3,0,2230992481,Thu Jun 18 17:53:46 PDT 2009,NO_QUERY,doughamlin,@extendr I can add :skype links but :aim links...
4,4,2053227537,Sat Jun 06 03:46:32 PDT 2009,NO_QUERY,Mariallama,just woke up at to rain. . . on the plus side ...


In [34]:
data['label'].value_counts()

4    80259
0    79741
Name: label, dtype: int64

In [35]:
vocab_size = 20000
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
tweets = sequence.pad_sequences(sequences, padding='post', maxlen=50)

In [36]:
tweets
# labels

array([[ 128,  206,   11, ...,    0,    0,    0],
       [  39,  150,   56, ...,    0,    0,    0],
       [ 664,    0,    0, ...,    0,    0,    0],
       ...,
       [ 749,   46,   88, ...,    0,    0,    0],
       [  55,    7,   64, ...,    0,    0,    0],
       [  97,    3, 2995, ...,    0,    0,    0]], dtype=int32)

In [37]:
labels = data['label']
labels = labels.replace(4,1) # replace label '4' with '1' to facilitate one-hot encoding
x_train, x_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2)

In [38]:
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

128000 train sequences
32000 test sequences


In [39]:
y_train = keras.utils.to_categorical(y_train) # 2 classes
y_test = keras.utils.to_categorical(y_test)

In [40]:
print(x_train.shape)
x_train

(128000, 50)


array([[  0,   0,   0, ...,   0,   0,   0],
       [104,   1,  57, ...,   0,   0,   0],
       [ 31, 194, 262, ...,   0,   0,   0],
       ...,
       [ 19,  46,   2, ...,   0,   0,   0],
       [  1,  80,   1, ...,   0,   0,   0],
       [599,   1,  20, ...,   0,   0,   0]], dtype=int32)

In [41]:
 model = Sequential()
model.add(Embedding(vocab_size, 128))
model.add(SimpleRNN(128))
model.add(Dense(2))
model.add(Activation('softmax'))

In [42]:
model.compile(loss='categorical_crossentropy',
 optimizer='adam',
 metrics=['accuracy'])

In [11]:
history = model.fit(x_train, y_train,
 batch_size=128,
 epochs=3,
 verbose=1,
 validation_split=0.2)

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
score = model.evaluate(x_test, y_test,
 batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.5414284701347352
Test accuracy: 0.7451875


## LSTM

In [52]:
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, 128))
model_lstm.add(LSTM(128))
model_lstm.add(Dense(2))
model_lstm.add(Activation('softmax'))

In [53]:
model_lstm.compile(loss='categorical_crossentropy',
 optimizer='adam',
 metrics=['accuracy'])

In [54]:
history = model_lstm.fit(x_train, y_train, batch_size=128,
 epochs=3,
 verbose=1,
 validation_split=0.2)

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [55]:
score_lstm = model_lstm.evaluate(x_test, y_test,
 batch_size=128, verbose=1)
print('Test score:', score_lstm[0])
print('Test accuracy:', score_lstm[1])

Test score: 0.6931576075553894
Test accuracy: 0.5058125


## GRU

In [56]:
model_gru = Sequential()
model_gru.add(Embedding(vocab_size, 128))
model_gru.add(GRU(128))
model_gru.add(Dense(2))
model_gru.add(Activation('softmax'))

In [57]:
model_gru.compile(loss='categorical_crossentropy',
 optimizer='adam',
 metrics=['accuracy'])

In [58]:
history_gru = model_gru.fit(x_train, y_train, batch_size=128,
 epochs=3,
 verbose=1,
 validation_split=0.2)

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [59]:
score_gru = model_gru.evaluate(x_test, y_test,
 batch_size=128, verbose=1)
print('Test score:', score_gru[0])
print('Test accuracy:', score_gru[1])

Test score: 0.6930803761482239
Test accuracy: 0.50578125
