#### README: LSTM Tweet Classifier with GloVe(Global Vectors for Word) embeddings.

##### You may download 'glove.6B.50d.txt' from https://nlp.stanford.edu/projects/glove/

In [0]:
import os
import numpy as np
import keras
from keras.datasets import reuters, imdb
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, GRU, Dense, Dropout, Activation, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd

EMBEDDING_DIM = 50
# load in training/test set
data = pd.read_csv(colab_data_path + 'tweets.160k.random.csv', encoding='utf-8')
data.head()

# Index/label value
data['label'].value_counts()
vocab_size = 20000

# Vectorize text corpus by turning text into a sequence of integers/vector
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(data['text'])

# Transforms each text into a sequence of integers
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index
tweets = sequence.pad_sequences(sequences, padding='post', maxlen=50)

labels = data['label'] # Either 0,4 
labels = labels.replace(4,1) # replace label '4' with '1' to facilitate one-hot encoding
x_train, x_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

y_train = keras.utils.to_categorical(y_train) # 2 classes
y_test = keras.utils.to_categorical(y_test)

embeddings_index = {}
GLOVE_DIR = "./"
f = open(os.path.join(GLOVE_DIR, colab_data_path + 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

model = Sequential()
model.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model.add(LSTM(
	units= 128,
	dropout = 0.0, # Adding dropout to embedding layer (same)
	recurrent_dropout = 0.0
	))
model.add(Dropout(0.1))
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Using TensorFlow backend.


128000 train sequences
32000 test sequences
Found 400000 word vectors.










Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-t