In [13]:
# LSTM with Dropout for sequence classification in the IMDB dataset
import numpy as np
import pandas as pd
import mlflow
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras import optimizers
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import to_categorical
import time
import datetime
# fix random seed for reproducibility
np.random.seed(7)

In [2]:
df = pd.read_csv('/var/data/tweets_labelled_40k.csv')
df.dropna(inplace=True)
df.region = df.region.astype(int)
df['text'] = df['text'].apply(lambda x:x.lower())
X = df['text'].tolist()
y = df['region'].tolist()
df_counts = df.groupby('region').count()
df_counts

Unnamed: 0_level_0,text
region,Unnamed: 1_level_1
0,517
1,176
2,44
3,2387
4,2202
5,1726
6,624
7,3760
8,1034
9,699


In [27]:
# Set Parameters
V = 50000
x_length = 50
training_ratio = .75
training_size = int(len(X)*training_ratio)
num_classes = 23
embedding_vector_length = 200
num_layers = 2
H = 200
epochs = 100
optimizer = 'rmsprop'
batch_size = 128
learning_rate = .001
dropout = 0.2

In [28]:
mlflow.set_experiment('Twitter 40k word-level')
mlflow.start_run()
mlflow.log_param('learning_rate', learning_rate)
mlflow.log_param('vocabulary', V)
mlflow.log_param('number_of_layers', num_layers)
mlflow.log_param('x_length', x_length)
mlflow.log_param('embedding_vector', embedding_vector_length)
mlflow.log_param('H', H)
mlflow.log_param('optimizer', optimizer)
mlflow.log_param('dropout', dropout)
mlflow.log_param('epochs', epochs)
mlflow.log_param('batch_size', batch_size)
mlflow.log_param('train_size', training_size)
mlflow.log_param('test_size', len(y)-training_size)

In [29]:
# Convert text to integer indices, separate test and training sets
t = text.Tokenizer(num_words=V, lower=True)
t.fit_on_texts(X)
X_seq = t.texts_to_sequences(X)
X_pad = sequence.pad_sequences(X_seq, maxlen=x_length)
X_train = X_pad[:training_size]
X_test = X_pad[training_size:]
y_train = y[:training_size]
y_test = y[training_size:]
one_hot_y_train = to_categorical(y_train, num_classes=num_classes)
one_hot_y_test = to_categorical(y_test, num_classes=num_classes)

print("Training set has {} examples, test set has {} examples".format(len(X_train), len(X_test)))

Training set has 30000 examples, test set has 10000 examples


In [None]:
# Build and run the model
opt = optimizers.RMSprop(lr=learning_rate, rho=0.9, epsilon=None, decay=0.0)

start_time = time.time()
model = Sequential()
model.add(Embedding(V, embedding_vector_length, input_length=x_length))
if num_layers > 1:
    for m in range(num_layers-1):
        model.add(LSTM(H, return_sequences=True))
    model.add(LSTM(H))
else:
    model.add(LSTM(H))

model.add(Dropout(dropout))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print(model.summary())
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='/var/models/twitter_40k_wordlevel_lstm_chk.h5', monitor='val_loss', save_best_only=True)]
model.fit(X_train, one_hot_y_train, epochs=epochs, callbacks=callbacks, batch_size=batch_size,
          validation_data=(X_test, one_hot_y_test))
# Final evaluation of the model
scores = model.evaluate(X_test, one_hot_y_test, verbose=0)
end_time = time.time()
run_time = datetime.timedelta(seconds=end_time-start_time)
print("Accuracy: %.2f%%" % (scores[1]*100))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 200)           10000000  
_________________________________________________________________
lstm_5 (LSTM)                (None, 50, 200)           320800    
_________________________________________________________________
lstm_6 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 23)                4623      
Total params: 10,646,223
Trainable params: 10,646,223
Non-trainable params: 0
_________________________________________________________________
None
Train on 30000 samples, validate on 10000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100

In [26]:
model_name = 'twitter_40k_wordlevel_lstm'
mlflow.log_param('model_name', model_name)
mlflow.log_param('notes', 'None')
mlflow.log_param('run_time', run_time)
mlflow.log_metric('accuracy', scores[1]*100)
mlflow.end_run()
model.save('/var/models/{}.h5'.format(model_name))

In [14]:
import random
test = ["Man I love a good bowl of borscht",
       "Did anyone see the Packers game last night?",
       "Did anyone see the Raiders game last night?",
       "Football is da bomb!",
       "Nothing like a poutine and some beer with the game",
       "The Bulls suck this year",
       "Bitch better have my money",
       "Dude that's hella tight",
       "The metro/non-metro split nationalized: it is, at this point essentially the politics of every state. The metro areas in Texas moved almost as sharply away from the Trump-led GOP as they did in PA, NJ or MN. Trump only won 13/100 largest US counties and 1/2 of those moved D",
       "The people who are going to pay for Jeff Bezo’s helipad in NYC"]

#
Xt = random.sample(X[training_size:], 100)

t_test = text.Tokenizer(num_words=V, lower=True)

t_test.fit_on_texts(Xt)
sequences = t_test.texts_to_sequences(Xt)
test_padded = sequence.pad_sequences(sequences, maxlen=pad_size)
predictions = model.predict_on_batch(test_padded)
np.argmax(predictions, axis=1)

array([ 7,  7, 14,  7, 14,  7, 14, 14,  7,  7,  7, 14, 14, 14,  7,  7, 14,
        7, 14,  7, 14,  7,  7,  7, 14,  7,  7,  7,  7,  7, 10,  7,  7,  7,
        7, 14,  7,  7, 14,  7,  7, 14,  7, 14, 14, 14, 14,  7,  7,  7, 14,
       14,  7, 14,  7,  7, 14,  7, 14, 14, 14,  7, 14, 14, 14,  7, 14, 14,
       14, 14,  7, 14,  7, 14, 14, 14,  7, 14,  7,  7, 14, 14,  7, 14,  7,
       14,  7, 14,  7, 14, 14,  7, 14,  7, 14,  7, 14, 14, 14,  7])