In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import to_categorical
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [5]:
# Read the labelled tweets, drop every row containing a missing value, cast
# `region` to int and lower-case `text` -- all in one chain, so re-running
# the cell always starts again from the file on disk.
df = (
    pd.read_csv('/var/data/tweets_labelled_40k.csv')
    .dropna()
    .assign(
        region=lambda frame: frame['region'].astype(int),
        text=lambda frame: frame['text'].apply(lambda tweet: tweet.lower()),
    )
)

# Plain Python lists: the tweet texts and their integer region labels.
X = df['text'].tolist()
y = df['region'].tolist()

In [6]:
# Set Parameters

# -- Data shape and train/test split --
x_length = 200              # length every encoded tweet is padded/truncated to
num_unique_symbols = 500    # width of the one-hot symbol encoding
num_classes = 23            # number of region labels
training_ratio = 0.75
training_size = int(training_ratio * len(X))   # rows [0, training_size) are the training set

# -- Network size --
H = 200                     # units in each LSTM
num_layers = 2              # not referenced by the cells visible in this notebook section
dropout = 0.2               # not referenced by the cells visible in this notebook section

# -- Optimisation --
optimizer = 'rmsprop'
learning_rate = 1e-4        # not referenced by the cells visible in this notebook section
batch_size = 64
epochs = 100

In [7]:
# Character-level tokenizer: no filtering, lower-casing on, an explicit
# out-of-vocabulary token, and the vocabulary capped via `num_words`.
t = text.Tokenizer(
    num_words=num_unique_symbols - 1,
    oov_token='unk',
    char_level=True,
    lower=True,
    filters=None,
)
t.fit_on_texts(X)

# Encode every tweet as a list of integer symbol ids, then pad/truncate each
# one to `x_length` using 0 as the padding value.
X_seq = t.texts_to_sequences(X)
X_padded = sequence.pad_sequences(X_seq, value=0, maxlen=x_length)

# Positional split: the first `training_size` rows train, the rest test.
X_train, X_test = X_padded[:training_size], X_padded[training_size:]
y_train, y_test = y[:training_size], y[training_size:]

# One-hot region labels for both partitions.
one_hot_y_train, one_hot_y_test = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_test)
)

In [8]:
# Second copy of the encoded tweets with the final symbol of each removed,
# padded/truncated exactly like X_padded; only the training rows are kept
# in X_train_trimmed.
# NOTE(review): with pad_sequences' default left-padding this makes the
# trimmed row lag the full row by one position -- confirm that is the shift
# direction the decoder target is meant to have.
X_trimmed = [encoded[:-1] for encoded in X_seq]
X_trimmed_padded = sequence.pad_sequences(
    X_trimmed,
    value=0,
    maxlen=x_length,
)
X_train_trimmed = X_trimmed_padded[:training_size]

In [9]:
from keras.utils import Sequence

class OneHotBatchIn(Sequence):
  """Batch generator whose inputs and targets are both one-hot symbol data.

  Every batch has the form `([X, X], y)`: the one-hot encoding of a slice of
  `X_data` is supplied twice as input, and the matching slice of `y_data`,
  also one-hot encoded with `num_chars` categories, is the target.

  Args:
    X_data: integer-encoded data; sliced per batch and one-hot encoded.
    y_data: integer-encoded targets, sliced with the same bounds as `X_data`.
    batch_size: number of rows per batch.
    num_chars: number of categories used for both one-hot encodings.
    num_classes: stored on the instance; not used when building batches.
  """

  def __init__(self, X_data, y_data, batch_size, num_chars, num_classes):
    self.X_data, self.y_data = X_data, y_data
    self.batch_size = batch_size
    self.num_chars, self.num_classes = num_chars, num_classes

  def __len__(self):
    """Number of batches, counting a final partial batch."""
    batches = len(self.X_data) / float(self.batch_size)
    return int(np.ceil(batches))

  def __getitem__(self, batch_id):
    """Return batch number `batch_id` as `([X, X], y)`."""
    rows = slice(batch_id * self.batch_size, (batch_id + 1) * self.batch_size)
    encoded_inputs = to_categorical(self.X_data[rows], num_classes=self.num_chars)
    encoded_targets = to_categorical(self.y_data[rows], num_classes=self.num_chars)
    return [encoded_inputs, encoded_inputs], encoded_targets

In [10]:
class OneHotBatchOut(Sequence):
  """Batch generator pairing one-hot symbol inputs with one-hot class labels.

  Every batch has the form `([X, X], y)`: the one-hot encoding of a slice of
  `X_data` (with `num_chars` categories) is supplied twice as input, and the
  matching slice of `y_data` is one-hot encoded with `num_classes` categories.

  Args:
    X_data: integer-encoded data; sliced per batch and one-hot encoded.
    y_data: integer labels, sliced with the same bounds as `X_data`.
    batch_size: number of rows per batch.
    num_chars: number of categories for the input encoding.
    num_classes: number of categories for the label encoding.
  """

  def __init__(self, X_data, y_data, batch_size, num_chars, num_classes):
    self.X_data, self.y_data = X_data, y_data
    self.batch_size = batch_size
    self.num_chars, self.num_classes = num_chars, num_classes

  def __len__(self):
    """Number of batches, counting a final partial batch."""
    batches = len(self.X_data) / float(self.batch_size)
    return int(np.ceil(batches))

  def __getitem__(self, batch_id):
    """Return batch number `batch_id` as `([X, X], y)`."""
    rows = slice(batch_id * self.batch_size, (batch_id + 1) * self.batch_size)
    encoded_inputs = to_categorical(self.X_data[rows], num_classes=self.num_chars)
    encoded_labels = to_categorical(self.y_data[rows], num_classes=self.num_classes)
    return [encoded_inputs, encoded_inputs], encoded_labels

In [12]:
# Generators
# Training batches: one-hot X_train is fed to both model inputs and one-hot
# X_train_trimmed is the per-timestep target.
encoder_input_generator = OneHotBatchIn(X_train, X_train_trimmed, batch_size=batch_size,
                                      num_chars=num_unique_symbols, num_classes=num_classes)

# Validation batches must use the same target layout as the training batches.
# The model's output is a softmax over num_unique_symbols at every timestep,
# so region labels one-hot encoded to num_classes cannot be scored against it
# (Keras rejects the target shape when validation runs). Validate on the
# held-out rows of the trimmed sequences instead.
validation_generator = OneHotBatchIn(X_test, X_trimmed_padded[training_size:], batch_size=batch_size,
                                     num_chars=num_unique_symbols, num_classes=num_classes)

In [13]:
# Encoder: takes a sequence of one-hot vectors of width `num_unique_symbols`
# (the `None` leaves the number of timesteps unspecified) and runs it through
# an LSTM with `H` units that also returns its final hidden and cell states.
encoder_inputs = Input(shape=(None, num_unique_symbols))
encoder = LSTM(H, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# `encoder_outputs` is not used below; only the two final states are carried
# over to the decoder.
encoder_states = [state_h, state_c]

# Decoder: a second one-hot sequence input, processed by an LSTM whose
# initial state is `encoder_states`.
decoder_inputs = Input(shape=(None, num_unique_symbols))
# The decoder returns an output for every timestep (return_sequences=True)
# and its internal states as well. The returned states are discarded here
# (bound to `_`); the original author notes they are intended for use at
# inference time.
decoder_lstm = LSTM(H, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Map the decoder output to a softmax over the `num_unique_symbols` symbols.
decoder_dense = Dense(num_unique_symbols, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: inputs are [encoder_inputs, decoder_inputs], output is the
# softmax produced by `decoder_dense`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [14]:
# Compile with the optimizer selected in the parameter cell instead of
# repeating the string literal here.
# NOTE(review): `learning_rate` from the parameter cell is not applied -- a
# string identifier selects the optimizer with its default settings. Confirm
# whether the configured rate was meant to be used.
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Train from the batch generator, validating with `validation_generator`
# after each epoch.
model.fit_generator(generator=encoder_input_generator, epochs=epochs, validation_data=validation_generator, verbose=1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 500)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 500)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 200), (None, 560800      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 200),  560800      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

KeyboardInterrupt: 