In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, LSTM, concatenate, Input, Dropout
from tensorflow.keras.models import Model
tf.enable_eager_execution()

In [2]:
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

TensorFlow version: 1.14.0
Eager execution: True


In [3]:
data = pd.read_csv('/home/vijjus/datasets/cdc_train_fixed.csv')

In [4]:
labels = np.array(data['event'])

In [5]:
events = sorted(set(labels))

In [6]:
event_dict = {}
for i, e in enumerate(events):
    if e not in event_dict:
        event_dict[e] = i

Looks like there are many words that only occur in one category, and similarly many that occur in a small number of categories.

The first option would be to try an LSTM encoder that feeds into a feedforward layer with a final 48-way softmax to produce the right value.

* Input: fixed length vector of text {batch_size x seq_length x embedding_size}
* N x GRU unit: taking in each input and producing a final hidden vector without dropout
* Feedforward Layer, taking input & output of GRU units to produce output dimension 48
* Softmax Layer: final layer for output calculation

In [7]:
glove_path="/home/vijjus/glove/glove.6B.50d.txt"
embedding_size=50
text_len=150
dropout=0.3
learning_rate=0.01
max_gradient_norm=10
batch_size=32
num_labels=len(events)

In [8]:
# Load embedding matrix and vocab mappings
from vocab import get_glove
emb_matrix, word2id, id2word = get_glove(glove_path, embedding_size)

  0%|          | 1018/400000 [00:00<00:39, 10179.67it/s]

Loading GLoVE vectors from file: /home/vijjus/glove/glove.6B.50d.txt


100%|██████████| 400000/400000 [00:06<00:00, 63898.81it/s]


In [9]:
def get_XY(data):
    unknowns = 0
    total_len = len(data)
    y = np.zeros((total_len,num_labels), dtype=np.float32)
    X = np.zeros((total_len,text_len,embedding_size), dtype=np.float32)
    for i in tqdm_notebook(range(total_len)):
        y[i][event_dict[data.iloc[i]['event']]] = 1
        text = data.iloc[i]['text'].lower()
        tokens = text.split(" ")
        for j, token in enumerate(tokens):
            if token in word2id:
                X[i][j] = emb_matrix[word2id[token]]
            else:
                raise ValueError("Token %s not found" %(token))
    print("{} tokens not in embedding dictionary".format(unknowns))
    return X, y               

In [10]:
X,Y = get_XY(data)

HBox(children=(IntProgress(value=0, max=153956), HTML(value='')))


0 tokens not in embedding dictionary


In [11]:
train_size = int(X.shape[0] * 0.7)

In [12]:
X_train, y_train = X[:train_size], Y[:train_size]
X_test, y_test = X[train_size:], Y[train_size:]

In [13]:
X_train.shape

(107769, 150, 50)

In [14]:
X_test.shape

(46187, 150, 50)

In [15]:
# generate batches
def get_batch(X, Y):
    indices = np.random.randint(len(X), size=batch_size)
    return X[indices], Y[indices]

In [20]:
inputs = Input(shape=(text_len,embedding_size))
output_1 = LSTM(100)(inputs)
output_2 = LSTM(100,go_backwards=True)(inputs)
d = concatenate([output_1, output_2])
d = Dropout(dropout)(d)
d = Dense(100)(d)
predictions = Dense(num_labels, activation='softmax', name='main_output')(d)

In [21]:
model = Model(inputs=inputs, outputs=predictions)

In [22]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 150, 50)]    0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 100)          60400       input_3[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 100)          60400       input_3[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 200)          0           lstm_2[0][0]                     
                                                                 lstm_3[0][0]               

In [23]:
model.trainable_variables

[<tf.Variable 'lstm_2/kernel:0' shape=(50, 400) dtype=float32, numpy=
 array([[ 0.08828513,  0.09555852,  0.10969433, ...,  0.05291204,
         -0.05420067, -0.00229403],
        [ 0.06844737,  0.07254052,  0.01860087, ...,  0.09238875,
         -0.00921651, -0.06601463],
        [ 0.10740297, -0.11267237,  0.00013068, ..., -0.04958347,
         -0.044721  , -0.0586406 ],
        ...,
        [-0.00141679, -0.06255758, -0.00490578, ..., -0.00791551,
          0.0258543 , -0.09531318],
        [-0.05129797,  0.10187654,  0.10733593, ...,  0.0045872 ,
         -0.04059001, -0.1099046 ],
        [-0.07093757,  0.10397485,  0.07792698, ..., -0.02630445,
         -0.0554563 ,  0.06727421]], dtype=float32)>,
 <tf.Variable 'lstm_2/recurrent_kernel:0' shape=(100, 400) dtype=float32, numpy=
 array([[ 0.05381787, -0.04817013,  0.04572508, ...,  0.06341285,
         -0.01958219, -0.00405331],
        [-0.00866438,  0.0121907 , -0.01266572, ...,  0.02328013,
          0.05462762,  0.04961293],
  

In [24]:
n_epochs = 5
#n_steps = len(X_train) // batch_size
n_steps = 200
optimizer = tf.keras.optimizers.Nadam(lr=learning_rate)
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
mean_loss = tf.keras.metrics.Mean()

In [25]:
def loss(model, x, y):
    y_ = model(x)

    return loss_object(y_true=y, y_pred=y_)

In [26]:
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

In [42]:
X_batch, y_batch = get_batch(X_train, y_train)
loss_value, grads = grad(model, X_batch, y_batch)
print("Step: {}, Initial Loss: {}".format(optimizer.iterations.numpy(),
                                          loss_value.numpy()))

optimizer.apply_gradients(zip(grads, model.trainable_variables))

print("Step: {}, Loss: {}".format(optimizer.iterations.numpy(),
                                  loss(model, X_batch, y_batch).numpy()))

Step: 0, Initial Loss: 3.4878227710723877
Step: 1, Loss: 3.255227565765381


In [None]:
train_loss_results = []
train_accuracy_results = []

for epoch in range(n_epochs + 1):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.Accuracy()

    print("Epoch: {}/{}".format(epoch, n_epochs))
    for step in range(1, n_steps + 1):
        # Optimize the model
        x, y = get_batch(X_train, y_train)
        loss_value, grads = grad(model, x, y)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        epoch_accuracy(y, model(x))
        
        if step % 10 == 0:
            hashes = step // 10
            ticker = '=' * hashes + ' ' * (20 - hashes)
            print("Step {}/{}: [{}] Loss: {:.3f}, Accuracy: {:.3%}".format(step, n_steps,
                                                                        ticker,
                                                                        epoch_loss_avg.result(),
                                                                        epoch_accuracy.result()),
                 end = '\r')

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                epoch_loss_avg.result(),
                                                                epoch_accuracy.result()))

Epoch: 0/5
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 000: Loss: 3.752, Accuracy: 0.320%s: 3.752, Accuracy: 0.320%
Epoch: 1/5
Epoch 001: Loss: 3.757, Accuracy: 0.310%s: 3.757, Accuracy: 0.310%
Epoch: 2/5
Epoch 002: Loss: 3.747, Accuracy: 0.332%s: 3.747, Accuracy: 0.332%
Epoch: 3/5

In [29]:
X_train_small, y_train_small = X_train[:1000], y_train[:1000]

In [30]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [31]:
model.fit(X_train_small, y_train_small, epochs=10, batch_size=32, steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f376ae97358>