In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tqdm import tqdm

In [2]:
# Load the training set from train.csv in the working directory.
# Later cells read its 'event' (label) and 'text' (input) columns.
data = pd.read_csv('train.csv')

In [3]:
# Event label of every row, copied into a NumPy array.
labels = np.array(data['event'])

In [4]:
# Distinct event labels in sorted order, so positions in this list are stable
# from run to run.
events = list(set(labels))
events.sort()

In [5]:
# Map each event label to an integer class index: its position in `events`.
# setdefault keeps the first index seen for a label, should one ever repeat.
event_dict = {}
for i, e in enumerate(events):
    event_dict.setdefault(e, i)

Open Questions:

* Are there any words in this vocabulary that are not in the embedding vocabulary?
* Should I use the WordPiece tokenizer?
* How are the words distributed across labels? I should try to get a sense of this.

Looks like there are many words that only occur in one category, and similarly many that occur in a small number of categories.

The first option would be to try an LSTM encoder that feeds into a feedforward layer, followed by a final 48-way softmax that produces a probability for each event class.

* Input: fixed-length tensor of embedded text {batch_size x seq_length x embedding_size}
* N x LSTM (or GRU) unit: reads each input step and produces a final hidden vector, without dropout
* Feedforward layer: takes the outputs of the recurrent units and produces an output of dimension 48
* Softmax layer: final layer, normalizing the 48 outputs into class probabilities

In [10]:
# Paths and hyperparameters for the GloVe + LSTM event classifier.
#
# The GloVe file location can be overridden with the GLOVE_PATH environment
# variable so the notebook also runs on machines that do not share this
# directory layout; when the variable is unset the original path is used.
glove_path = os.environ.get(
    "GLOVE_PATH",
    "/Users/vijay/MIDS/w266/Project/AnsweringMachines/dataset/glove.6B.100d.txt",
)
embedding_size = 100       # width of each GloVe vector (the file is glove.6B.100d)
text_len = 60              # presumably the number of token positions per example
hidden_size_encoder = 200  # NOTE(review): not referenced in the visible cells
dropout = 0.3              # NOTE(review): not referenced in the visible cells
learning_rate = 0.01       # NOTE(review): not referenced in the visible cells
max_gradient_norm = 10     # NOTE(review): not referenced in the visible cells
batch_size = 32            # mini-batch size

In [11]:
from vocab import get_glove

In [12]:
# Load the embedding matrix and vocab mappings through the project's
# vocab.get_glove helper. Later cells look vectors up as
# emb_matrix[word2id[token]]; presumably id2word is the inverse mapping of
# word2id — TODO confirm against vocab.py.
emb_matrix, word2id, id2word = get_glove(glove_path, embedding_size)

  1%|          | 3790/400000 [00:00<00:10, 37893.53it/s]

Loading GLoVE vectors from file: /Users/vijay/MIDS/w266/Project/AnsweringMachines/dataset/glove.6B.100d.txt


100%|██████████| 400000/400000 [00:09<00:00, 41948.31it/s]


In [13]:
def get_XY(data, max_len=60, emb_dim=100, num_classes=48):
    """Turn a labelled text DataFrame into dense inputs and one-hot targets.

    Relies on the notebook-level globals ``event_dict`` (event label -> class
    index), ``word2id`` (token -> row index) and ``emb_matrix`` (embedding
    rows indexed by those ids).

    Args:
        data: DataFrame with an 'event' column (labels that are keys of
            ``event_dict``) and a 'text' column (strings).
        max_len: Number of token positions kept per text. Longer texts are
            truncated to this length; shorter ones stay zero-padded at the end.
        emb_dim: Length of each embedding vector; must match the row width of
            ``emb_matrix``.
        num_classes: Width of the one-hot label vectors.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape (len(data), max_len, emb_dim) and
        ``y`` has shape (len(data), num_classes).

    Raises:
        KeyError: If a row's event label is not a key of ``event_dict``.
    """
    total_len = len(data)
    y = np.zeros((total_len, num_classes))
    X = np.zeros((total_len, max_len, emb_dim))
    # Extract both columns once; data.iloc[i][col] would build a fresh row
    # Series on every access, which is very slow over the whole dataset.
    event_col = data['event'].values
    text_col = data['text'].values
    for i in range(total_len):
        y[i, event_dict[event_col[i]]] = 1
        tokens = text_col[i].lower().split(" ")
        # Truncate so texts longer than max_len cannot index past the end of X.
        for j, token in enumerate(tokens[:max_len]):
            if token in word2id:
                X[i, j] = emb_matrix[word2id[token]]
            else:
                # Out-of-vocabulary token: use a random vector. This is drawn
                # per occurrence and unseeded, so it differs between runs.
                X[i, j] = np.random.normal(size=emb_dim)
    return X, y

In [14]:
# Vectorize the whole dataset: X holds the per-token embeddings, Y the one-hot event labels.
X,Y = get_XY(data)

In [15]:
# Sanity check: (rows, 60 token positions, 100 embedding dimensions).
X.shape

(153956, 60, 100)

In [16]:
# Sanity check: (rows, 48 one-hot class slots).
Y.shape

(153956, 48)

In [20]:
# Positional hold-out split: the first n_train rows are used for training,
# the remainder is kept for evaluation.
# X_test has to be sliced from X (the features); slicing it from Y would make
# the test inputs a copy of the labels.
# NOTE(review): there is no shuffling — confirm train.csv is not ordered by event.
n_train = 100000
X_train, y_train = X[:n_train], Y[:n_train]
X_test, y_test = X[n_train:], Y[n_train:]

In [21]:
# Sanity check: the training split keeps the first 100000 rows of X.
X_train.shape

(100000, 60, 100)

In [None]:
# Encoder: two LSTMs read the same embedded sequence in opposite directions and
# their final outputs are concatenated before the dense classifier head.
# The input shape comes from the config cell so it stays in sync with the
# embedding width and sequence length.
inputs = Input(shape=(text_len, embedding_size))
output_1 = LSTM(100)(inputs)                     # reads the sequence left to right
output_2 = LSTM(100, go_backwards=True)(inputs)  # reads the sequence right to left
d = concatenate([output_1, output_2])
# NOTE(review): no activation is set, so this layer is linear — confirm that is intended.
d = Dense(100)(d)
# Softmax rather than sigmoid: the targets are one-hot and the model is trained
# with categorical_crossentropy, which expects a probability distribution over
# the 48 classes.
predictions = Dense(48, activation='softmax', name='main_output')(d)

In [None]:
# Wrap the layer graph (inputs -> predictions) in a Keras functional Model.
model = Model(inputs=inputs, outputs=predictions)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Training setup: Adam with its Keras defaults, cross-entropy over the one-hot
# labels, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the whole training split every epoch.
# steps_per_epoch is left at its default: Keras documents the argument as
# unsupported for NumPy array inputs, and a value of 20 with a batch size of 32
# would cap each epoch at 640 examples.
# The batch size comes from the config cell rather than a repeated literal.
model.fit(X_train, y_train, epochs=10, batch_size=batch_size)