In [64]:
# import pandas as pd
# import numpy as np

# from keras import backend as K

# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.text import one_hot
# from keras.preprocessing.sequence import pad_sequences

# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers import Embedding
# from keras.layers import LSTM
# from keras.layers import Dropout
# from keras.layers import Activation

# from keras.utils import to_categorical

# from gensim.models import KeyedVectors


# from sklearn.preprocessing import LabelEncoder

# import h5py

import pickle as pkl

## Question Classification

In this notebook I build a series of deep learning models that are used to classify questions based on their answer type and their detailed answer type. 

These models will initially be evaluated using a developement/test set, but then the entire dataset will be used for training, and the models will be evaluated by the utility of their predictions for the main downstream task - deduplicating question intent. 

Thank you Dr. Jason Brownlee for [this great post](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/) on how to use pretrained word embeddings!

### Contents

#### 0. Load `TREC` dataset



In [2]:
trec_train = pd.read_csv("../data/TREC/processed/train.csv")
trec_test = pd.read_csv("../data/TREC/processed/test.csv")

In [3]:
trec_train.head(10)

Unnamed: 0.1,Unnamed: 0,question,label,extended_label
0,1,How did serfdom develop in and then leave Russ...,DESC,DESC:manner
1,2,What films featured the character Popeye Doyle ?,ENTY,ENTY:cremat
2,3,How can I find a list of celebrities' real nam...,DESC,DESC:manner
3,4,What fowl grabs the spotlight after the Chines...,ENTY,ENTY:animal
4,5,What is the full form of . com ?,ABBR,ABBR:exp
5,6,What contemptible scoundrel stole the cork fro...,HUM,HUM:ind
6,7,What team did baseball's St . Louis Browns bec...,HUM,HUM:gr
7,8,What is the oldest profession ?,HUM,HUM:title
8,9,What are liver enzymes ?,DESC,DESC:def
9,10,Name the scar - faced bounty hunter of The Old...,HUM,HUM:ind


#### 0.1 Shuffle TREC dataset

In [4]:
trec_train = trec_train.sample(frac=1.0, random_state = 550)
trec_test = trec_test.sample(frac = 1.0, random_state = 550)

---

### 1. Prepare document encoding 

Using Kera's `Tokenizer` class, create a dictionary of all the types in both the quora and the TREC datasets. Encode each document as a vector of indecies of the corresponding types in the dictionary. 

#### 1.1  `Quora` dataset

In [5]:
train_quora = pd.read_csv("../data/processed/train.csv")

In [6]:
train_quora.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,1,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,2,1,3,4,What is the story of Kohinoor Koh - i - Noor D...,What would happen if the Indian government sto...,0
2,3,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,4,3,7,8,Why am I mentally very lonely ? How can I solv...,Find the remainder when math 23 24 math is div...,0
4,5,4,9,10,"Which one dissolve in water quickly sugar , sa...",Which fish would survive in salt water ?,0


#### 1.2 Get all the questions, from both datasets, in Numpy array


In [7]:
all_questions = trec_train['question'].append(trec_test['question']).append(
    train_quora['question1']).append(train_quora['question2']).values


In [8]:
all_questions[0]

'Who comprised the now - defunct comic book team known as the Champions ? '

In [9]:
# Prepare tokenizer 
tokenizer = Tokenizer()

In [10]:
# Create a dictionary for all the quesions in the joint datasets
tokenizer.fit_on_texts(all_questions)

In [11]:
# The number of types in the joint datset
vocab_size = len(tokenizer.word_index) + 1
vocab_size

93261

In [12]:
# integer encode the questions
encoded_questions = tokenizer.texts_to_sequences(all_questions)

In [13]:
# an example of a question integer embedding
print(all_questions[0])
encoded_questions[0]

Who comprised the now - defunct comic book team known as the Champions ? 


[39, 28303, 1, 165, 35455, 3838, 161, 765, 582, 46, 1, 7202]

In [14]:
# Another example. Notice how the word `how` is consistently endoded as the integer 5. 
print(all_questions[2])
encoded_questions[2]

What is the wingspan of a condor ? 


[2, 3, 1, 54958, 10, 6, 41954]

#### 1.3 Save the tokenizer


In [68]:
with open('../models/tokenizer.pickle', 'wb') as handle:
    pkl.dump(tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL)

---

### 2. Prepare the embedding matrix

In [15]:
# load the pretrained fasttext embeddings (this takes a while)
embedding_model = KeyedVectors.load_word2vec_format('../data/embeddings/wiki.en.vec')

In [16]:
# Each row in the matrix is the embedding of one word in the joint datasets. 
# The row index corresponds to the integer ecoding of that word. 
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

---

### 3. An LSTM experiment parameter class

I will likely exeperiment with many different models, with many different hyperparameters. It will be useful to keep all those parameters contained in one place

In [17]:
class LstmParams(object):
    
    def __init__(self,
                 sequence_length = 10, 
                 labels = "basic"):
        
        # Number of tokens to include in each sequence
        self.sequence_length = sequence_length
        # whether to use basic or extended labels
        self.labels = labels
        
        # Keep track of the raw training and test questions
        self.train_questions = trec_train['question'].values
        self.test_questions = trec_test['question'].values
        
        # Encode the training and test quetions
        self.train_questions_encoded = tokenizer.texts_to_sequences(self.train_questions)
        self.test_questions_encoded = tokenizer.texts_to_sequences(self.test_questions)
        
        # Store a label encode for each class. This will be particular to label type of the problem
        self.label_encoder = self.get_encoder()
        
       
    
    '''
    Fit a label encoder to the data, to represent the labels as one-hot vectors
    The dimension of these vectors will depend on if the user decided to predict
    basic or extended labels, which have 6 and 50 potential categories, respectively. 
    '''
    def get_encoder(self):
        # get all the labels, train and test sets, in one list
        all_labels = self.get_train_labels().tolist() + self.get_test_labels().tolist()
        # fit a label encoder to those labelse
        encoder = LabelEncoder().fit(np.array(all_labels))
        return(encoder)
    
    
    '''
    Decode one-hot representation of labels back to regular labels.
    input: an array of one-hot label arrays. 
    outpu: an array of regular character labels. 
    '''
    def decode_labels(self,onehot_labels):
        # first, get the labels as integers
        integer_labels = [np.where(r==1)[0][0] for r in onehot_labels]
        # now, return the decoded label
        return(self.label_encoder.inverse_transform(integer_labels))
    
    '''
    Return the labels as a numpy array of strings 
    Which labels to return depends on the objects 'labels' parameter. 
    '''
    def get_train_labels(self):
        if self.labels == "basic":
            return(trec_train['label'].values)
        elif self.labels == "extended":
            return(trec_train["extended_label"].values)
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(trec_train['label'].values)
        
    def get_test_labels(self):
        if self.labels == "basic":
            return(trec_test['label'].values)
        elif self.labels == "extended":
            return(trec_test["extended_label"].values)
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(trec_test['label'].values)
        

    '''
    Return the labels of the as a numpy ndarray, using one-hot encoding. 
    This is for transparency in my Neural Network archetecture. 
    '''
    def get_train_labels_onehot(self):
        return(to_categorical(self.label_encoder.transform(self.get_train_labels())))

        
    def get_test_labels_onehot(self):
        return(to_categorical(self.label_encoder.transform(self.get_test_labels())))
        
    
    '''
    Return the encoded questions after padding. 
    Padding (or truncating) amount depends on attribute `self.sequence_length`
    '''
    def get_train_padded(self):
        padded = pad_sequences(self.train_questions_encoded, 
                              maxlen = self.sequence_length,
                              padding = "post", 
                              truncating = "post")
        return(padded)
    
    def get_test_padded(self):
        padded = pad_sequences(self.test_questions_encoded, 
                              maxlen = self.sequence_length,
                              padding = "post", 
                              truncating = "post")
        return(padded)
    
    '''
    Get the number of classes (output layer dimension). 
    This is the number of unique classes. 
    '''
    def get_num_classes(self):
        n_unique = len(np.unique(self.get_test_labels().tolist() + self.get_train_labels().tolist()))
        return(n_unique)

--- 

### 4. A first model for the basic labels

First, I'll try the most basic vanilla LSTM for the TREC classification problem (simple labels)


#### 4.0 Initialize Model parameters

In [18]:
params1 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [19]:
model1 = Sequential()

#### 4.1 Add layers

In [20]:
# Add the word embedding layer
model1.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params1.sequence_length,
                     trainable = False))

In [21]:
# Add an LSTM layer
model1.add(LSTM(params1.get_num_classes(), activation="softmax"))

In [22]:
model1.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [23]:
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_1 (LSTM)                (None, 6)                 7368      
Total params: 27,985,668
Trainable params: 7,368
Non-trainable params: 27,978,300
_________________________________________________________________


#### 4.2 Train!

In [28]:
# fit model
model1.fit(x = params1.get_train_padded(),
           y = params1.get_train_labels_onehot(),
           epochs = 200
          )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x23b269b90>

In [29]:
print "Test Set Accuracy: {0}%".format(model1.evaluate(params1.get_test_padded(),
                params1.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 81.0000000477%


---

### 5. A first model for the extended labels

On the very first try, I got a test set accuracy of 80% when predicting the basic labels. Although without the use of a developmement set and without using proper experimentation I can't claim anything about the generalizability of this first classifier, it does show me that the proble of prdicting the basic label is a very tractable one. 

Now, I'll train this same basic classifier to predict the extended label to see how much harder that problem is, if at all.

In [335]:
params2 = LstmParams(sequence_length = 10, labels="extended")

In [339]:
model2 = Sequential()

In [340]:
# Add the word embedding layer
model2.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params2.sequence_length,
                     trainable = False))

In [341]:
# Add an LSTM layer
model2.add(LSTM(params2.get_num_classes(), activation="softmax"))

In [342]:
model2.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [343]:
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 10, 300)           27936600  
_________________________________________________________________
lstm_10 (LSTM)               (None, 50)                70200     
Total params: 28,006,800
Trainable params: 70,200
Non-trainable params: 27,936,600
_________________________________________________________________


In [344]:
# fit model
model2.fit(x = params2.get_train_padded(),
           y = params2.get_train_labels_onehot(),
           epochs = 200
          )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x2d238ce10>

In [345]:
print "Test Set Accuracy: {0}%".format(model2.evaluate(params2.get_test_padded(),
                params2.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 55.8000000477%


As suspected, predicting the fine grained label is a much harder problem. The training accuracy never exceeds 65%, which indicates that we may need to include a wider window, and perhaps a more complex archetecture. 

Further, we can already see signs of overfitting, as the test error is much higher than the training error. If we add more complex layers, it may be worth adding some dropout units in order to combat overfitting.

---

### 6. Adding a dense layer when predicting basic labels

It might be worth adding a dense layer to the basic label classifier to learn more complex functions. If there are signs of overfitting, then I'll add some dropout units to the LSTM cells.  

In [30]:
params3 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [31]:
model3 = Sequential()

#### 6.1 Add embedding and LSTM layers

In [32]:
# Add the word embedding layer
model3.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params3.sequence_length,
                     trainable = False))

I'll use a dense hidden layer of 50 nodes after the LSTM

In [33]:
# Add an LSTM layer
model3.add(LSTM(50))

In [34]:
# add a Dense laer, and apply the softmax activation on their outputs. 
model3.add(Dense(params3.get_num_classes(), activation='softmax'))

In [35]:
model3.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [36]:
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


#### 6.2 Train that model!

In [37]:
# fit model
model3.fit(x = params3.get_train_padded(),
           y = params3.get_train_labels_onehot(),
           epochs = 100
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1f2c7cf10>

In [38]:
print "Test Set Accuracy: {0}%".format(model3.evaluate(params3.get_test_padded(),
                params3.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 83.0%


Clearly this network is overfit. WE achieve almost perfect accuracy on the training set, but the test accuracy is no better than the simple LSTM modle. 

#### 6.3 Adding a dropout layer

I'll re-use the same archetecutre, but this time use dropout layers between the embedding and LSTM layers, as well as between the LSTM and dense layers.

Later, it might be worth using Within-cell recurrent dropout, provided via the `Keras` interface. 

In [24]:
params4 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [25]:
model4 = Sequential()

In [26]:
# Add the word embedding layer
model4.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params4.sequence_length,
                     trainable = False))

In [27]:
# add a droput layer
model4.add(Dropout(.2))

In [28]:
# Add an LSTM layer
model4.add(LSTM(50))

In [29]:
# another dropout before a dense layer
model4.add(Dropout(.2))

In [30]:
# add a Dense layer, and apply the softmax activation on their outputs. 
model4.add(Dense(params4.get_num_classes(), activation='softmax'))

In [31]:
model4.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [32]:
model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 300)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


In [33]:
# fit model
model4.fit(x = params4.get_train_padded(),
           y = params4.get_train_labels_onehot(),
           epochs = 125
          )

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<keras.callbacks.History at 0x1f2ced290>

In [74]:
print "Test Set Accuracy: {0}%".format(model4.evaluate(params4.get_test_padded(),
                params4.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 86.4000000477%


In [62]:
# Save the model for later use
model4.save("../models/trec_lstm1.h5")

In [76]:
# Save the associated encoder
with open('../models/encoder1.pickle', 'wb') as handle:
    pkl.dump(params4.label_encoder, handle)

#### 6.4 Trying dropout and recurrent dropout within the LSTM cells

This will mask some of the time-specific idiosyncracies in the training data

In [35]:
params5 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [36]:
model5 = Sequential()

In [37]:
# Add the word embedding layer
model5.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params5.sequence_length,
                     trainable = False))

In [38]:
# Add an LSTM layer
model5.add(LSTM(50,dropout=0.2, recurrent_dropout=0.2))

In [39]:
# add a Dense layer, and apply the softmax activation on their outputs. 
model5.add(Dense(params5.get_num_classes(), activation='softmax'))

In [40]:
model5.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [41]:
model5.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


In [42]:
# fit model
model5.fit(x = params5.get_train_padded(),
           y = params5.get_train_labels_onehot(),
           epochs = 150
          )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x1f2f6e310>

In [43]:
print "Test Set Accuracy: {0}%".format(model5.evaluate(params5.get_test_padded(),
                params5.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 88.1999999046%


There's no telling whether this increase in performance really means that using internal dropout leads to a better classifier - I'm certainly overfitting to the test data by tuning my hyperparameters according to the test error. 

It does show, however, that internal droput seems to combat overfitting reasonably effectively. Still not enough. 

In [77]:
# Save model
model5.save("../models/trec_lstm2.h5")

#also write associated encoder
with open("../models/encoder2.pickle", "wb") as handle:
    pkl.dump(params5.label_encoder, handle)

#### 6.4 expanding the word context window. 

So far, I've been working with a context window of 10 words. Perhaps 15 will be better? I'll try it with my two most trecent model configurations, `model4` and `model5`

In [44]:
params6 = LstmParams(sequence_length = 15,
                    labels = "basic")

In [45]:
model6 = Sequential()

In [46]:
# Add the word embedding layer
model6.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params6.sequence_length,
                     trainable = False))

In [47]:
# add a dropout layer
model6.add(Dropout(.2))

In [48]:
model6.add(LSTM(50))

In [49]:
model6.add(Dropout(.2))

In [50]:
# add a Dense layer, and apply the softmax activation on their outputs. 
model6.add(Dense(params6.get_num_classes(), activation='softmax'))

In [51]:
model6.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [88]:
model6.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 15, 300)           27978300  
_________________________________________________________________
dropout_3 (Dropout)          (None, 15, 300)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


In [53]:
# fit model
model6.fit(x = params6.get_train_padded(),
           y = params6.get_train_labels_onehot(),
           epochs = 100
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x23cb1c2d0>

In [54]:
print "Test Set Accuracy: {0}%".format(model6.evaluate(params6.get_test_padded(),
                params6.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 85.8000000477%


It makes no obvious difference. 

Because the data is so small, and I know my LSTM has a propensity to overfit, I prefer a smaller context window over a larger one (Occam's razor). 

Intuitively, it would make sense that the neccessary window for learning the broad categorization of sentence will be small, becuas seeing the words _What is the..._ versus _Who is the..._ might already tell you that the first question is a entity, while the second is an human. 

#### 6.5 Stacked LSTMs

I'll now try stacking two LSTM's on top of one another, and using recurrent dropout. Hopefully, the second LSTM will learn some other (more intersting) features than the dense layer did. 

In [52]:
params7 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [53]:
model7 = Sequential()

In [54]:
# Add the word embedding layer
model7.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params7.sequence_length,
                     trainable = False))

In [55]:
# Add an LSTM layer
model7.add(LSTM(100,dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

In [56]:
model7.add(LSTM(params7.get_num_classes(), dropout=0.2, recurrent_dropout=0.2, activation='softmax'))

In [57]:
model7.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [58]:
model7.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_5 (LSTM)                (None, 10, 100)           160400    
_________________________________________________________________
lstm_6 (LSTM)                (None, 6)                 2568      
Total params: 28,141,268
Trainable params: 162,968
Non-trainable params: 27,978,300
_________________________________________________________________


In [59]:
#fit model
model7.fit(x = params7.get_train_padded(),
           y = params7.get_train_labels_onehot(),
           epochs = 100
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x245709ad0>

In [78]:
print "Test Set Accuracy: {0}%".format(model7.evaluate(params7.get_test_padded(),
                params7.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 87.0%


In [79]:
# Save model
model7.save("../models/trec_lstm3.h5")

#also write associated encoder
with open("../models/encoder3.pickle", "wb") as handle:
    pkl.dump(params7.label_encoder, handle)

---

### 7. Merge, Re-train and Save

As I did not use a validation set, I can not make any claim about the usefulness of the models above. However, there are three that are intuitively best - specifically those that combat overfitting with dropout units. 

I will merge the training and test datasets into one dataset. Then I will fit each of these model configurations again to the augmented dataset, and save the resulting models. These are the models I will apply to the Quora data later on. 

####  7.0 Merge the datasets

In [84]:
# All the data in one place
trec_merged = pd.concat([trec_train, trec_test])

#### 7.1 Train a label encoder on the merged dataset

This will be used to convert labels to one-hot encodings, and back. 

In [95]:
# Train a label encoder
encoder = LabelEncoder().fit(trec_merged['label'].values)

In [98]:
# Save the label encoder
with open("../models/trec_label_encoder.pickle", "wb") as handle:
    pkl.dump(encoder, handle)

#### 7.2 Encode questions as padded index vectors

In [87]:
# A function which takes in a numpy array of quesitions (strings)
# and returns padded index vectors usable by deep learning models. 
def encode_and_pad(questions, sequence_length = 10):
    # questions encoded as index vectors
    encoded = tokenizer.texts_to_sequences(questions)
    # padded squences to be of length [sequence_length]
    padded = pad_sequences(encoded, 
                            maxlen = sequence_length,
                            padding = "post", 
                            truncating = "post")
    return(padded)

#### 7.3 Convert labels to one-hot encodings

In [96]:
def one_hot_labels(labels):
    return(to_categorical(encoder.transform(labels)))

#### 7.4 Retrain and save models

##### Model 1 : 


In [99]:
model1_merged = Sequential()

In [100]:
# Add the word embedding layer
model1_merged.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = 10,
                     trainable = False))

In [101]:
# add a droput layer
model1_merged.add(Dropout(.2))

In [102]:
model1_merged.add(LSTM(50))

In [103]:
# add a droput layer
model1_merged.add(Dropout(.2))

In [105]:
# add a Dense layer, and apply the softmax activation on their outputs. 
model1_merged.add(Dense(6, activation='softmax'))

In [106]:
model1_merged.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [107]:
model1_merged.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 300)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


In [109]:
# fit model
model1_merged.fit(x = encode_and_pad(trec_merged['question'].values),
           y = one_hot_labels(trec_merged['label'].values),
           epochs = 125
          )

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<keras.callbacks.History at 0x23d8a2d90>

In [110]:
# Save model
model1_merged.save("../models/trec_lstm1.h5")

##### Model 2:


In [119]:
model2_merged = Sequential()

In [120]:
# Add the word embedding layer
model2_merged.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = 10,
                     trainable = False))

In [121]:
# Add an LSTM layer
model2_merged.add(LSTM(50,dropout=0.2, recurrent_dropout=0.2))

In [122]:
# add a Dense layer, and apply the softmax activation on their outputs. 
model2_merged.add(Dense(6, activation='softmax'))

In [123]:
model2_merged.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [124]:
model2_merged.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_9 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 306       
Total params: 28,048,806
Trainable params: 70,506
Non-trainable params: 27,978,300
_________________________________________________________________


In [125]:
# fit model
model2_merged.fit(x = encode_and_pad(trec_merged['question'].values),
           y = one_hot_labels(trec_merged['label'].values),
           epochs = 150
          )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x263a5c9d0>

In [126]:
model2_merged.save("../models/trec_lstm2.h5")

##### Model 3:

In [127]:
model3_merged = Sequential()

In [128]:
# Add the word embedding layer
model3_merged.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = 10,
                     trainable = False))

In [129]:
# Add an LSTM layer
model3_merged.add(LSTM(100,dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

In [130]:
model3_merged.add(LSTM(6, dropout=0.2, recurrent_dropout=0.2, activation='softmax'))

In [131]:
model3_merged.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [132]:
model3_merged.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 10, 300)           27978300  
_________________________________________________________________
lstm_10 (LSTM)               (None, 10, 100)           160400    
_________________________________________________________________
lstm_11 (LSTM)               (None, 6)                 2568      
Total params: 28,141,268
Trainable params: 162,968
Non-trainable params: 27,978,300
_________________________________________________________________


In [133]:
# fit model
model3_merged.fit(x = encode_and_pad(trec_merged['question'].values),
           y = one_hot_labels(trec_merged['label'].values),
           epochs = 125
          )

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<keras.callbacks.History at 0x264a57210>

In [134]:
# Save model
model3_merged.save("../models/trec_lstm3.h5")