In [259]:
# import pandas as pd
# import numpy as np

from keras import backend as K

# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.text import one_hot
# from keras.preprocessing.sequence import pad_sequences

# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers import Embedding
# from keras.layers import LSTM
# from keras.layers import Activation

# from keras.utils import to_categorical

# from gensim.models import KeyedVectors


from sklearn.preprocessing import LabelEncoder

## Question Classification

In this notebook I build a series of deep learning models that are used to classify questions based on their answer type and their detailed answer type. 

These models will initially be evaluated using a developement/test set, but then the entire dataset will be used for training, and the models will be evaluated by the utility of their predictions for the main downstream task - deduplicating question intent. 

Thank you Dr. Jason Brownlee for [this great post](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/) on how to use pretrained word embeddings!

### Contents

#### 0. Load `TREC` dataset



In [2]:
trec_train = pd.read_csv("../data/TREC/processed/train.csv")
trec_test = pd.read_csv("../data/TREC/processed/test.csv")

In [9]:
trec_train.head(10)

Unnamed: 0.1,Unnamed: 0,question,label,extended_label
0,1,How did serfdom develop in and then leave Russ...,DESC,DESC:manner
1,2,What films featured the character Popeye Doyle ?,ENTY,ENTY:cremat
2,3,How can I find a list of celebrities ' real na...,DESC,DESC:manner
3,4,What fowl grabs the spotlight after the Chines...,ENTY,ENTY:animal
4,5,What is the full form of . com ?,ABBR,ABBR:exp
5,6,What contemptible scoundrel stole the cork fro...,HUM,HUM:ind
6,7,What team did baseball 's St . Louis Browns be...,HUM,HUM:gr
7,8,What is the oldest profession ?,HUM,HUM:title
8,9,What are liver enzymes ?,DESC,DESC:def
9,10,Name the scar - faced bounty hunter of The Old...,HUM,HUM:ind


In [228]:
to_categorical(LabelEncoder().fit_transform(trec_test['label'].values))


array([[ 0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.]])

#### 0.1 Shuffle TREC dataset

In [120]:
trec_train = trec_train.sample(frac=1.0, random_state = 550)
trec_test = trec_test.sample(frac = 1.0, random_state = 550)

---

### 1. Prepare document encoding 

Using Kera's `Tokenizer` class, create a dictionary of all the types in both the quora and the TREC datasets. Encode each document as a vector of indecies of the corresponding types in the dictionary. 

#### 1.1  `Quora` dataset

In [3]:
train_quora = pd.read_csv("../data/processed/train.csv")

In [11]:
train_quora.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,1,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,2,1,3,4,What is the story of Kohinoor Koh - i - Noor D...,What would happen if the Indian government sto...,0
2,3,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,4,3,7,8,Why am I mentally very lonely ? How can I solv...,Find the remainder when math 23 24 math is div...,0
4,5,4,9,10,"Which one dissolve in water quickly sugar , sa...",Which fish would survive in salt water ?,0


#### 1.2 Get all the questions, from both datasets, in Numpy array


In [40]:
all_questions = trec_train['question'].append(trec_test['question']).append(
    train_quora['question1']).append(train_quora['question2']).values


In [42]:
all_questions[0]

'How did serfdom develop in and then leave Russia ? '

In [38]:
# Prepare tokenizer 
tokenizer = Tokenizer()

In [43]:
# Create a dictionary for all the quesions in the joint datasets
tokenizer.fit_on_texts(all_questions)

In [44]:
# The number of types in the joint datset
vocab_size = len(tokenizer.word_index) + 1
vocab_size

93122

In [98]:
# integer encode the questions
encoded_questions = tokenizer.texts_to_sequences(all_questions)

In [99]:
# an example of a question integer embedding
print(all_questions[0])
encoded_questions[0]

How did serfdom develop in and then leave Russia ? 


[5, 49, 54929, 732, 8, 12, 254, 763, 656]

In [100]:
# Another example. Notice how the word `how` is consistently endoded as the integer 5. 
print(all_questions[2])
encoded_questions[2]

How can I find a list of celebrities ' real names ? 


[5, 13, 4, 84, 6, 446, 10, 2450, 2001, 191, 956]

---

### 2. Prepare the embedding matrix

In [58]:
# load the pretrained fasttext embeddings (this takes a while)
embedding_model = KeyedVectors.load_word2vec_format('../data/embeddings/wiki.en.vec')

In [101]:
# Each row in the matrix is the embedding of one word in the joint datasets. 
# The row index corresponds to the integer ecoding of that word. 
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

---

### 3. An LSTM experiment parameter class

I will likely exeperiment with many different models, with many different hyperparameters. It will be useful to keep all those parameters contained in one place

In [229]:
class LstmParams(object):
    
    def __init__(self,
                 sequence_length = 10, 
                 labels = "basic"):
        
        # Number of tokens to include in each sequence
        self.sequence_length = sequence_length
        # whether to use basic or extended labels
        self.labels = labels
        
        # Keep track of the raw training and test questions
        self.train_questions = trec_train['question'].values
        self.test_questions = trec_test['question'].values
        
        # Encode the training and test quetions
        self.train_questions_encoded = tokenizer.texts_to_sequences(self.train_questions)
        self.test_questions_encoded = tokenizer.texts_to_sequences(self.test_questions)
        
        
    '''
    Return the labels as a numpy array of strings 
    Which labels to return depends on the objects 'labels' parameter. 
    '''
    def get_train_labels(self):
        if self.labels == "basic":
            return(trec_train['label'].values)
        elif self.labels == "extended":
            return(trec_train["extended_label"].values)
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(trec_train['label'].values)
        
    def get_test_labels(self):
        if self.labels == "basic":
            return(trec_test['label'].values)
        elif self.labels == "extended":
            return(trec_test["extended_label"].values)
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(trec_test['label'].values)
        

    '''
    Return the labels of the as a numpy ndarray, using one-hot encoding. 
    This is for transparency in my Neural Network archetecture. 
    '''
    def get_train_labels_onehot(self):
        if self.labels == "basic":
            return(to_categorical(LabelEncoder().fit_transform(trec_train['label'].values)))
        elif self.labels == "extended":
            return(to_categorical(LabelEncoder().fit_transform(trec_train['extended_label'].values)))
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(to_categorical(LabelEncoder().fit_transform(trec_train['label'].values)))
        
    def get_test_labels_onehot(self):
        if self.labels == "basic":
            return(to_categorical(LabelEncoder().fit_transform(trec_test['label'].values)))
        elif self.labels == "extended":
            return(to_categorical(LabelEncoder().fit_transform(trec_test['extended_label'].values)))
        else:
            print("Invalid `labels` parameter '%s'. Returning basic labels.") % (self.labels)
            return(to_categorical(LabelEncoder().fit_transform(trec_test['label'].values)))
    '''
    Return the encoded questions after padding. 
    Padding (or truncating) amount depends on attribute `self.sequence_length`
    '''
    def get_train_padded(self):
        padded = pad_sequences(self.train_questions_encoded, 
                              maxlen = self.sequence_length,
                              padding = "post", 
                              truncating = "post")
        return(padded)
    
    def get_test_padded(self):
        padded = pad_sequences(self.test_questions_encoded, 
                              maxlen = self.sequence_length,
                              padding = "post", 
                              truncating = "post")
        return(padded)
    
    '''
    Get the number of classes (output layer dimension). 
    This is the number of unique classes. 
    '''
    def get_num_classes(self):
        n_unique = len(np.unique(self.get_test_labels().tolist() + self.get_train_labels().tolist()))
        return(n_unique)

--- 

### 4. A first model!

First, I'll try the most basic vanilla LSTM for the TREC classification problem (simple labels)


#### 4.0 Initialize Model parameters

In [268]:
params1 = LstmParams(sequence_length = 10,
                    labels = "basic")

In [269]:
model1 = Sequential()

#### 4.1 Add layers

In [270]:
# Add the word embedding layer
model1.add(Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     input_length = params1.sequence_length,
                     trainable = False))

In [271]:
# Add an LSTM layer
model1.add(LSTM(params1.get_num_classes(), activation="softmax"))

In [272]:
model1.compile(loss = 'categorical_crossentropy',
               optimizer='adam',
               metrics = ['accuracy'])

In [273]:
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 10, 300)           27936600  
_________________________________________________________________
lstm_6 (LSTM)                (None, 6)                 7368      
Total params: 27,943,968
Trainable params: 7,368
Non-trainable params: 27,936,600
_________________________________________________________________


#### 4.2 Train!

In [274]:
# fit model
model1.fit(x = params1.get_train_padded(),
           y = params1.get_train_labels_onehot(),
           epochs = 50
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2f8544ad0>

In [281]:
print "Test Set Accuracy: {0}%".format(model1.evaluate(params1.get_test_padded(),
                params1.get_test_labels_onehot(),
                verbose=0)[1]*100)

Test Set Accuracy: 73.0%
