## Question Classification Dataset

http://cogcomp.cs.illinois.edu/Data/QA/QC/


In [29]:
%%file load_qc.py
## Question Classification Dataset
## http://cogcomp.cs.illinois.edu/Data/QA/QC/

import numpy as np

def split_question(question):
    '''
    Split one raw corpus line into its label and its question tokens.

    The line is stripped of surrounding whitespace and split on single
    spaces: the first field is the label, the remaining fields are the tokens.

    Outputs:
     - (label, tokens) where tokens is a (possibly empty) list of strings
    '''
    fields = question.strip().split(" ")
    return (fields[0], fields[1:])

def load_question_file(filename):
    '''
    Read a labelled question file with one "<label> <token> <token> ..."
    entry per line.

    Blank lines are skipped: they carry no label and would otherwise yield
    an empty label string.

    Outputs:
     - (Y, X): list of label strings and the parallel list of token lists
    '''
    Y = list()
    X = list()
    # The context manager closes the handle even if parsing raises.
    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue
            (y, x) = split_question(line)
            Y.append(y)
            X.append(x)
    return (Y, X)

def build_dict(sentences):
    '''
    Build dictionary of train words
    Inputs:
     - sentences: iterable of tokenized sentences (each an iterable of words)
    Outputs:
     - Dictionary of word --> word index. The most frequent word gets
       index 2; words with equal counts are ordered alphabetically so the
       indices are reproducible. Indices 0 and 1 are left unassigned
       (1 is the UNK index; 0 is presumably kept for padding).
     - Dictionary of word --> word count freq
    '''
    import sys

    # Written without a newline so the summary printed at the end lands on
    # the same output line, and so this works on both Python 2 and 3.
    sys.stdout.write('Building dictionary.. ')

    wordcount = dict()
    # For each word in each sentence, accumulate its frequency
    for ss in sentences:
        for w in ss:
            wordcount[w] = wordcount.get(w, 0) + 1

    # Most frequent first; ties broken by the word itself (deterministic).
    ranked = sorted(wordcount, key=lambda w: (-wordcount[w], w))

    worddict = dict()
    for idx, w in enumerate(ranked):
        worddict[w] = idx + 2  # leave 0 and 1 (UNK)

    print('%d  total words  %d  unique words'
          % (sum(wordcount.values()), len(wordcount)))

    return worddict, wordcount

def generate_sequence(sentences, dictionary):
    '''
    Encode each tokenized sentence as a list of integer word indices.

    A word found in `dictionary` is replaced by its index; any other word
    is encoded as 1.
    '''
    encoded = []
    for tokens in sentences:
        encoded.append([dictionary.get(tok, 1) for tok in tokens])
    return encoded

def parse_label(label):
    '''
    Split a label on ":" and return its first two fields as a tuple.
    '''
    pieces = label.split(":")
    first = pieces[0]
    second = pieces[1]
    return (first, second)

def load_corpus(path):
    '''
    Load the train and test splits of the question classification corpus.

    Inputs:
     - path: directory holding "train_5500.label" and "TREC_10.label";
       a trailing path separator is optional.
    Outputs:
     - (Y_train_full, X_train_sentences), (Y_test_full, X_test_sentences),
       each pair being what load_question_file returns for that file.
    '''
    import os.path

    # Same file names as plain concatenation when `path` ends with a
    # separator, but also correct when it does not.
    train_file = os.path.join(path, "train_5500.label")
    test_file = os.path.join(path, "TREC_10.label")

    (Y_train_full, X_train_sentences) = load_question_file(train_file)
    (Y_test_full, X_test_sentences) = load_question_file(test_file)
    return (Y_train_full, X_train_sentences), (Y_test_full, X_test_sentences)

def load_data(path):
    '''
    Load the corpus and convert it to integer sequences and class ids.

    The word dictionary is built from the training sentences only; both
    splits are then encoded with it. Only the part of each label before
    the ":" is kept as the class.

    Inputs:
     - path: location of the corpus files, passed on to load_corpus
    Outputs:
     - (Y_train, X_train), (Y_test, X_test): lists of integer class ids
       and the parallel lists of word-index sequences
    '''
    (Y_train_full, X_train_sentences), (Y_test_full, X_test_sentences) = load_corpus(path)
    worddict, wordcount = build_dict(X_train_sentences)

    X_train = generate_sequence(X_train_sentences, worddict)
    X_test = generate_sequence(X_test_sentences, worddict)

    Y_train_label = [parse_label(y)[0] for y in Y_train_full]
    Y_test_label = [parse_label(y)[0] for y in Y_test_full]

    # Sort the labels before numbering them: set iteration order is
    # arbitrary, which would make the label -> id mapping differ between
    # runs and interpreters.
    labels = sorted(set(Y_train_label + Y_test_label))
    catdict = {label: idx for (idx, label) in enumerate(labels)}

    Y_train = [catdict[y] for y in Y_train_label]
    Y_test = [catdict[y] for y in Y_test_label]

    return (Y_train, X_train), (Y_test, X_test)


Overwriting load_qc.py


In [26]:
# The %%file cell above only writes load_qc.py to disk; it does not define
# load_data in the kernel, so import it explicitly for a fresh-kernel run.
from load_qc import load_data

# Directory containing the corpus files (trailing slash required by load_corpus).
path = "/home/ec2-user/data/qc/"
(Y_train,X_train), (Y_test,X_test) = load_data(path)

Building dictionary.. 55635  total words  9448  unique words


In [34]:
# Number of distinct class ids across both splits (Y_* are still plain lists here).
nb_classes = len(set(Y_train + Y_test))
print('Num. classes:', nb_classes)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Longest question, in tokens, for each split.
maxlen_train = max(len(x) for x in X_train)
maxlen_test  = max(len(x) for x in X_test)
print('Max length train', maxlen_train)
print('Max length test', maxlen_test)

('Num. classes:', 6)
(5452, 'train sequences')
(500, 'test sequences')
('Max length train', 37)
('Max length tet', 17)


In [None]:
# Imported here because this cell runs before the models cell that also imports it.
from keras.preprocessing import sequence

# Fixed sequence length fed to the networks. It must match the `maxlen`
# the models further down are built with (input_length=maxlen).
maxlen = 30

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [None]:
from keras.utils import np_utils

print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
# Encode both splits with the same number of classes, train first, then test.
Y_train, Y_test = (np_utils.to_categorical(class_ids, nb_classes)
                   for class_ids in (Y_train, Y_test))
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)

## Load models

In [61]:
# %load models.py

from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D
from keras.layers import LSTM, GRU, SimpleRNN

def build_lstm(max_features, embedding_dims, nb_classes):
    '''
    Build an Embedding -> LSTM -> softmax classifier (not compiled).

    Inputs:
     - max_features: vocabulary size given to the embedding layer
     - embedding_dims: size of the word embedding vectors
     - nb_classes: number of output classes
    Outputs:
     - the Sequential model
    '''
    model = Sequential()

    # Use the caller-supplied embedding size instead of a hard-coded 128.
    model.add(Embedding(max_features, embedding_dims, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model


def build_cnn(embedding_dims, maxlen, nb_filter, filter_length, hidden_dims, nb_classes, vocab_size=None):
    '''
    Build an Embedding -> Conv1D -> max-pool -> Dense -> softmax classifier
    (not compiled).

    Inputs:
     - embedding_dims: size of the word embedding vectors
     - maxlen: length of the (padded) input sequences
     - nb_filter: number of convolution filters
     - filter_length: width of each convolution filter
     - hidden_dims: units in the dense hidden layer
     - nb_classes: number of output classes
     - vocab_size: vocabulary size for the embedding layer; when omitted,
       the module-level `max_features` is used, as before
    Outputs:
     - the Sequential model
    '''
    if vocab_size is None:
        # Backward compatibility: earlier callers relied on the global.
        vocab_size = max_features

    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(vocab_size,
                        embedding_dims,
                        input_length=maxlen,
                        dropout=0.2))

    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    # we use max pooling over the whole convolution output, so each filter
    # contributes a single value:
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))

    # We flatten the output of the conv layer,
    # so that we can add a vanilla dense layer:
    model.add(Flatten())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto nb_classes output units and normalise them with a softmax:
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model


def build_cnn_lstm(embedding_size, maxlen, nb_filter, filter_length, pool_length, lstm_output_size, nb_classes, vocab_size=None):
    '''
    Build an Embedding -> Conv1D -> max-pool -> LSTM -> softmax classifier
    (not compiled).

    Inputs:
     - embedding_size: size of the word embedding vectors
     - maxlen: length of the (padded) input sequences
     - nb_filter: number of convolution filters
     - filter_length: width of each convolution filter
     - pool_length: max-pooling window applied to the convolution output
     - lstm_output_size: output size of the LSTM layer
     - nb_classes: number of output classes
     - vocab_size: vocabulary size for the embedding layer; when omitted,
       the module-level `max_features` is used, as before
    Outputs:
     - the Sequential model
    '''
    if vocab_size is None:
        # Backward compatibility: earlier callers relied on the global.
        vocab_size = max_features

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
    model.add(Dropout(0.25))
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    # Pooling shortens the sequence before it is fed to the LSTM.
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    return model

## LSTM 

In [37]:
max_features = 10000  # vocabulary size handed to the embedding layer
maxlen = 30  # cut texts after this number of words (among top max_features most common words)
batch_size = 32  # minibatch size used by fit/evaluate below

In [39]:
# Build the LSTM classifier (second argument is the embedding size) and list its layers.
model = build_lstm(max_features, 128, nb_classes)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, None, 128)     1280000     embedding_input_4[0][0]          
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 128)           131584      embedding_4[0][0]                
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 6)             774         lstm_4[0][0]                     
____________________________________________________________________________________________________
activation_3 (Activation)        (None, 6)             0           dense_3[0][0]                    
Total params: 1,412,358
Trainable params: 1,412,358
Non-trainable params: 0
_______________

In [42]:
# Multi-class setup: categorical cross-entropy loss (labels are one-hot),
# Adam optimizer, accuracy reported as the metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [43]:
# Train the LSTM model for 15 epochs, then evaluate it.
# NOTE(review): the test split is passed as validation_data and is also
# used for the final evaluate() call, so both report on the same data.
print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 5452 samples, validate on 500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
('Test score:', 0.55825018548965455)
('Test accuracy:', 0.85199999999999998)


## CNN

In [48]:
max_features = 10000 # vocabulary size (read as a global by build_cnn / build_cnn_lstm)
maxlen =  30         # max document length (input_length of the embedding layer)
batch_size = 32      # minibatch size
nb_epoch = 2         # NOTE(review): not used - the fit() calls below hard-code nb_epoch=15

#### Parameters

In [49]:
# set parameters:
embedding_dims = 128 # size of embedding dims
nb_filter = 65      # number of filters 
filter_length = 5    # 1d convolution size
hidden_dims = 250    # size of hidden layers 

In [53]:
# Build the CNN classifier; build_cnn also reads the global max_features.
cnn_model = build_cnn(embedding_dims, maxlen, nb_filter, filter_length, hidden_dims, nb_classes)

In [54]:
# List the CNN layers with their output shapes and parameter counts.
cnn_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 30, 128)       1280000     embedding_input_6[0][0]          
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 26, 65)        41665       embedding_6[0][0]                
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 1, 65)         0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
flatten_2 (Flatten)              (None, 65)            0           maxpooling1d_2[0][0]             
___________________________________________________________________________________________

In [55]:
# Same training setup as the LSTM model: categorical cross-entropy, Adam, accuracy.
cnn_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [56]:
# Train the CNN model, then evaluate it.
# NOTE(review): nb_epoch is hard-coded to 15 here; the nb_epoch variable set
# in the CNN configuration cell is not read. The test split is used both as
# validation_data and for the final evaluate() call.
print('Train...')
cnn_model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
score, acc = cnn_model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 5452 samples, validate on 500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
('Test accuracy:', 0.87199999952316287)


## CNN - LSTM

In [62]:
pool_length = 4  # max-pooling window applied after the convolution
lstm_output_size = 70  # output size of the LSTM layer

# Reuses embedding_dims, maxlen, nb_filter and filter_length from the CNN section;
# build_cnn_lstm also reads the global max_features.
cnn_lstm_model = build_cnn_lstm(embedding_dims, maxlen, nb_filter, filter_length, pool_length, lstm_output_size, nb_classes)

In [63]:
# List the CNN-LSTM layers with their output shapes and parameter counts.
cnn_lstm_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 30, 128)       1280000     embedding_input_8[0][0]          
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 30, 128)       0           embedding_8[0][0]                
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 26, 65)        41665       dropout_4[0][0]                  
____________________________________________________________________________________________________
maxpooling1d_4 (MaxPooling1D)    (None, 6, 65)         0           convolution1d_4[0][0]            
___________________________________________________________________________________________

In [64]:
# Same training setup as the other models: categorical cross-entropy, Adam, accuracy.
cnn_lstm_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [66]:
# Train the CNN-LSTM model for 15 epochs, then evaluate it.
# NOTE(review): the test split is used both as validation_data and for the
# final evaluate() call.
print('Train...')
cnn_lstm_model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
score, acc = cnn_lstm_model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 5452 samples, validate on 500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
('Test accuracy:', 0.85599999999999998)
