## Question Classification Dataset

http://cogcomp.cs.illinois.edu/Data/QA/QC/


In [2]:
def split_question(question):
    q = question.strip().split(" ")
    return (q[0],q[1:])

split_question("LOC:other What is the longest suspension bridge in the U.S. ?\n")

def load_question_file(filename):
    f = open(filename)
    X = list()
    Y = list()
    for line in f:
        (y,x) = split_question(line)
        Y.append(y)
        X.append(x)
    return (Y,X)
    

In [14]:
path = "/home/ec2-user/data/qc/"
(Y_train_full,X_train_sentences) = load_question_file(path + "train_5500.label")
(Y_test_full,X_test_sentences) = load_question_file(path + "TREC_10.label")

In [9]:
import numpy as np

def build_dict(sentences):
#    from collections import OrderedDict

    '''
    Build dictionary of train words
    Outputs: 
     - Dictionary of word --> word index
     - Dictionary of word --> word count freq
    '''
    print 'Building dictionary..',
    wordcount = dict()
    #For each worn in each sentence, cummulate frequency
    for ss in sentences:
        for w in ss:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1

    counts = wordcount.values() # List of frequencies
    keys = wordcount.keys() #List of words
    
    sorted_idx = reversed(np.argsort(counts))
    
    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
    print np.sum(counts), ' total words ', len(keys), ' unique words'

    return worddict, wordcount




In [15]:
worddict, wordcount = build_dict(X_train_sentences)

print(worddict['the'], wordcount['the'])


Building dictionary.. 55635  total words  9448  unique words
(3, 3611)


In [13]:
def generate_sequence(sentences, dictionary):
    '''
    Convert tokenized text in sequences of integers
    '''
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in ss]

    return seqs

In [20]:
X_train = generate_sequence(X_train_sentences, worddict)
X_test  = generate_sequence(X_test_sentences, worddict)

In [22]:
def parse_label(label):
    t = label.split(":")
    return (t[0],t[1])

parse_label("ENT:Person")

('ENT', 'Person')

In [94]:
Y_train = [parse_label(y)[0]  for y in Y_train_full]
Y_test  = [parse_label(y)[0]  for y in Y_test_full]

In [42]:
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

(5452, 'train sequences')
(500, 'test sequences')


In [55]:
maxlen_train = max([len(x) for x in X_train])
maxlen_test  = max([len(x) for x in X_test])

print(maxlen_train)
print(maxlen_test)

37
17


In [61]:
max_features = 10000
maxlen = 30  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [59]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [62]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
('X_train shape:', (5452, 30))
('X_test shape:', (500, 30))


In [95]:
catdict, catcount = build_dict([Y_train])

Building dictionary.. 5452  total words  6  unique words


In [84]:
from keras.utils import np_utils

nb_classes = 5



print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)

Convert class vector to binary class matrix (for use with categorical_crossentropy)


ValueError: invalid literal for long() with base 10: 'DESC'

In [76]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(128))
model.add(Activation('softmax'))

Build model...


In [77]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_7 (Embedding)          (None, None, 128)     1280000     embedding_input_7[0][0]          
____________________________________________________________________________________________________
lstm_7 (LSTM)                    (None, 128)           131584      embedding_7[0][0]                
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 128)           16512       lstm_7[0][0]                     
____________________________________________________________________________________________________
activation_7 (Activation)        (None, 128)           0           dense_6[0][0]                    
Total params: 1,428,096
Trainable params: 1,428,096
Non-trainable params: 0
_______________

In [78]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [79]:
print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...


ValueError: Error when checking model target: expected activation_7 to have shape (None, 128) but got array with shape (5452, 1)