In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

Using Theano backend.
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, CuDNN 4007)


In [2]:
import re
# A token is a run of two or more word characters between word boundaries.
token_pattern = r"(?u)\b\w\w+\b"


def build_tokenizer():
    """Create a tokenizer: a callable turning a string into its list of tokens.

    ``token_pattern`` is compiled once per call to this factory and the
    compiled expression is reused by the callable that is returned.
    """
    compiled = re.compile(token_pattern)

    def tokenize_doc(doc):
        return compiled.findall(doc)

    return tokenize_doc

In [3]:
def readData(src):
    """Read a tab-separated sentence-pair file.

    Every line of ``src`` is split on tab characters. Only lines that yield
    exactly two fields are kept; all other lines are skipped.

    Parameters
    ----------
    src : str
        Path of the file to read.

    Returns
    -------
    b1 : list of str
        First field of every kept line.
    b2 : list of str
        Second field of every kept line, without its trailing newline.
    lines : int
        1-based position in the file of the last kept line, or 0 when no
        line was kept.
    """
    b1 = []
    b2 = []
    lines = 0  # defined up front so an empty file does not raise UnboundLocalError
    with open(src) as p:
        for i, line in enumerate(p):
            s = line.split('\t')
            if len(s) == 2:
                b1.append(s[0])
                # Strip only an actual newline: a final line without one must
                # not lose its last character.
                b2.append(s[1].rstrip('\n'))
                lines = i + 1
    return b1, b2, lines

In [4]:
def readGs(src):
    """Read a score file holding one numeric value per line.

    Each line is parsed with ``float`` and rounded to zero decimal places.

    Parameters
    ----------
    src : str
        Path of the file to read.

    Returns
    -------
    b : list of float
        Rounded score of every line, in file order.
    lines : int
        Number of lines read (0 for an empty file).

    Raises
    ------
    ValueError
        If a line cannot be parsed as a float (for example a blank line).
    """
    with open(src) as p:
        b = [round(float(line), 0) for line in p]
    # Every line contributes exactly one score, so the line count is len(b);
    # this also covers the empty-file case without an unbound local.
    return b, len(b)

Read 2012 training data

In [11]:
# Sentence-pair input files of the STS 2012 training set.
msr = './dataset/STS2012-train/STS.input.MSRpar.txt'
msrvid = './dataset/STS2012-train/STS.input.MSRvid.txt'
smt = './dataset/STS2012-train/STS.input.SMTeuroparl.txt'

# Read each corpus in turn and show the line count reported by readData.
b1_12_1, b2_12_1, l_12_1 = readData(msr)
print(l_12_1)
b1_12_2, b2_12_2, l_12_2 = readData(msrvid)
print(l_12_2)
b1_12_3, b2_12_3, l_12_3 = readData(smt)
print(l_12_3)

# Concatenate the corpora; first and second sentences stay aligned by index.
b1_12_train = b1_12_1 + b1_12_2 + b1_12_3
b2_12_train = b2_12_1 + b2_12_2 + b2_12_3
lines_12 = sum([l_12_1, l_12_2, l_12_3])
print(lines_12)

750
750
734
2234


In [12]:
# Score files matching the three 2012 training corpora.
msr_gs = './dataset/STS2012-train/STS.gs.MSRpar.txt'
msr_gs_vid = './dataset/STS2012-train/STS.gs.MSRvid.txt'
smt_gs = './dataset/STS2012-train/STS.gs.SMTeuroparl.txt'

# Scores are concatenated in the same corpus order as the sentence lists.
b_12_train = readGs(msr_gs)[0] + readGs(msr_gs_vid)[0] + readGs(smt_gs)[0]

# Check that there is exactly one score per sentence pair.
print(len(b_12_train) == len(b1_12_train) == len(b2_12_train))

True


Read 2012 test data

In [13]:
# Sentence-pair input files of the STS 2012 test set.
msr_test = './dataset/STS2012-test/STS.input.MSRpar.txt'
vid_test = './dataset/STS2012-test/STS.input.MSRvid.txt'
smt_test = './dataset/STS2012-test/STS.input.SMTeuroparl.txt'
surprise_test = './dataset/STS2012-test/STS.input.surprise.OnWN.txt'
surprise2_test = './dataset/STS2012-test/STS.input.surprise.SMTnews.txt'

# Read each corpus in turn and show the line count reported by readData.
b1_12_1t, b2_12_1t, l_12_1t = readData(msr_test)
print(l_12_1t)
b1_12_2t, b2_12_2t, l_12_2t = readData(vid_test)
print(l_12_2t)
b1_12_3t, b2_12_3t, l_12_3t = readData(smt_test)
print(l_12_3t)
b1_12_4t, b2_12_4t, l_12_4t = readData(surprise_test)
print(l_12_4t)
b1_12_5t, b2_12_5t, l_12_5t = readData(surprise2_test)
print(l_12_5t)

# Concatenate the corpora; first and second sentences stay aligned by index.
b1_12_test = b1_12_1t + b1_12_2t + b1_12_3t + b1_12_4t + b1_12_5t
b2_12_test = b2_12_1t + b2_12_2t + b2_12_3t + b2_12_4t + b2_12_5t
lines = sum([l_12_1t, l_12_2t, l_12_3t, l_12_4t, l_12_5t])
print(lines)

750
750
459
750
399
3108


In [14]:
# Score files matching the five 2012 test corpora.
msr_test_gs = './dataset/STS2012-test/STS.gs.MSRpar.txt'
vid_test_gs = './dataset/STS2012-test/STS.gs.MSRvid.txt'
smt_test_gs = './dataset/STS2012-test/STS.gs.SMTeuroparl.txt'
surprise_test_gs = './dataset/STS2012-test/STS.gs.surprise.OnWN.txt'
surprise2_test_gs = './dataset/STS2012-test/STS.gs.surprise.SMTnews.txt'

# Scores are concatenated in the same corpus order as the sentence lists.
b_12_test = (
    readGs(msr_test_gs)[0]
    + readGs(vid_test_gs)[0]
    + readGs(smt_test_gs)[0]
    + readGs(surprise_test_gs)[0]
    + readGs(surprise2_test_gs)[0]
)

# Check that there is exactly one score per sentence pair.
print(len(b_12_test) == len(b1_12_test) == len(b2_12_test))

True


Read 2014 test data

In [15]:
# Sentence-pair input files of the STS 2014 test set.
t14_f = './dataset/STS2014-test/STS.input.deft-forum.txt'
t14_n = './dataset/STS2014-test/STS.input.deft-news.txt'
t14_h = './dataset/STS2014-test/STS.input.headlines.txt'
t14_i = './dataset/STS2014-test/STS.input.images.txt'
t14_o = './dataset/STS2014-test/STS.input.OnWN.txt'
t14_t = './dataset/STS2014-test/STS.input.tweet-news.txt'

# Read each corpus in turn and show the line count reported by readData.
b1_14_1t, b2_14_1t, l_14_1t = readData(t14_f)
print(l_14_1t)
b1_14_2t, b2_14_2t, l_14_2t = readData(t14_n)
print(l_14_2t)
b1_14_3t, b2_14_3t, l_14_3t = readData(t14_h)
print(l_14_3t)
b1_14_4t, b2_14_4t, l_14_4t = readData(t14_i)
print(l_14_4t)
b1_14_5t, b2_14_5t, l_14_5t = readData(t14_o)
print(l_14_5t)
b1_14_6t, b2_14_6t, l_14_6t = readData(t14_t)
print(l_14_6t)

# Concatenate the corpora; first and second sentences stay aligned by index.
b1_14_test = b1_14_1t + b1_14_2t + b1_14_3t + b1_14_4t + b1_14_5t + b1_14_6t
b2_14_test = b2_14_1t + b2_14_2t + b2_14_3t + b2_14_4t + b2_14_5t + b2_14_6t
lines = sum([l_14_1t, l_14_2t, l_14_3t, l_14_4t, l_14_5t, l_14_6t])
print(lines)

450
300
750
750
750
750
3750


In [16]:
# Score files matching the six 2014 test corpora.
t14_f_gs = './dataset/STS2014-test/STS.gs.deft-forum.txt'
t14_n_gs = './dataset/STS2014-test/STS.gs.deft-news.txt'
t14_h_gs = './dataset/STS2014-test/STS.gs.headlines.txt'
t14_i_gs = './dataset/STS2014-test/STS.gs.images.txt'
t14_o_gs = './dataset/STS2014-test/STS.gs.OnWN.txt'
t14_t_gs = './dataset/STS2014-test/STS.gs.tweet-news.txt'

# Scores are concatenated in the same corpus order as the sentence lists.
b_14_test = (
    readGs(t14_f_gs)[0]
    + readGs(t14_n_gs)[0]
    + readGs(t14_h_gs)[0]
    + readGs(t14_i_gs)[0]
    + readGs(t14_o_gs)[0]
    + readGs(t14_t_gs)[0]
)

# Check that there is exactly one score per sentence pair.
print(len(b_14_test) == len(b1_14_test) == len(b2_14_test))

True


Combine the 2012 train, 2012 test and 2014 test sets into the training set

In [17]:
# Training pool = 2012 train + 2012 test + 2014 test, always in that order so
# labels and both sentence lists stay aligned by index.
y_train = b_12_train + b_12_test + b_14_test
b1 = b1_12_train + b1_12_test + b1_14_test
b2 = b2_12_train + b2_12_test + b2_14_test

# Check that there is exactly one label per sentence pair.
print(len(b1) == len(b2) == len(y_train))

True


Read 2015 train data and 2013 test data as validation set

In [18]:
# Validation inputs: five STS 2015 train files followed by three 2013 test files.
val_f = './dataset/STS2015-train/STS.input.answers-forum.txt'
val_s = './dataset/STS2015-train/STS.input.answers-students.txt'
val_b = './dataset/STS2015-train/STS.input.belief.txt'
val_h = './dataset/STS2015-train/STS.input.headlines.txt'
val_i = './dataset/STS2015-train/STS.input.images.txt'
val_fn = './dataset/STS2013-test/STS.input.FNWN.txt'
val_he = './dataset/STS2013-test/STS.input.headlines.txt'
val_on = './dataset/STS2013-test/STS.input.OnWN.txt'

# 2015 train corpora.
v1_15_1, v2_15_1, l_15_1 = readData(val_f)
v1_15_2, v2_15_2, l_15_2 = readData(val_s)
v1_15_3, v2_15_3, l_15_3 = readData(val_b)
v1_15_4, v2_15_4, l_15_4 = readData(val_h)
v1_15_5, v2_15_5, l_15_5 = readData(val_i)
# 2013 test corpora.
v1_13_1, v2_13_1, l_13_1 = readData(val_fn)
v1_13_2, v2_13_2, l_13_2 = readData(val_he)
v1_13_3, v2_13_3, l_13_3 = readData(val_on)

# Concatenate everything into the validation sentence lists.
v1 = v1_15_1 + v1_15_2 + v1_15_3 + v1_15_4 + v1_15_5 + v1_13_1 + v1_13_2 + v1_13_3
v2 = v2_15_1 + v2_15_2 + v2_15_3 + v2_15_4 + v2_15_5 + v2_13_1 + v2_13_2 + v2_13_3
lines = sum([l_15_1, l_15_2, l_15_3, l_15_4, l_15_5, l_13_1, l_13_2, l_13_3])
print(lines)

1570


In [19]:
# Score files matching the validation corpora (2015 train, then 2013 test).
val_gs_f = './dataset/STS2015-train/STS.gs.answers-forum.txt'
val_gs_s = './dataset/STS2015-train/STS.gs.answers-students.txt'
val_gs_b = './dataset/STS2015-train/STS.gs.belief.txt'
val_gs_h = './dataset/STS2015-train/STS.gs.headlines.txt'
val_gs_i = './dataset/STS2015-train/STS.gs.images.txt'
val_gs_fn = './dataset/STS2013-test/STS.gs.FNWN.txt'
val_gs_he = './dataset/STS2013-test/STS.gs.headlines.txt'
val_gs_on = './dataset/STS2013-test/STS.gs.OnWN.txt'

# Scores are concatenated in the same corpus order as the sentence lists.
y_val = (
    readGs(val_gs_f)[0]
    + readGs(val_gs_s)[0]
    + readGs(val_gs_b)[0]
    + readGs(val_gs_h)[0]
    + readGs(val_gs_i)[0]
    + readGs(val_gs_fn)[0]
    + readGs(val_gs_he)[0]
    + readGs(val_gs_on)[0]
)
print(len(y_val))

1570


Read 2015 test data

In [20]:
# Sentence-pair input files of the STS 2015 test set.
test_f = './dataset/STS2015-test/STS.input.answers-forums.txt'
test_s = './dataset/STS2015-test/STS.input.answers-students.txt'
test_b = './dataset/STS2015-test/STS.input.belief.txt'
test_h = './dataset/STS2015-test/STS.input.headlines.txt'
test_i = './dataset/STS2015-test/STS.input.images.txt'

# Each test corpus is kept separate (no concatenation).
b1_test_f, b2_test_f, lines_f = readData(test_f)
b1_test_s, b2_test_s, lines_s = readData(test_s)
b1_test_b, b2_test_b, lines_b = readData(test_b)
b1_test_h, b2_test_h, lines_h = readData(test_h)
b1_test_i, b2_test_i, lines_i = readData(test_i)

# Line count reported by readData for each corpus.
print(lines_f)
print(lines_s)
print(lines_b)
print(lines_h)
print(lines_i)

2000
1500
2000
1500
1500


In [21]:
# Fit the vocabulary on every training sentence (both sides of each pair).
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(b1 + b2)

# Feature names of the fitted vectorizer, used as the word list downstream.
vocab = vectorizer.get_feature_names()
print(len(vocab))

14758


In [22]:
tokenize = build_tokenizer()

# Look words up in the vectorizer's vocabulary dict (constant-time membership)
# instead of scanning the `vocab` list once per token; both hold the same words.
word_index = vectorizer.vocabulary_

# Each sentence becomes the list of ids of its in-vocabulary tokens. Ids are
# shifted by +1 so that index 0 never denotes a real word.
# NOTE(review): tokens are looked up with their original casing. If the
# vectorizer lower-cases its vocabulary (CountVectorizer's default), words
# containing capitals are dropped here - confirm whether that is intended.
X_train1 = [[word_index[word] + 1 for word in tokenize(seq) if word in word_index]
            for seq in b1]
X_train2 = [[word_index[word] + 1 for word in tokenize(seq) if word in word_index]
            for seq in b2]



In [23]:
# Number of encoded first / second training sentences.
print(len(X_train1))
print(len(X_train2))

9092
9092


In [24]:
def _encode_sentences(sentences):
    """Encode sentences as lists of word ids.

    Each sentence is tokenized with the notebook-level ``tokenize`` callable;
    tokens absent from the fitted vectorizer's vocabulary are skipped, and
    every kept token is mapped to its vocabulary index + 1.

    NOTE(review): tokens are looked up with their original casing - confirm
    that this matches how the vectorizer built its vocabulary.
    """
    # The vocabulary dict gives constant-time membership tests, unlike the
    # `vocab` list which is scanned linearly for every token.
    word_index = vectorizer.vocabulary_
    return [[word_index[word] + 1 for word in tokenize(seq) if word in word_index]
            for seq in sentences]


# answers-forums
X_test1_f = _encode_sentences(b1_test_f)
X_test2_f = _encode_sentences(b2_test_f)
# answers-students
X_test1_s = _encode_sentences(b1_test_s)
X_test2_s = _encode_sentences(b2_test_s)
# belief
X_test1_b = _encode_sentences(b1_test_b)
X_test2_b = _encode_sentences(b2_test_b)
# headlines
X_test1_h = _encode_sentences(b1_test_h)
X_test2_h = _encode_sentences(b2_test_h)
# images
X_test1_i = _encode_sentences(b1_test_i)
X_test2_i = _encode_sentences(b2_test_i)




In [25]:
# One size per test corpus (alternating first / second sentence lists).
print(len(X_test1_f))
print(len(X_test2_s))
print(len(X_test1_b))
print(len(X_test2_h))
print(len(X_test1_i))

2000
1500
2000
1500
1500


In [26]:
# Look words up in the vectorizer's vocabulary dict (constant-time membership)
# instead of scanning the `vocab` list once per token; both hold the same words.
word_index = vectorizer.vocabulary_

# Each validation sentence becomes the list of (vocabulary index + 1) of its
# in-vocabulary tokens; unknown tokens are skipped.
X_val1 = [[word_index[word] + 1 for word in tokenize(seq) if word in word_index]
          for seq in v1]
X_val2 = [[word_index[word] + 1 for word in tokenize(seq) if word in word_index]
          for seq in v2]
print(len(X_val1))
print(len(X_val2))

1570
1570




In [27]:
# Fixed sequence length fed to the network.
MAX_LEN = 25

# Bring both sides of every training pair to exactly MAX_LEN ids.
X_train1, X_train2 = (
    sequence.pad_sequences(X_train1, maxlen=MAX_LEN),
    sequence.pad_sequences(X_train2, maxlen=MAX_LEN),
)

print('X_train shape:', X_train1.shape)


('X_train shape:', (9092, 25))


In [28]:
# Bring both sides of every validation pair to exactly MAX_LEN ids.
X_val1, X_val2 = (
    sequence.pad_sequences(X_val1, maxlen=MAX_LEN),
    sequence.pad_sequences(X_val2, maxlen=MAX_LEN),
)

print('X_val shape:', X_val1.shape)

('X_val shape:', (1570, 25))


In [29]:
# Only the answers-forums test pairs are padded in this cell.
X_test1_f, X_test2_f = (
    sequence.pad_sequences(X_test1_f, maxlen=MAX_LEN),
    sequence.pad_sequences(X_test2_f, maxlen=MAX_LEN),
)
print('X_test1_f shape:', X_test1_f.shape)
print('X_test2_f shape:', X_test2_f.shape)

('X_test1_f shape:', (2000, 25))
('X_test2_f shape:', (2000, 25))


In [30]:
# Width of the one-hot label matrices; it has to equal the number of units of
# the model's softmax output layer (Dense(6)).
# NOTE(review): assumes every rounded score lies in 0..5 - confirm.
NB_CLASSES = 6

# The class count is passed explicitly: when it is left out it is inferred
# from each array separately, so y_train and y_val could end up with different
# widths if the highest score is missing from one of them.
y_train, y_val = [np_utils.to_categorical(x, NB_CLASSES) for x in (y_train, y_val)]

Use pre-trained word2vec embeddings for the LSTM

In [31]:
from gensim.models.word2vec import Word2Vec
# Load word vectors stored in binary word2vec format.
# NOTE(review): absolute, machine-specific path - adjust before re-running.
wv = Word2Vec.load_word2vec_format(
    "/home/tong/Documents/python/GoogleNews-vectors-negative300.bin.gz",
    binary=True
)
print("done" + " loading")

done loading


In [32]:
vocab_dim = 300 # dimensionality of your word vectors
n_symbols = len(vocab) + 1 # adding 1 to account for 0th index (for masking)

# Rows start as uniform random values; a seeded generator is used so the
# matrix (and hence the run) is reproducible after a kernel restart.
embedding_rng = np.random.RandomState(0)
embedding_weights = embedding_rng.rand(n_symbols, vocab_dim)

# Overwrite the row of every vocabulary word that has a pre-trained vector.
# Row index = vocabulary index + 1, matching the ids used for the sequences.
word_index = vectorizer.vocabulary_
for word in vocab:
    if word in wv:
        embedding_weights[word_index[word] + 1, :] = wv[word]


In [33]:
# Spot check: the row stored for "woman" must equal its word2vec vector.
woman_row = vectorizer.vocabulary_["woman"] + 1
print(woman_row)
print(np.array_equal(wv['woman'], embedding_weights[woman_row]))

14570
True


In [39]:
from keras.layers import Merge
def _build_encoder():
    """Return a new sentence encoder: Embedding -> LSTM -> Dropout.

    The embedding layer is initialised from ``embedding_weights``. Every call
    creates fresh layer instances, so two encoders do not share layers.
    """
    encoder = Sequential()
    encoder.add(Embedding(n_symbols, vocab_dim, input_length=MAX_LEN,
                          weights=[embedding_weights]))
    encoder.add(LSTM(vocab_dim, dropout_W=0.6, dropout_U=0.3))  # try using a GRU instead, for fun
    encoder.add(Dropout(0.6))
    return encoder


print('Build model...')
# One encoder per side of the sentence pair.
encoder_a = _build_encoder()
encoder_b = _build_encoder()

# Concatenate both sentence encodings and classify into 6 classes.
decoder = Sequential()
decoder.add(Merge([encoder_a, encoder_b], mode='concat'))
decoder.add(Dense(6, activation='softmax'))
decoder.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print('Build complete')

Build model...
Build complete


In [39]:
from keras.layers import Merge
# Experimental variant of the model: each encoder tries to concatenate a
# forward LSTM with a backward (go_backwards=True) LSTM after the embedding.
# NOTE(review): the recorded output of this cell is
#   Exception: Layer is not connected. Did you forget to set "input_shape"?
# presumably raised when Merge receives LSTM layers that are not attached to
# any model - TODO confirm.
# NOTE(review): encoder_a is rebound below before any Merge is created, so a
# failed run may leave encoder_a replaced while encoder_b and decoder keep
# their previous values - verify before relying on later cells.
print('Build model...')
encoder_a = Sequential()
encoder_a.add(Embedding(n_symbols, vocab_dim, input_length=MAX_LEN, weights=[embedding_weights]))
# Two LSTMs meant to read the sequence in opposite directions.
forwards_a = LSTM(vocab_dim, dropout_W=0.5, dropout_U=0.1)
backwards_a = LSTM(vocab_dim, dropout_W=0.5, dropout_U=0.1, go_backwards=True)
# Concatenate both directions along the last axis.
merged_a = Merge([forwards_a, backwards_a], mode='concat', concat_axis=-1)
encoder_a.add(merged_a)
encoder_a.add(Dropout(0.5))

# Same construction for the second sentence of the pair.
encoder_b = Sequential()
encoder_b.add(Embedding(n_symbols, vocab_dim, input_length=MAX_LEN, weights=[embedding_weights]))
forwards_b = LSTM(vocab_dim, dropout_W=0.5, dropout_U=0.1)
backwards_b = LSTM(vocab_dim, dropout_W=0.5, dropout_U=0.1, go_backwards=True)
encoder_b.add(Merge([forwards_b, backwards_b], mode='concat', concat_axis=-1))
encoder_b.add(Dropout(0.5))

# Concatenate both sentence encodings and classify into 6 classes.
decoder = Sequential()
decoder.add(Merge([encoder_a, encoder_b], mode='concat'))
decoder.add(Dense(6, activation='softmax'))
decoder.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print('Build complete')

Build model...


Exception: Layer is not connected. Did you forget to set "input_shape"?

In [35]:
# Number of validation labels.
print(len(y_val))

1570


In [40]:
batch_size = 50

print('Train...')
# Fixed 100-epoch run on the training pairs, with the validation pairs passed
# as validation_data. No callbacks (such as early stopping) are used.
hist = decoder.fit(
    [X_train1, X_train2],
    y_train,
    batch_size=batch_size,
    nb_epoch=100,
    show_accuracy=True,
    validation_data=([X_val1, X_val2], y_val)
)

Train...
Train on 9092 samples, validate on 1570 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epo

In [38]:
# Fit the same model on the validation pairs, holding out 10% of them for
# validation and shuffling the samples.
# NOTE(review): this trains on the data that the training cell passes as
# validation_data, and relies on batch_size being defined already - confirm
# the intended cell order.
decoder.fit(
    [X_val1, X_val2],
    y_val,
    batch_size=batch_size,
    nb_epoch=100,
    show_accuracy=True,
    validation_split=0.1,
    shuffle=True
)

Train on 1413 samples, validate on 157 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


<keras.callbacks.History at 0x7fceea5ffd50>

Predict scores for the 2015 test data

In [114]:
# Evaluate on the answers-forums test pairs.
# NOTE(review): y_test is not assigned in any cell shown in this notebook, and
# the recorded output of this cell is a sample-count mismatch exception -
# the labels for this test set still need to be loaded and aligned.
score, acc = decoder.evaluate(
    [X_test1_f, X_test2_f],
    y_test,
    batch_size=batch_size,
    show_accuracy=True
)
print('Test score:', score)
print('Test accuracy:', acc)

Exception: All input arrays and the target array must have the same number of samples.

In [194]:
def writeRes(dst, res):
    """Write the items of ``res`` to the file ``dst``, one per line.

    Each item is converted with ``str``; items are separated by a newline and
    no newline is written after the last one. An existing file is overwritten.
    """
    with open(dst, 'w') as out:
        text = "\n".join(map(str, res))
        out.write(text)

In [227]:
# Predicted class for every answers-forums test pair.
res = decoder.predict_classes([X_test1_f, X_test2_f])

# Save one prediction per line using numpy's default text formatting.
np.savetxt('./dataset/STS2015-test/sys.forum', res, newline='\n')

