In [8]:
#!curl https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz --output numberbatch-en-17.06.txt.gz

In [5]:
#!gunzip numberbatch-en-17.06.txt.gz

In [7]:
from gensim.models import KeyedVectors

numberbatch = KeyedVectors.load_word2vec_format("numberbatch-en-17.06.txt", binary=False)

In [63]:
numberbatch.most_similar(['functor'])

[('contravariant_functor', 0.9674091935157776),
 ('forgetful_functor', 0.9666687250137329),
 ('yoneda_embedding', 0.9497337937355042),
 ('endofunctor', 0.9360368847846985),
 ('representable_functor', 0.9314213991165161),
 ('cofunctor', 0.9296932220458984),
 ('natural_transformation', 0.9164144992828369),
 ('yoneda_lemma', 0.9022417664527893),
 ('coaugmentation', 0.8871707320213318),
 ('category_theory', 0.8751257658004761)]

In [51]:
from collections import namedtuple

Phrase = namedtuple('Phrase', 'original candidate label')
Token = namedtuple('Token', 'text tags')

def split_tokens(sent):
    tokens = []
    for token in sent.split():
        tags = token.split('/')
        tokens.append(Token(tags[0].lower(), tuple(tags[1:])))
    return tokens


def readData(filename, eval_label, ignoreNone):
    data = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) == 7:
                (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = fields
            else:
                continue
            label = eval_label(judge)
            if ((label is None) and ignoreNone):
                continue
            data.append(Phrase(split_tokens(origsenttag), split_tokens(candsenttag), label))
    
    return data
                
def eval_amt_label(label):
    nYes = eval(label)[0]            
    
    if nYes >= 3:
        return True
    elif nYes <= 1:
        return False
    
    return None

def eval_expert_label(label):
    nYes = int(label[0])
    
    if nYes >= 4:
        return True
    elif nYes <= 2:
        return False
    
    return None


def readTrainData(filename):
    return readData(filename, eval_amt_label, True)

def readTestData(filename):
    return readData(filename, eval_expert_label, False)

In [123]:
train_data = readTrainData("SemEval-PIT2015-py3/data/train.data")
dev_data = readTrainData("SemEval-PIT2015-py3/data/dev.data")
test_data = [p for p in readTestData("SemEval-PIT2015-py3/data/test.data") if p.label is not None]

In [53]:
train_data[:3]

[Phrase(original=[Token(text='ej', tags=('B-person', 'NNP', 'B-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'B-VP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='1st', tags=('O', 'CD', 'I-NP', 'O')), Token(text='qb', tags=('O', 'NNP', 'I-NP', 'O')), Token(text='to', tags=('O', 'TO', 'B-VP', 'O')), Token(text='go', tags=('O', 'VB', 'I-VP', 'B-EVENT')), Token(text='in', tags=('O', 'IN', 'B-PP', 'I-EVENT')), Token(text='this', tags=('O', 'DT', 'B-NP', 'O')), Token(text='draft', tags=('O', 'NN', 'I-NP', 'O'))], candidate=[Token(text='but', tags=('O', 'CC', 'O', 'O')), Token(text='my', tags=('O', 'PRP$', 'B-NP', 'O')), Token(text='bro', tags=('O', 'NN', 'I-NP', 'O')), Token(text='from', tags=('O', 'IN', 'B-PP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='757', tags=('O', 'CD', 'I-NP', 'O')), Token(text='ej', tags=('B-person', 'NNP', 'I-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'I-NP', 'O')), Token(text='is', tags=(

In [54]:
test_data[:3]

[Phrase(original=[Token(text='all', tags=('O', 'DT', 'B-NP', 'O')), Token(text='the', tags=('O', 'DT', 'I-NP', 'O')), Token(text='home', tags=('O', 'NN', 'I-NP', 'O')), Token(text='alones', tags=('O', 'VBZ', 'B-VP', 'O')), Token(text='watching', tags=('O', 'VBG', 'I-VP', 'B-EVENT')), Token(text='8', tags=('O', 'CD', 'B-NP', 'O')), Token(text='mile', tags=('O', 'NN', 'I-NP', 'O'))], candidate=[Token(text='8', tags=('O', 'NN', 'B-NP', 'O')), Token(text='mile', tags=('O', 'NN', 'I-NP', 'O')), Token(text='is', tags=('O', 'VBZ', 'B-VP', 'O')), Token(text='on', tags=('O', 'IN', 'B-PP', 'O')), Token(text='thats', tags=('O', 'NNS', 'B-NP', 'O')), Token(text='my', tags=('O', 'PRP$', 'B-NP', 'O')), Token(text='movie', tags=('O', 'NN', 'I-NP', 'B-EVENT'))], label=None),
 Phrase(original=[Token(text='all', tags=('O', 'DT', 'B-NP', 'O')), Token(text='the', tags=('O', 'DT', 'I-NP', 'O')), Token(text='home', tags=('O', 'NN', 'I-NP', 'O')), Token(text='alones', tags=('O', 'VBZ', 'B-VP', 'O')), Token(t

In [59]:
#from nltk.corpus import stopwords

#stopwords.words('english')

In [60]:
from gensim.corpora import Dictionary

def phrase_tokens(phrase):
    return [token.text for token in (phrase.original + phrase.candidate)]
    

vocab = Dictionary([phrase_tokens(p) for p in train_data + dev_data + test_data])

len(vocab)

10841

In [77]:
VOCAB_SIZE = len(vocab) + 1 # +1 for padding

print(VOCAB_SIZE)

10842


In [65]:
id2token = dict([(i, token)for token, i in vocab.token2id.items()])

In [66]:
def text_to_sequence(words):
    return [i + 1 for i in vocab.doc2idx(words)]

def sequence_to_text(seq):
    return [id2token[i - 1] for i in seq if i]

In [68]:
def data_to_sequences(data):
    
    encoder_seqs = []
    decoder_seqs = []
    labels = []
    
    for phrase in data:
        encoder_seqs.append(text_to_sequence([t.text for t in phrase.original]))
        decoder_seqs.append(text_to_sequence([t.text for t in phrase.candidate]))
        labels.append(phrase.label)
        
    return encoder_seqs, decoder_seqs, labels 

train_encoder_seqs, train_decoder_seqs, train_labels = data_to_sequences(train_data)

print(len(train_encoder_seqs))
print(len(train_decoder_seqs))
print(len(train_labels))

11530
11530
11530


In [70]:
print(sequence_to_text(train_encoder_seqs[0]))
print(sequence_to_text(train_decoder_seqs[0]))
print(train_labels[0])

['ej', 'manuel', 'the', '1st', 'qb', 'to', 'go', 'in', 'this', 'draft']
['but', 'my', 'bro', 'from', 'the', '757', 'ej', 'manuel', 'is', 'the', '1st', 'qb', 'gone']
True


In [71]:
dev_encoder_seqs, dev_decoder_seqs, dev_labels = data_to_sequences(dev_data)

In [124]:
test_encoder_seqs, test_decoder_seqs, test_labels = data_to_sequences(test_data)

In [75]:
MAX_SEQ_LEN = max([len(seq) for seq in (train_encoder_seqs + train_decoder_seqs + \
                                       dev_encoder_seqs + dev_decoder_seqs + \
                                       test_decoder_seqs + test_encoder_seqs)])

print(MAX_SEQ_LEN)

18


In [93]:
from keras.preprocessing.sequence import pad_sequences

def padding(sequences):
    return pad_sequences(sequences, maxlen=MAX_SEQ_LEN, dtype='int32', padding='post', truncating='post')

In [79]:
EMBEDDING_SIZE = 300

In [141]:
import numpy as np

EMBEDDING_MATRIX = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
  
missed = []
for word, i in vocab.token2id.items():
    try:
        EMBEDDING_MATRIX[i] = numberbatch[word]
    except KeyError:
        missed.append(word)

print(len(missed))

2446


In [142]:
missed

['757',
 'fsu',
 'qbflorida',
 '2013',
 'noles',
 'cbaire1',
 '17th',
 'nfldraft',
 'wasnt',
 '16',
 'buffalobills',
 'didnt',
 'yoooooo',
 '50',
 '2009',
 'asf',
 'randymoss',
 '59th',
 'mrsh',
 '59',
 'kiper',
 'mcshay',
 'wrmarshall',
 'thunderingherd',
 'patriotsnation',
 'weswelker',
 'onehanded',
 '110m',
 'isnt',
 '110mill',
 '110',
 '40',
 '5yr110',
 '5years',
 '40m',
 '40million',
 'highestpaid',
 '666k',
 'gezwxm87',
 '2325',
 '2325million',
 '22',
 '5year',
 '110million',
 'baaaad',
 'maaan',
 'stephenasmith',
 'bigmoney',
 'espnnfcnblog',
 'shiting',
 'butler182',
 'abbeyview',
 'chicagobears',
 'swearinger',
 'transportredoing',
 'idealware',
 'winemakers',
 'alidoee',
 'bmthofficial',
 'aaaaaaaaaaaaaahahaha',
 'raaaiiiiders',
 'nickiminaj',
 'audiance',
 'audley',
 'liveview',
 'line2',
 'functionalities',
 'exok',
 'n9',
 'lutzenkirchen',
 'americanidol',
 '17',
 '14',
 '9inning',
 '16th',
 'comerica',
 '10',
 'kkkkkkkkkkkkkkkkk',
 '100',
 '9inn',
 'alltime',
 'careerhig

In [151]:
from keras.layers import Embedding

embedding_layer = Embedding(input_dim = VOCAB_SIZE, 
                            output_dim = EMBEDDING_SIZE,
                            input_length = MAX_SEQ_LEN,
                            weights = [EMBEDDING_MATRIX], trainable = False)

In [152]:
from keras.layers import Input, LSTM, Embedding, TimeDistributed, Dense
from keras.models import Model
from keras.optimizers import RMSprop

HIDDEN_DIM = 300

encoder_inputs = Input(shape=(MAX_SEQ_LEN, ), dtype='int32',)
encoder_embedding = embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
    
decoder_inputs = Input(shape=(MAX_SEQ_LEN, ), dtype='int32',)
decoder_embedding = embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])
    
outputs = Dense(1, activation='sigmoid')(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

model.summary()
model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['acc'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 18)           0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 18)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 18, 300)      3252600     input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 300), (None, 721200      embedding_3[0][0]                
__________

In [158]:
model.fit([padding(train_encoder_seqs), padding(train_decoder_seqs)], np.array(train_labels),
          batch_size = 100, epochs = 5, validation_split = 0.1)

Train on 10377 samples, validate on 1153 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x159e56e10>

In [159]:
labels_predicted = [prob > 0.5 for prob in model.predict([padding(dev_encoder_seqs), padding(dev_decoder_seqs)])]

In [160]:
labels_predicted[0]

array([False])

In [161]:
dev_labels[0]

False

In [162]:
from sklearn.metrics import classification_report

print(classification_report(dev_labels, labels_predicted))

              precision    recall  f1-score   support

       False       0.66      0.95      0.78      2672
        True       0.55      0.10      0.18      1470

   micro avg       0.65      0.65      0.65      4142
   macro avg       0.61      0.53      0.48      4142
weighted avg       0.62      0.65      0.57      4142



In [244]:
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Activation
from keras.layers import Embedding, Input
from keras.layers import LSTM, Lambda, concatenate
from keras import regularizers

HIDDEN_DIM=100

def exponent_neg_manhattan_distance(x, hidden_size=HIDDEN_DIM):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    return K.exp(-K.sum(K.abs(x[:,:hidden_size] - x[:,hidden_size:]), axis=1, keepdims=True))

def exponent_neg_cosine_distance(x, hidden_size=HIDDEN_DIM):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    leftNorm = K.l2_normalize(x[:,:hidden_size], axis=-1)
    rightNorm = K.l2_normalize(x[:,hidden_size:], axis=-1)
    return K.exp(K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True))

In [257]:
seq_1 = Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='sequence1')
seq_2 = Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='sequence2')

input_1 = embedding_layer(seq_1)
input_2 = embedding_layer(seq_2)

l1 = LSTM(units=HIDDEN_DIM)

l1_out = l1(input_1)
l2_out = l1(input_2)

concats = concatenate([l1_out, l2_out], axis=-1)

#main_output = Lambda(exponent_neg_cosine_distance, output_shape=(1,))(concats)
#main_output = Lambda(exponent_neg_manhattan_distance, output_shape=(1,))(concats)
dense_ouput = Dense(1024, activation="relu")(concats)
main_output = Dense(1, activation="sigmoid")(dense_ouput)

model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

opt = keras.optimizers.Adadelta(lr=1, clipnorm=1.25)

#model.compile(optimizer=RMSprop(lr=1e-4), loss='mean_squared_error', metrics=['accuracy'])
model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
#model.compile(optimizer=opt,loss='mean_squared_error', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence1 (InputLayer)          (None, 18)           0                                            
__________________________________________________________________________________________________
sequence2 (InputLayer)          (None, 18)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 18, 300)      3252600     sequence1[0][0]                  
                                                                 sequence2[0][0]                  
__________________________________________________________________________________________________
lstm_35 (LSTM)                  (None, 100)          160400      embedding_3[42][0]               
          

In [261]:
model.fit([padding(train_encoder_seqs), padding(train_decoder_seqs)], np.array(train_labels),
          batch_size = 100, epochs = 5, validation_split = 0.1)

Train on 10377 samples, validate on 1153 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1589a80b8>

In [262]:
predicted_similarity = model.predict([padding(test_encoder_seqs), padding(test_decoder_seqs)])

In [263]:
print(classification_report(test_labels, [prob > 0.5 for prob in predicted_similarity]))

              precision    recall  f1-score   support

       False       0.80      0.96      0.87       663
        True       0.34      0.07      0.12       175

   micro avg       0.78      0.78      0.78       838
   macro avg       0.57      0.52      0.50       838
weighted avg       0.70      0.78      0.72       838

