In [1]:
!curl https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz --output numberbatch-en-17.06.txt.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  257M  100  257M    0     0  1955k      0  0:02:14  0:02:14 --:--:-- 2770k6k


In [5]:
#!gunzip numberbatch-en-17.06.txt.gz

In [2]:
from gensim.models import KeyedVectors

numberbatch = KeyedVectors.load_word2vec_format("numberbatch-en-17.06.txt.gz", binary=False)

In [4]:
numberbatch.most_similar(['functor'])

[('contravariant_functor', 0.9674091935157776),
 ('forgetful_functor', 0.9666687250137329),
 ('yoneda_embedding', 0.9497337937355042),
 ('endofunctor', 0.9360368847846985),
 ('representable_functor', 0.9314213991165161),
 ('cofunctor', 0.9296932220458984),
 ('natural_transformation', 0.9164144992828369),
 ('yoneda_lemma', 0.9022417664527893),
 ('coaugmentation', 0.8871707320213318),
 ('category_theory', 0.8751257658004761)]

In [5]:
from collections import namedtuple

Phrase = namedtuple('Phrase', 'original candidate label')
Token = namedtuple('Token', 'text tags')

def split_tokens(sent):
    tokens = []
    for token in sent.split():
        tags = token.split('/')
        tokens.append(Token(tags[0].lower(), tuple(tags[1:])))
    return tokens


def readData(filename, eval_label, ignoreNone):
    data = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) == 7:
                (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = fields
            else:
                continue
            label = eval_label(judge)
            if ((label is None) and ignoreNone):
                continue
            data.append(Phrase(split_tokens(origsenttag), split_tokens(candsenttag), label))
    
    return data
                
def eval_amt_label(label):
    nYes = eval(label)[0]            
    
    if nYes >= 3:
        return True
    elif nYes <= 1:
        return False
    
    return None

def eval_expert_label(label):
    nYes = int(label[0])
    
    if nYes >= 4:
        return True
    elif nYes <= 2:
        return False
    
    return None


def readTrainData(filename):
    return readData(filename, eval_amt_label, True)

def readTestData(filename):
    return readData(filename, eval_expert_label, False)

In [6]:
train_data = readTrainData("SemEval-PIT2015-py3/data/train.data")
dev_data = readTrainData("SemEval-PIT2015-py3/data/dev.data")
test_data = [p for p in readTestData("SemEval-PIT2015-py3/data/test.data") if p.label is not None]

In [228]:
len(test_data)

838

In [7]:
train_data[:3]

[Phrase(original=[Token(text='ej', tags=('B-person', 'NNP', 'B-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'B-VP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='1st', tags=('O', 'CD', 'I-NP', 'O')), Token(text='qb', tags=('O', 'NNP', 'I-NP', 'O')), Token(text='to', tags=('O', 'TO', 'B-VP', 'O')), Token(text='go', tags=('O', 'VB', 'I-VP', 'B-EVENT')), Token(text='in', tags=('O', 'IN', 'B-PP', 'I-EVENT')), Token(text='this', tags=('O', 'DT', 'B-NP', 'O')), Token(text='draft', tags=('O', 'NN', 'I-NP', 'O'))], candidate=[Token(text='but', tags=('O', 'CC', 'O', 'O')), Token(text='my', tags=('O', 'PRP$', 'B-NP', 'O')), Token(text='bro', tags=('O', 'NN', 'I-NP', 'O')), Token(text='from', tags=('O', 'IN', 'B-PP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='757', tags=('O', 'CD', 'I-NP', 'O')), Token(text='ej', tags=('B-person', 'NNP', 'I-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'I-NP', 'O')), Token(text='is', tags=(

In [8]:
test_data[:3]

[Phrase(original=[Token(text='all', tags=('O', 'DT', 'B-NP', 'O')), Token(text='the', tags=('O', 'DT', 'I-NP', 'O')), Token(text='home', tags=('O', 'NN', 'I-NP', 'O')), Token(text='alones', tags=('O', 'VBZ', 'B-VP', 'O')), Token(text='watching', tags=('O', 'VBG', 'I-VP', 'B-EVENT')), Token(text='8', tags=('O', 'CD', 'B-NP', 'O')), Token(text='mile', tags=('O', 'NN', 'I-NP', 'O'))], candidate=[Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='last', tags=('O', 'JJ', 'I-NP', 'O')), Token(text='rap', tags=('O', 'NN', 'I-NP', 'B-EVENT')), Token(text='battle', tags=('O', 'NN', 'I-NP', 'B-EVENT')), Token(text='in', tags=('O', 'IN', 'B-PP', 'O')), Token(text='8', tags=('O', 'CD', 'B-NP', 'O')), Token(text='mile', tags=('O', 'NNP', 'I-NP', 'O')), Token(text='nevr', tags=('O', 'NN', 'I-NP', 'O')), Token(text='gets', tags=('O', 'VBZ', 'B-VP', 'O')), Token(text='old', tags=('O', 'JJ', 'B-NP', 'O')), Token(text='ahah', tags=('O', 'JJ', 'I-NP', 'O'))], label=False),
 Phrase(original=[To

In [253]:
from nltk.corpus import stopwords
from tqdm import tqdm_notebook

def clean_sent(sent):
    new_sent = []
    for token in sent:
        if token.tags[0].startswith('B-'):
            new_sent.append(token.tags[0].split('-')[1])
            continue
        if token.tags[0].startswith('I-'): #or token.text in stopwords.words('english'):
            continue
        if token.tags[1] == 'CD':
            new_sent.append('number')
            continue
        new_sent.append(token.text)
                            
    return new_sent

def clean_data(data):
    return [Phrase(clean_sent(phrase.original), clean_sent(phrase.candidate), phrase.label) \
            for phrase in tqdm_notebook(data)]

In [266]:
clean_train_data = clean_data(train_data)

HBox(children=(IntProgress(value=0, max=11530), HTML(value='')))

In [267]:
clean_dev_data = clean_data(dev_data)

HBox(children=(IntProgress(value=0, max=4142), HTML(value='')))

In [268]:
clean_test_data = clean_data(test_data)

HBox(children=(IntProgress(value=0, max=838), HTML(value='')))

In [269]:
clean_train_data[:3]

[Phrase(original=['person', 'the', 'number', 'qb', 'to', 'go', 'in', 'this', 'draft'], candidate=['but', 'my', 'bro', 'from', 'the', 'number', 'person', 'is', 'the', 'number', 'qb', 'gone'], label=True),
 Phrase(original=['person', 'the', 'number', 'qb', 'to', 'go', 'in', 'this', 'draft'], candidate=['can', 'believe', 'person', 'went', 'as', 'the', 'number', 'qb', 'in', 'the', 'draft'], label=True),
 Phrase(original=['person', 'the', 'number', 'qb', 'to', 'go', 'in', 'this', 'draft'], candidate=['person', 'is', 'the', 'number', 'qb', 'what'], label=True)]

In [270]:
from gensim.corpora import Dictionary

def phrase_tokens(phrase):
    return [token for token in (phrase.original + phrase.candidate)]
    

vocab = Dictionary([phrase_tokens(p) for p in clean_train_data + clean_dev_data + clean_test_data])

len(vocab)

8975

In [271]:
VOCAB_SIZE = len(vocab) + 1 # +1 for padding

print(VOCAB_SIZE)

8976


In [272]:
id2token = dict([(i, token)for token, i in vocab.token2id.items()])

In [273]:
def text_to_sequence(words):
    return [i + 1 for i in vocab.doc2idx(words)]

def sequence_to_text(seq):
    return [id2token[i - 1] for i in seq if i]

In [274]:
def data_to_sequences(data):
    
    encoder_seqs = []
    decoder_seqs = []
    labels = []
    
    for phrase in data:
        encoder_seqs.append(text_to_sequence([t for t in phrase.original]))
        decoder_seqs.append(text_to_sequence([t for t in phrase.candidate]))
        labels.append(phrase.label)
        
    return encoder_seqs, decoder_seqs, labels 

train_encoder_seqs, train_decoder_seqs, train_labels = data_to_sequences(clean_train_data)

print(len(train_encoder_seqs))
print(len(train_decoder_seqs))
print(len(train_labels))

11530
11530
11530


In [275]:
print(sequence_to_text(train_encoder_seqs[0]))
print(sequence_to_text(train_decoder_seqs[0]))
print(train_labels[0])

['person', 'the', 'number', 'qb', 'to', 'go', 'in', 'this', 'draft']
['but', 'my', 'bro', 'from', 'the', 'number', 'person', 'is', 'the', 'number', 'qb', 'gone']
True


In [276]:
dev_encoder_seqs, dev_decoder_seqs, dev_labels = data_to_sequences(clean_dev_data)

In [277]:
test_encoder_seqs, test_decoder_seqs, test_labels = data_to_sequences(clean_test_data)

In [278]:
MAX_SEQ_LEN = max([len(seq) for seq in (train_encoder_seqs + train_decoder_seqs + \
                                       dev_encoder_seqs + dev_decoder_seqs + \
                                       test_decoder_seqs + test_encoder_seqs)])

print(MAX_SEQ_LEN)

18


In [279]:
from keras.preprocessing.sequence import pad_sequences

def padding(sequences):
    return pad_sequences(sequences, maxlen=MAX_SEQ_LEN, dtype='int32', padding='post', truncating='post')

In [280]:
EMBEDDING_SIZE = 300

In [281]:
import numpy as np

EMBEDDING_MATRIX = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
  
missed = set([])
for word, i in vocab.token2id.items():
    try:
        EMBEDDING_MATRIX[i] = numberbatch[word]
    except KeyError:
        missed.add(word)

print(len(missed))

1710


In [282]:
missed

{'supposely',
 'itsstephsoricex',
 'ltrauzzi',
 'iker',
 'fbis',
 'courside',
 'bmthofficial',
 'rulegoverence',
 '81st',
 'spursvslakers',
 '40000',
 'latttwoone',
 'prayforkrista',
 'yammed',
 'daterape',
 'at127',
 'squeeeee',
 '90s',
 'hardinghe',
 '59th',
 'fukk',
 'airrielle',
 'subtchicago',
 '9inning',
 'snappin',
 'revolutionised',
 'collegeboard',
 'nlc',
 'grooooooown',
 'nialler',
 'outchea',
 '52nd',
 'ggmu',
 '27footers',
 'homewreaker',
 'sdchargers',
 'fga',
 'worldie',
 'tylerwinkeljohn',
 'uo',
 'davidarchie',
 'boofed',
 'lightskinnedmixed',
 'hackaasik',
 'calumhooding',
 'athome',
 'hahah',
 'solomonswisdom',
 'dagauvins',
 'teamvedo',
 'maloofs',
 'beatt',
 'kunalmerchant',
 'bulllshit',
 'lightskin',
 'nerly',
 'meshugeneh',
 'outtt',
 'coentrao',
 'frankcaliendo',
 'miggitymiggitymiggitymiggity',
 'dealine',
 '12',
 'kuntry',
 'appli',
 '35th',
 'smmfh',
 'ciroc',
 'babycake',
 'couldshould',
 'assel',
 'loooool',
 'gezwxm87',
 'zzzzzzzzzzzzzzzz',
 'pfdc',
 'que

In [283]:
len(missed)/len(vocab)

0.1905292479108635

In [284]:
train_data_missed = [(p, t.text) for p in train_data for t in p.original + p.candidate if t.text in missed]

In [285]:
len(train_data_missed)/len(train_data)

0.7278404163052905

In [156]:
train_data_missed[:10]

[(Phrase(original=[Token(text='ej', tags=('B-person', 'NNP', 'B-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'B-VP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='1st', tags=('O', 'CD', 'I-NP', 'O')), Token(text='qb', tags=('O', 'NNP', 'I-NP', 'O')), Token(text='to', tags=('O', 'TO', 'B-VP', 'O')), Token(text='go', tags=('O', 'VB', 'I-VP', 'B-EVENT')), Token(text='in', tags=('O', 'IN', 'B-PP', 'I-EVENT')), Token(text='this', tags=('O', 'DT', 'B-NP', 'O')), Token(text='draft', tags=('O', 'NN', 'I-NP', 'O'))], candidate=[Token(text='but', tags=('O', 'CC', 'O', 'O')), Token(text='my', tags=('O', 'PRP$', 'B-NP', 'O')), Token(text='bro', tags=('O', 'NN', 'I-NP', 'O')), Token(text='from', tags=('O', 'IN', 'B-PP', 'O')), Token(text='the', tags=('O', 'DT', 'B-NP', 'O')), Token(text='757', tags=('O', 'CD', 'I-NP', 'O')), Token(text='ej', tags=('B-person', 'NNP', 'I-NP', 'O')), Token(text='manuel', tags=('I-person', 'NNP', 'I-NP', 'O')), Token(text='is', tags=

In [286]:
from keras.layers import Embedding

embedding_layer = Embedding(input_dim = VOCAB_SIZE, 
                            output_dim = EMBEDDING_SIZE,
                            input_length = MAX_SEQ_LEN,
                            weights = [EMBEDDING_MATRIX], trainable = False)

In [287]:
from keras.layers import Input, LSTM, Embedding, TimeDistributed, Dense, Dropout
from keras.models import Model
from keras.optimizers import RMSprop

HIDDEN_DIM = 300

encoder_inputs = Input(shape=(MAX_SEQ_LEN, ), dtype='int32',)
encoder_embedding = embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
    
decoder_inputs = Input(shape=(MAX_SEQ_LEN, ), dtype='int32',)
decoder_embedding = embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])
    
outputs = Dense(1, activation='sigmoid')(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

model.summary()
model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['acc'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 18)           0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 18)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 18, 300)      2692800     input_12[0][0]                   
                                                                 input_13[0][0]                   
__________________________________________________________________________________________________
lstm_19 (LSTM)                  [(None, 300), (None, 721200      embedding_3[0][0]                
__________

In [291]:
model.fit([padding(train_encoder_seqs), padding(train_decoder_seqs)], np.array(train_labels),
          batch_size = 100, epochs = 5, validation_data=([padding(dev_encoder_seqs), padding(dev_decoder_seqs)], dev_labels))

Train on 11530 samples, validate on 4142 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8e7e200518>

In [292]:
from sklearn.metrics import classification_report

print(classification_report(dev_labels, [prob > 0.5 for prob in model.predict([padding(dev_encoder_seqs), padding(dev_decoder_seqs)])]))

              precision    recall  f1-score   support

       False       0.67      0.94      0.78      2672
        True       0.59      0.16      0.25      1470

   micro avg       0.66      0.66      0.66      4142
   macro avg       0.63      0.55      0.52      4142
weighted avg       0.64      0.66      0.59      4142



In [293]:
from sklearn.metrics import classification_report

print(classification_report(test_labels, [prob > 0.5 for prob in model.predict([padding(test_encoder_seqs), padding(test_decoder_seqs)])]))

              precision    recall  f1-score   support

       False       0.80      0.95      0.87       663
        True       0.29      0.08      0.12       175

   micro avg       0.77      0.77      0.77       838
   macro avg       0.54      0.51      0.50       838
weighted avg       0.69      0.77      0.71       838



In [237]:
def dump_scores(filename, predicted_similarity):
    with open(filename, 'w+') as f:
        for estimate in predicted_similarity:                    
            f.write("{}\t{:.4f}\n".format(str(estimate.item() > 0.5).lower(), estimate.item()))

In [294]:
dump_scores('PIT2015_zubovych_autoencoder.output', model.predict([padding(test_encoder_seqs), padding(test_decoder_seqs)]))

In [295]:
!python SemEval-PIT2015-py3/scripts/pit2015_eval_single.py SemEval-PIT2015-py3/data/test_bin.label PIT2015_zubovych_autoencoder.output

838	zubovych	autoencoder		F: 0.125	Prec: 0.286	Rec: 0.080		P-corr: 0.060	F1: 0.347	Prec: 0.210	Rec: 1.000


In [297]:
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Activation, Dropout
from keras.layers import Embedding, Input
from keras.layers import LSTM, Lambda, concatenate
from keras import regularizers

HIDDEN_DIM=100

def exponent_neg_manhattan_distance(x, hidden_size=HIDDEN_DIM):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    return K.exp(-K.sum(K.abs(x[:,:hidden_size] - x[:,hidden_size:]), axis=1, keepdims=True))

def exponent_neg_cosine_distance(x, hidden_size=HIDDEN_DIM):
    ''' Helper function for the similarity estimate of the LSTMs outputs '''
    leftNorm = K.l2_normalize(x[:,:hidden_size], axis=-1)
    rightNorm = K.l2_normalize(x[:,hidden_size:], axis=-1)
    return K.exp(K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True))

In [299]:
seq_1 = Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='sequence1')
seq_2 = Input(shape=(MAX_SEQ_LEN,), dtype='int32', name='sequence2')

input_1 = embedding_layer(seq_1)
input_2 = embedding_layer(seq_2)

l1 = LSTM(units=HIDDEN_DIM)

l1_out = l1(input_1)
l2_out = l1(input_2)

concats = concatenate([l1_out, l2_out], axis=-1)

#main_output = Lambda(exponent_neg_cosine_distance, output_shape=(1,))(concats)
main_output = Lambda(exponent_neg_manhattan_distance, output_shape=(1,))(concats)
#dense_ouput = Dense(1024, activation="relu")(concats)
#main_output = Dense(1, activation="sigmoid")(dense_ouput)

model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

opt = keras.optimizers.Adadelta(lr = 0.1, clipnorm=1.25)

#model.compile(optimizer=RMSprop(lr=1e-4), loss='mean_squared_error', metrics=['accuracy'])
#model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
model.compile(optimizer=opt,loss='mean_squared_error', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sequence1 (InputLayer)          (None, 18)           0                                            
__________________________________________________________________________________________________
sequence2 (InputLayer)          (None, 18)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 18, 300)      2692800     sequence1[0][0]                  
                                                                 sequence2[0][0]                  
__________________________________________________________________________________________________
lstm_23 (LSTM)                  (None, 100)          160400      embedding_3[6][0]                
          

In [301]:
model.fit([padding(train_encoder_seqs), padding(train_decoder_seqs)], np.array(train_labels),
          batch_size = 100, epochs = 2, validation_data = ([padding(dev_encoder_seqs), padding(dev_decoder_seqs)], dev_labels))

Train on 11530 samples, validate on 4142 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8e7e221fd0>

In [302]:
print(classification_report(dev_labels, [prob > 0.5 for prob in  model.predict([padding(dev_encoder_seqs), padding(dev_decoder_seqs)])]))

              precision    recall  f1-score   support

       False       0.67      0.86      0.75      2672
        True       0.48      0.23      0.31      1470

   micro avg       0.64      0.64      0.64      4142
   macro avg       0.57      0.55      0.53      4142
weighted avg       0.60      0.64      0.60      4142



In [304]:
print(classification_report(test_labels, [prob > 0.5 for prob in model.predict([padding(test_encoder_seqs), padding(test_decoder_seqs)])]))

              precision    recall  f1-score   support

       False       0.81      0.87      0.84       663
        True       0.31      0.23      0.26       175

   micro avg       0.73      0.73      0.73       838
   macro avg       0.56      0.55      0.55       838
weighted avg       0.71      0.73      0.72       838



In [305]:
dump_scores('PIT2015_zubovych_MaLSTM.output', model.predict([padding(test_encoder_seqs), padding(test_decoder_seqs)]))

In [307]:
!python SemEval-PIT2015-py3/scripts/pit2015_eval_single.py SemEval-PIT2015-py3/data/test.label PIT2015_zubovych_MaLSTM.output

Traceback (most recent call last):
  File "SemEval-PIT2015-py3/scripts/pit2015_eval_single.py", line 205, in <module>
    print(PITEval(testlabelfile, outputfile))
  File "SemEval-PIT2015-py3/scripts/pit2015_eval_single.py", line 181, in PITEval
    return EvalSingleSystem(labelfile, outfile)
  File "SemEval-PIT2015-py3/scripts/pit2015_eval_single.py", line 168, in EvalSingleSystem
    pcorrelation = pearson(sysscores, goldscores)
  File "SemEval-PIT2015-py3/scripts/pit2015_eval_single.py", line 32, in pearson
    assert len(x) == len(y)
AssertionError
