In [1427]:
import json
import urllib
import requests
import os
import hashlib
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import en_core_web_md
# from spacy.tokens import Doc
from nltk.stem.porter import PorterStemmer

In [531]:
nlp = en_core_web_md.load()

In [1453]:
def filter_unknowns(data):
    return [x for x in data if x['gold_label'] != '-']

with open('../../../../../corpora/snli_1.0/snli_1.0_train.jsonl') as f:
    train_data = filter_unknowns([json.loads(line) for line in f.readlines()])
    
with open('../../../../../corpora/snli_1.0/snli_1.0_dev.jsonl') as f:
    dev_data = filter_unknowns([json.loads(line) for line in f.readlines()])
    
with open('../../../../../corpora/snli_1.0/snli_1.0_test.jsonl') as f:
    test_data = filter_unknowns([json.loads(line) for line in f.readlines()])

In [1676]:
def compose(*funcs):
    def inner(*arg):
        res = {}
        for f in funcs:
            res.update(f(*arg))
        return res
    return inner

def get_nlp_normalized(sent):
    doc = nlp(sent)
    neg = ['not', 'n\'t', 'neither', 'nor', 'never', 'none', 'nowhere']
    neg_processed = []
    neg_ind = 100500
    for tok in doc:
        if tok.lower_ in neg:
            neg_ind = tok.i
        elif tok.pos_ == 'PUNCT':
            neg_ind = 100500
            neg_processed.append(tok.text)
        elif tok.i > neg_ind:
            neg_processed.append('NOT_' + tok.lemma_)
        else:
            neg_processed.append(tok.text)
    return nlp(' '.join(neg_processed))


def filter_stop_words(doc):
    return [x for x in doc if not (x.pos_ == 'DET' or x.pos_ == 'NUM' or x.is_stop and x.dep_ != 'ROOT')]


def normalize_sent(func):
    def inner(s1, s2): 
        return func(filter_stop_words(s1), filter_stop_words(s2)) 
    return inner


def get_classifier():
    pipe = Pipeline([
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression(random_state=42, multi_class='multinomial',
                                   max_iter=100, solver='sag', n_jobs=-1))])

    return pipe

# TODO: use UAS for deps, compare UAS
def get_intersection(ents1, ents2):
    setA = set(ents1)
    setB = set(ents2)
    universe = setA | setB
#     if not universe:
    if not setB:
        return 'NONE'

#     return len(setA & setB)/(len(universe))
    return len(setA & setB)/(len(setB))

# TODO: try to use, see if it's better than ^^
def get_intersection_alt(ents1, ents2):
    if not ents2:
        return 'NONE'
    i = [x for x in ents2 if x in ents1]

    return len(i)/(len(setB))


def get_tokens_similarity(toks1, toks2):
    setA = set(toks1)
    setB = set(toks2)
    universe = set(toks1) | set(toks2)
    sim = [x.similarity(y) for x in setA for y in setB if x.has_vector and y.has_vector]
    return len(sim)/(len(universe))


def get_ngrams(text):
    res = []
    for i in range(0, len(text), 3):
        if i > 0 and i + 3 <= len(text):
            res.append(text[i:i + 3])
        elif i > 0 and i + 3 > len(text):
            res.append(text[i:i + 3] + '</S>')
        else:
            res.append('<S>' + text[i:i + 3])
    return res


def feature_extractor_base(doc1, doc2):
    feats = {}
    feats['similarity'] = doc1.similarity(doc2)
    
    return feats

# It makes it a bit worse
# TODO: investigate and improve
@normalize_sent
def feature_extractor_ner(doc1, doc2):
    def _inner(doc):
        return [x.ent_type_ for x in doc]
    feats = {}

    feats['ner'] = get_intersection(_inner(doc1), _inner(doc2))
    
    return feats

@normalize_sent
def feature_extractor_word(doc1, doc2):
    def _lemm(doc):
        return [x.lemma_ for x in doc]
    def _noun(doc):
        return [x.lemma_ for x in doc if x.pos_ == 'NOUN']
    def _verb(doc):
        return [x.lemma_ for x in doc if x.pos_ == 'VERB']

    feats = {}

    feats['lemma'] = get_intersection(_lemm(doc1), _lemm(doc2))
    feats['noun'] = get_intersection(_noun(doc1), _noun(doc2))
    feats['verb'] = get_intersection(_verb(doc1), _verb(doc2))
    
    return feats



@normalize_sent
def feature_extractor_verb_nn_sim(doc1, doc2):
    def get_by_pos(pos):
        t1 = [x for x in doc1 if x.pos_ == pos]
        t2 = [x for x in doc2 if x.pos_ == pos]
        return t1, t2
        
    feats = {}
    
    sent1_verbs, sent2_verbs = get_by_pos('VERB')
    sent1_nouns, sent2_nouns = get_by_pos('NOUN')
    
    if sent1_verbs and sent2_verbs:
        feats['v-similar'] = get_tokens_similarity(sent1_verbs, sent2_verbs)
    
    if sent1_nouns and sent2_nouns:
        feats['nn-similar'] = get_tokens_similarity(sent1_nouns, sent2_nouns)

    return feats


@normalize_sent
def feature_extractor_ngrams(doc1, doc2):
    def _ng_pos(doc):
        return get_ngrams(' '.join([x.pos_ for x in doc]))
    def _ng_dep(doc):
        return get_ngrams(' '.join([x.dep_ for x in doc]))
    def _ng_lemma(doc):
        return get_ngrams(' '.join([x.lemma_ for x in doc]))

    feats = {}

    feats['ngr-pos'] = get_intersection(_ng_pos(doc1), _ng_pos(doc2))
    feats['ngr-dep'] = get_intersection(_ng_dep(doc1), _ng_dep(doc2))
    feats['ngr-lemma'] = get_intersection(_ng_lemma(doc1), _ng_lemma(doc2))
    
    return feats



@normalize_sent
def feature_extractor_stemm(doc1, doc2):
    stemmer = PorterStemmer()
    def _stem_w(doc):
        return [stemmer.stem(x.text) for x in doc]
    def _stem_v(doc):
        return [stemmer.stem(x.text) for x in doc if x.pos_ == 'VERB']
    def _stem_n(doc):
        return [stemmer.stem(x.text) for x in doc if x.pos_ == 'NOUN']

    feats = {}

    feats['stemm-w'] = get_intersection(_stem_w(doc1), _stem_w(doc2))
    feats['stemm-v'] = get_intersection(_stem_v(doc1), _stem_v(doc2))
    feats['stemm-n'] = get_intersection(_stem_n(doc1), _stem_n(doc2))
    
    return feats


@normalize_sent
def feature_extractor_neg(doc1, doc2):
    def _get_neg(doc):
        neg = ['not', 'n\'t', 'neither', 'nor', 'never', 'none', 'nowhere']
        neg_processed = []
        neg_ind = 100500
        for tok in doc:
            if tok.lower_ in neg:
                neg_ind = tok.i
            elif tok.pos_ == 'PUNCT':
                neg_ind = 100500
                neg_processed.append(tok.lemma_)
            elif tok.i > neg_ind:
                neg_processed.append('NOT_' + tok.lemma_)
            else:
                neg_processed.append(tok.lemma_)
        return neg_processed

    feats = {}
    feats['neg'] = get_intersection(_get_neg(doc1), _get_neg(doc2))
    return feats


#### Grammatical similarity
def feature_extractor_deps(doc1, doc2):
    def _inner_1(doc):
        return [x.dep for x in doc]
    def _inner_2(doc):
        return [x.head.dep for x in doc]
    def _inner_3(doc):
        return [(x.lemma_, x.head.lemma_, x.dep) for x in doc]
    
    feats = {}

    feats['dep'] = get_intersection(_inner_1(doc1), _inner_1(doc2))
    feats['head-dep'] = get_intersection(_inner_2(doc1), _inner_2(doc2))
#     feats['l-edge-dep'] = get_intersection(_inner_3(doc1), _inner_3(doc2))
    
    return feats

# TODO: syntactic relations (x[0].dep <-- x[1])


def feature_extractor_semant(cache):
#     @normalize_sent
    def inner(doc1, doc2):
        feats = {}

        def _get_rels(doc):
            for tok in doc:
                if tok.dep_ == 'ROOT':
                    if tok.lemma_ not in cache:
                        rels = get_rels(tok.lemma_)
                        cache[tok.lemma_] = rels
                        return rels
                    else:
                        return cache[tok.lemma_]


        rels1 = _get_rels(doc1)
        rels2 = _get_rels(doc2)
        root1 = [x for x in doc1 if x.dep_ == 'ROOT'][0].lemma_
                    
        feats['syn'] = len([x for x in set(rels2['synonyms']) if x == root1])
        feats['rel'] = len([x for x in set(rels2['related']) if x == root1])
        feats['mean'] = len([x for x in set(rels2['meanings']) if x == root1 or x in rels1['meanings']])
        feats['sim'] = len([x for x in set(rels2['similarities']) if x == root1])
        feats['hyp'] = len([x for x in set(rels2['hyponyms']) if x == root1])
        feats['mer'] = len([x for x in set(rels2['meronyms']) if x == root1])
        feats['common'] = len([x for x in set(rels2['common_origins']) if x == root1 or x in rels1['common_origins']])
        feats['form'] = len([x for x in set(rels2['forms']) if x == root1 or x in rels1['forms']])
        feats['ant'] = len([x for x in set(rels2['antonyms']) if x in rels1['antonyms']])

        return feats
    return inner


# def get_data(dataset, feature_extractor, cache):
#     features = []
#     labels = []

    
#     for i, ds in enumerate(dataset):
#         sent1 = ds['sentence1']
#         sent2 = ds['sentence2']
#         md5_1 = hashlib.md5(b'{sent1}')
#         md5_2 = hashlib.md5(b'{sent2}')
        
#         nlp1 = None
#         nlp2 = None

#         if md5_1 not in cache:
#             nlp1 = nlp(sent1)
#             cache[md5_1] = nlp1
#         else:
#             nlp1 = cache[md5_1]
#         if md5_2 not in cache:
#             nlp2 = nlp(sent2)
#             cache[md5_2] = nlp2
#         else:
#             nlp2 = cache[md5_2]
            

#         features.append(feature_extractor(nlp1, nlp2))
#         labels.append(ds['gold_label'])
#         if i % 1000 == 0:
#             print(i)
                        
#     return features, labels


def get_data(docs, raw_data, feature_extractor):
    features = []
    labels = []

    for i, doc_pair in enumerate(docs):
        nlp1, nlp2 = doc_pair
            
        features.append(feature_extractor(nlp1, nlp2))
        labels.append(raw_data[i]['gold_label'])
#         if i % 1000 == 0:
#             print(i)
                        
    return features, labels


def print_result(train_docs, test_docs, train_raw_data, test_raw_data, feature_extractor):
    X_train, y_train = get_data(train_docs, train_raw_data, feature_extractor)
    X_dev, y_dev = get_data(test_docs, test_raw_data, feature_extractor)
    clf.fit(X_train, y_train)
    print(classification_report(y_dev, clf.predict(X_dev)))
    
# def print_result(train_data, test_data, feature_extractor, cache):
#     X_train, y_train = get_data(train_data, feature_extractor, cache)
#     X_dev, y_dev = get_data(test_data, feature_extractor, cache)
#     clf.fit(X_train, y_train)
#     print(classification_report(y_dev, clf.predict(X_dev)))
    
    
def get_concepts(concept):
    offset = 0
    req = requests.get('http://api.conceptnet.io/c/en/' + concept + '?offset=' + str(offset) + '&limit=100').json()
    all_edges = req
    return all_edges

In [1639]:
clf = get_classifier()

### Baseline (just simply use sentence similarity from spacy)

In [1634]:
print_result(train_docs, test_docs, train_data, test_data, feature_extractor_base)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 20 epochs took 8 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.3s finished


               precision    recall  f1-score   support

contradiction       0.42      0.51      0.46      3237
   entailment       0.43      0.62      0.51      3368
      neutral       0.35      0.11      0.17      3219

     accuracy                           0.42      9824
    macro avg       0.40      0.42      0.38      9824
 weighted avg       0.40      0.42      0.38      9824



### 1. With NER intersection (-)

In [1635]:
feature_extractor = compose(feature_extractor_base, feature_extractor_ner)
# print_result(train_data, dev_data, feature_extractor, {})
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 21 epochs took 8 seconds
               precision    recall  f1-score   support

contradiction       0.42      0.53      0.47      3237
   entailment       0.42      0.68      0.52      3368
      neutral       0.60      0.07      0.13      3219

     accuracy                           0.43      9824
    macro avg       0.48      0.43      0.37      9824
 weighted avg       0.48      0.43      0.38      9824



[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.0s finished


### 2. With word intersection

In [1673]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word)
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.48      0.57      0.52      3237
   entailment       0.55      0.66      0.60      3368
      neutral       0.45      0.27      0.34      3219

     accuracy                           0.50      9824
    macro avg       0.49      0.50      0.49      9824
 weighted avg       0.49      0.50      0.49      9824



In [None]:
# VERB & NOUN tokens similarity (+) !!!! IT TAKES AGES !!!. mab try later
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_verb_nn_sim,
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

### With ngrams

In [1669]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.48      0.57      0.52      3237
   entailment       0.56      0.66      0.60      3368
      neutral       0.45      0.29      0.35      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.50      0.49      9824
 weighted avg       0.50      0.51      0.49      9824



### Стеми. Покращення практично нема, а стеммер віджирає багато часу. Викидаємо

In [1674]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_stemm
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.49      0.58      0.53      3237
   entailment       0.57      0.66      0.61      3368
      neutral       0.45      0.29      0.35      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.51      0.50      9824
 weighted avg       0.50      0.51      0.50      9824



In [1677]:
# neg - throw it
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_neg
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.48      0.57      0.52      3237
   entailment       0.56      0.66      0.60      3368
      neutral       0.45      0.29      0.35      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.50      0.49      9824
 weighted avg       0.50      0.51      0.49      9824



In [1678]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.49      0.56      0.52      3237
   entailment       0.56      0.65      0.60      3368
      neutral       0.45      0.31      0.36      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.51      0.50      9824
 weighted avg       0.50      0.51      0.50      9824



In [1679]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps,
                            feature_extractor_semant({})
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.49      0.56      0.52      3237
   entailment       0.56      0.65      0.60      3368
      neutral       0.45      0.31      0.36      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.51      0.50      9824
 weighted avg       0.50      0.51      0.50      9824



In [661]:
s1 = 'three bikers stop in town.'
s2 = 'The bikers didn\'t stop in the town.'
print(get_intersection([x.head for x in nlp(s1)], [x.head for x in nlp(s2)]))

0.0


In [None]:
def find_rel(t1, t2, doc):
    if t1.head == t2

In [718]:
from nltk.corpus import wordnet as wn

In [1284]:
def get_rels(word):
    concepts = get_concepts_local(word)
#     print(word)

    synonyms = []
    related = []
    forms = []
    hyponyms = []
    meronyms = []
    holonyms = []
    capabilities = []
    causes = []
    antonyms = []
    meanings = []
    similarities = []
    common_origins = []
    can_be_done_to = []
    
    def _check_rel(rel_type, rel_list):
        if concept['rel']['label'] == rel_type:
            lab = concept['end']['label']
            if not lab in rel_list:
                rel_list.append(lab)
    
    for concept in concepts:
        _check_rel('Synonym', synonyms)
        _check_rel('RelatedTo', related)
        _check_rel('FormOf', forms)
        _check_rel('IsA', hyponyms)
        _check_rel('PartOf', meronyms)
        _check_rel('UsedFor', holonyms)
        _check_rel('CapableOf', capabilities)
        _check_rel('Antonym', antonyms)
        _check_rel('DefinedAs', meanings)
        _check_rel('SimilarTo', similarities)
        _check_rel('EtymologicallyRelatedTo', common_origins)
        _check_rel('ReceivesAction', can_be_done_to)
        
    return {
        'synonyms': synonyms,
        'related': related,
        'forms': forms,
        'hyponyms': hyponyms,
        'meronyms': meronyms,
        'holonyms': holonyms,
        'capabilities': capabilities,
        'antonyms': antonyms,
        'meanings': meanings,
        'similarities': similarities,
        'common_origins': common_origins,
        'can_be_done_to': can_be_done_to,
    }


In [1481]:
def feature_extractor_syns(doc1, doc2):
    feats = {}

    rels1 = [get_rels(x.lemma_) for x in doc1 if x.dep_ == 'ROOT']
    rels2 = [get_rels(x.lemma_) for x in doc2 if x.dep_ == 'ROOT']
    
    syns1 = [x['synonyms'] for x in rels1]
    ss = 0
    for rel in rels2:
        for syn in rel['synonyms']:
            if syn in syns1:
                ss += 1
    feats['ss'] = ss
    
    return feats

In [1149]:
def get_concepts_for_roots(data):
    ex_conc = []
    def get_concepts_for_sent(sent):
        s_conc = None
        if os.path.isfile('./concs.txt'):
            with open('./concs.txt') as f:
                ex_conc = [x.rstrip() for x in f.readlines()]
        else:
            ex_conc = []
        for tok in nlp(sent):
            if tok.lemma_ not in ex_conc and tok.dep_ == 'ROOT':
                s_conc = get_concepts(tok.lemma_)['edges']
                with open('./concs.txt', 'a') as f:
                    f.write(tok.lemma_ + '\n')
                ex_conc.append(tok.lemma_)
                
        return s_conc

    conc = []
    for i, item in enumerate(data):
        conc.append(get_concepts_for_sent(item['sentence1']))
        conc.append(get_concepts_for_sent(item['sentence2']))
    return conc

valid_relations = ['Synonym', 'RelatedTo', 'FormOf', 'IsA', 'PartOf', 'UsedFor', 'CapableOf',
                  'Antonym', 'DefinedAs', 'SimilarTo', 'EtymologicallyRelatedTo', 'ReceivesAction']

In [1242]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def get_conc(data, path):
    if not os.path.isdir(path):
        os.mkdir(path)
    chunk = list(chunks(data, 500))
    i = 0
    for ch in chunk:
        train_conc_root = get_concepts_for_roots(ch)
        with open(f'./{path}/{i}.json', 'w') as f:
            non_null = [x for x in train_conc_root if x]
            filtered = [x for conc in non_null for x in conc if x['rel']['label'] in valid_relations \
                        and x['start']['language'] == 'en' and x['end']['language'] == 'en']
            json.dump(filtered, f)
            i+= 1
            

def get_concepts_local(word):
    return [v for dic in normalized for k, v in dic.items() if k == word]

In [None]:
get_conc(test_data, 'test_conc')

In [1255]:
def merge_concepts(dirs):
    res = []
    for d in dirs:
        files = os.listdir(d)
        for file in files:
            with open(os.path.join(d, file)) as f:
                cont = json.load(f)
                res += cont
    return res


def normalize_concepts(concepts):
    res = []
    for concept in concepts:
        if concept['start']['language'] == 'en':
            res.append({concept['start']['label'].lower(): concept})
    return res

In [1205]:
all_concepts = merge_concepts(['train_conc', 'dev_conc', 'test_conc'])

In [1261]:
# get_concepts_local(normalized, 'human')
get_rels('human')
# # print(all_concepts[0])

{'synonyms': ['human', 'ljudski', 'umano'],
 'related': ['human', 'person', 'being', 'humane'],
 'forms': ['human'],
 'hyponyms': ['a biped', 'a human', 'primate'],
 'meronyms': ['an ecology'],
 'holonyms': [],
 'capabilities': ['think critically',
  'laugh about a joke',
  'talk to a human',
  'torture',
  'taste dish'],
 'antonyms': [],
 'meanins': [],
 'similarities': ['human', 'manlike', 'earthborn', 'imperfect'],
 'common_origins': [],
 'can_be_done_to': []}

In [1257]:
with open('./all_concepts.json', 'w') as f:
    json.dump(normalized, f)

In [1256]:
normalized = normalize_concepts(all_concepts)

In [1260]:
print(len(normalized))

72021


In [1384]:
d1 = nlp('A person is at a diner, ordering an omelette.')

In [1424]:
d = nlp('I love you.')
print(dir(d))

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_py_tokens', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_disk', 'get_extension', 'get_lca_matrix', 'has_extension', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'merge', 'noun_chunks', 'noun_chunks_iterator', 'print_tree', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_extension', 'similarity', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'to_disk', 'to_json', 'to_utf8_array', 'user_data', 'user_hooks', 

In [1423]:
print(d.__hash__())

-9223372036230035196


In [1425]:
print(d.__hash__())

634785270


In [1454]:
print(train_data[0])

{'annotator_labels': ['neutral'], 'captionID': '3416050480.jpg#4', 'gold_label': 'neutral', 'pairID': '3416050480.jpg#4r1n', 'sentence1': 'A person on a horse jumps over a broken down airplane.', 'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )', 'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))', 'sentence2': 'A person is training his horse for a competition.', 'sentence2_binary_parse': '( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )', 'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))'}


In [1457]:
print(train_data[2])

{'annotator_labels': ['entailment'], 'captionID': '3416050480.jpg#4', 'gold_label': 'entailment', 'pairID': '3416050480.jpg#4r1e', 'sentence1': 'A person on a horse jumps over a broken down airplane.', 'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )', 'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))', 'sentence2': 'A person is outdoors, on a horse.', 'sentence2_binary_parse': '( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )', 'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))'}


In [1474]:
print(len(train_data))
print(len(deduped_train))

549367
549367


In [1585]:
def get_nlps(data):
    docs = []
    for i, sent in enumerate(data):
        docs.append((nlp(sent['sentence1']), nlp(sent['sentence2'])))
        if i % 1000 == 0:
            print('>> doc', i)
    return docs

In [1544]:
docs = get_nlps(train_data)

>> doc 0
>> doc 1000
>> doc 2000
>> doc 3000
>> doc 4000
>> doc 5000
>> doc 6000
>> doc 7000
>> doc 8000
>> doc 9000
>> doc 10000
>> doc 11000
>> doc 12000
>> doc 13000
>> doc 14000
>> doc 15000
>> doc 16000
>> doc 17000
>> doc 18000
>> doc 19000
>> doc 20000
>> doc 21000
>> doc 22000
>> doc 23000
>> doc 24000
>> doc 25000
>> doc 26000
>> doc 27000
>> doc 28000
>> doc 29000
>> doc 30000
>> doc 31000
>> doc 32000
>> doc 33000
>> doc 34000
>> doc 35000
>> doc 36000
>> doc 37000
>> doc 38000
>> doc 39000
>> doc 40000
>> doc 41000
>> doc 42000
>> doc 43000
>> doc 44000
>> doc 45000
>> doc 46000
>> doc 47000
>> doc 48000
>> doc 49000
>> doc 50000
>> doc 51000
>> doc 52000
>> doc 53000
>> doc 54000
>> doc 55000
>> doc 56000
>> doc 57000
>> doc 58000
>> doc 59000
>> doc 60000
>> doc 61000
>> doc 62000
>> doc 63000
>> doc 64000
>> doc 65000
>> doc 66000
>> doc 67000
>> doc 68000
>> doc 69000
>> doc 70000
>> doc 71000
>> doc 72000
>> doc 73000
>> doc 74000
>> doc 75000
>> doc 76000
>> doc 77000

In [1564]:
train_docs_json = [[x.to_json(), y.to_json()] for x, y in doc.values() for doc in docs]

KeyboardInterrupt: 

In [1626]:
# TODO: remove
def get_doc_pairs(docs):
    pairs = []
    for i, doc in enumerate(docs):
        for x, y in doc.values():
            pairs.append((x, y))
            if i % 1000 == 0:
                print('>> doc', i)
    return pairs

In [1627]:
train_docs = get_doc_pairs(docs)

>> doc 0
>> doc 1000
>> doc 2000
>> doc 3000
>> doc 4000
>> doc 5000
>> doc 6000
>> doc 7000
>> doc 8000
>> doc 9000
>> doc 10000
>> doc 11000
>> doc 12000
>> doc 13000
>> doc 14000
>> doc 15000
>> doc 16000
>> doc 17000
>> doc 18000
>> doc 19000
>> doc 20000
>> doc 21000
>> doc 22000
>> doc 23000
>> doc 24000
>> doc 25000
>> doc 26000
>> doc 27000
>> doc 28000
>> doc 29000
>> doc 30000
>> doc 31000
>> doc 32000
>> doc 33000
>> doc 34000
>> doc 35000
>> doc 36000
>> doc 37000
>> doc 38000
>> doc 39000
>> doc 40000
>> doc 41000
>> doc 42000
>> doc 43000
>> doc 44000
>> doc 45000
>> doc 46000
>> doc 47000
>> doc 48000
>> doc 49000
>> doc 50000
>> doc 51000
>> doc 52000
>> doc 53000
>> doc 54000
>> doc 55000
>> doc 56000
>> doc 57000
>> doc 58000
>> doc 59000
>> doc 60000
>> doc 61000
>> doc 62000
>> doc 63000
>> doc 64000
>> doc 65000
>> doc 66000
>> doc 67000
>> doc 68000
>> doc 69000
>> doc 70000
>> doc 71000
>> doc 72000
>> doc 73000
>> doc 74000
>> doc 75000
>> doc 76000
>> doc 77000

In [1586]:
dev_docs = get_nlps(dev_data)

>> doc 0
>> doc 1000
>> doc 2000
>> doc 3000
>> doc 4000
>> doc 5000
>> doc 6000
>> doc 7000
>> doc 8000
>> doc 9000


In [1587]:
test_docs = get_nlps(test_data)

>> doc 0
>> doc 1000
>> doc 2000
>> doc 3000
>> doc 4000
>> doc 5000
>> doc 6000
>> doc 7000
>> doc 8000
>> doc 9000


In [1629]:
train_docs[0]

(A person on a horse jumps over a broken down airplane.,
 A person is training his horse for a competition.)