In [1757]:
import json
import urllib
import requests
import os
import hashlib
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import en_core_web_md
from nltk.stem.porter import PorterStemmer
from nltk import ngrams

In [531]:
nlp = en_core_web_md.load()

In [1453]:
def filter_unknowns(data):
    return [x for x in data if x['gold_label'] != '-']

with open('../../../../../corpora/snli_1.0/snli_1.0_train.jsonl') as f:
    train_data = filter_unknowns([json.loads(line) for line in f.readlines()])
    
with open('../../../../../corpora/snli_1.0/snli_1.0_dev.jsonl') as f:
    dev_data = filter_unknowns([json.loads(line) for line in f.readlines()])
    
with open('../../../../../corpora/snli_1.0/snli_1.0_test.jsonl') as f:
    test_data = filter_unknowns([json.loads(line) for line in f.readlines()])

In [1892]:
''' Utils '''


def compose(*funcs):
    def inner(*arg):
        res = {}
        for f in funcs:
            res.update(f(*arg))
        return res
    return inner


def filter_stop_words(doc):
    return [x for x in doc if not (x.pos_ == 'DET' or x.pos_ == 'NUM'
                                   or x.is_stop and x.dep_ != 'ROOT')]


def normalize_sent(func):
    def inner(s1, s2):
        return func(filter_stop_words(s1), filter_stop_words(s2))
    return inner


def get_classifier():
    pipe = Pipeline([
        ('dict_vect', DictVectorizer()),
        ('lrc', LogisticRegression(random_state=42, multi_class='multinomial',
                                   max_iter=100, solver='sag', n_jobs=-1))])

    return pipe


def get_intersection(ents1, ents2):
    setA = set(ents1)
    setB = set(ents2)
    universe = setA | setB
    if not setB:
        return 'NONE'

    return len(setA & setB)/(len(setB))


def get_tokens_similarity(toks1, toks2):
    setA = set(toks1)
    setB = set(toks2)
    universe = set(toks1) | set(toks2)
    sim = [x.similarity(y)
           for x in setA for y in setB if x.has_vector and y.has_vector]
    return len(sim)/(len(universe))


def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [None]:
''' Working with concepts'''


def get_concepts_for_roots(data):
    ex_conc = []

    def get_concepts_for_sent(sent):
        s_conc = None
        if os.path.isfile('./concs.txt'):
            with open('./concs.txt') as f:
                ex_conc = [x.rstrip() for x in f.readlines()]
        else:
            ex_conc = []
        for tok in nlp(sent):
            if tok.lemma_ not in ex_conc and tok.dep_ == 'ROOT':
                s_conc = get_concepts(tok.lemma_)['edges']
                with open('./concs.txt', 'a') as f:
                    f.write(tok.lemma_ + '\n')
                ex_conc.append(tok.lemma_)

        return s_conc

    conc = []
    for i, item in enumerate(data):
        conc.append(get_concepts_for_sent(item['sentence1']))
        conc.append(get_concepts_for_sent(item['sentence2']))
    return conc


def get_concepts(concept):
    offset = 0
    req = requests.get('http://api.conceptnet.io/c/en/' +
                       concept + '?offset=' + str(offset) + '&limit=100').json()
    all_edges = req
    return all_edges


def get_conc(data, path):
    valid_relations = ['Synonym', 'RelatedTo', 'FormOf', 'IsA', 'PartOf', 'UsedFor', 'CapableOf',
                       'Antonym', 'DefinedAs', 'SimilarTo', 'EtymologicallyRelatedTo',
                       'ReceivesAction']
    if not os.path.isdir(path):
        os.mkdir(path)
    chunk = list(chunks(data, 500))
    i = 0
    for ch in chunk:
        train_conc_root = get_concepts_for_roots(ch)
        with open(f'./{path}/{i}.json', 'w') as f:
            non_null = [x for x in train_conc_root if x]
            filtered = [x for conc in non_null for x in conc if x['rel']['label'] in valid_relations
                        and x['start']['language'] == 'en' and x['end']['language'] == 'en']
            json.dump(filtered, f)
            i += 1


def merge_concepts(dirs):
    res = []
    for d in dirs:
        files = os.listdir(d)
        for file in files:
            with open(os.path.join(d, file)) as f:
                cont = json.load(f)
                res += cont
    return res


def normalize_concepts(concepts):
    res = []
    for concept in concepts:
        if concept['start']['language'] == 'en':
            res.append({concept['start']['label'].lower(): concept})
    return res


def get_concepts_local(word):
    all_concepts = merge_concepts(['train_conc', 'dev_conc', 'test_conc'])
    normalized = normalize_concepts(all_concepts)
    return [v for dic in normalized for k, v in dic.items() if k == word]


# with open('./all_concepts.json', 'w') as f:
#     all_concepts = merge_concepts(['train_conc', 'dev_conc', 'test_conc'])
#     normalized = normalize_concepts(all_concepts)
#     json.dump(normalized, f)


def get_rels(word):
    concepts = get_concepts_local(word)

    synonyms = []
    related = []
    forms = []
    hyponyms = []
    meronyms = []
    holonyms = []
    capabilities = []
    causes = []
    antonyms = []
    meanings = []
    similarities = []
    common_origins = []
    can_be_done_to = []

    def _check_rel(rel_type, rel_list):
        if concept['rel']['label'] == rel_type:
            lab = concept['end']['label']
            if lab not in rel_list:
                rel_list.append(lab)

    for concept in concepts:
        _check_rel('Synonym', synonyms)
        _check_rel('RelatedTo', related)
        _check_rel('FormOf', forms)
        _check_rel('IsA', hyponyms)
        _check_rel('PartOf', meronyms)
        _check_rel('UsedFor', holonyms)
        _check_rel('CapableOf', capabilities)
        _check_rel('Antonym', antonyms)
        _check_rel('DefinedAs', meanings)
        _check_rel('SimilarTo', similarities)
        _check_rel('EtymologicallyRelatedTo', common_origins)
        _check_rel('ReceivesAction', can_be_done_to)

    return {
        'synonyms': synonyms,
        'related': related,
        'forms': forms,
        'hyponyms': hyponyms,
        'meronyms': meronyms,
        'holonyms': holonyms,
        'capabilities': capabilities,
        'antonyms': antonyms,
        'meanings': meanings,
        'similarities': similarities,
        'common_origins': common_origins,
        'can_be_done_to': can_be_done_to,
    }

In [1902]:
''' Feature extractors '''


def feature_extractor_base(doc1, doc2):
    feats = {}
    feats['similarity'] = doc1.similarity(doc2)

    return feats


@normalize_sent
def feature_extractor_ner(doc1, doc2):
    def _inner(doc):
        return [x.ent_type_ for x in doc]
    feats = {}

    feats['ner'] = get_intersection(_inner(doc1), _inner(doc2))

    return feats


@normalize_sent
def feature_extractor_word(doc1, doc2):
    def _lemm(doc):
        return [x.lemma_ for x in doc]

    def _noun(doc):
        return [x.lemma_ for x in doc if x.pos_ == 'NOUN']

    def _verb(doc):
        return [x.lemma_ for x in doc if x.pos_ == 'VERB']

    feats = {}

    feats['lemma'] = get_intersection(_lemm(doc1), _lemm(doc2))
    feats['noun'] = get_intersection(_noun(doc1), _noun(doc2))
    feats['verb'] = get_intersection(_verb(doc1), _verb(doc2))

    return feats


@normalize_sent
def feature_extractor_spacy_sim(doc1, doc2):
    feats = {}

    feats['similar'] = get_tokens_similarity(doc1, doc2)

    return feats


@normalize_sent
def feature_extractor_ngrams(doc1, doc2):
    def _ng_lemma(doc, n):
        if n == 3:
            return [(x.lemma_, y.lemma_, z.lemma_) for (x, y, z) in (x for x in ngrams(doc, n))]
        else:
            return [(x.lemma_, y.lemma_) for (x, y) in (x for x in ngrams(doc, n))]

    def _ng_text(doc, n):
        if n == 3:
            return [(x.text, y.text, z.text) for (x, y, z) in (x for x in ngrams(doc, n))]
        else:
            return [(x.text, y.text) for (x, y) in (x for x in ngrams(doc, n))]

    def _ng_pos(doc, n):
        if n == 3:
            return [(x.pos_, y.pos_, z.pos_) for (x, y, z) in (x for x in ngrams(doc, n))]
        else:
            return [(x.pos_, y.pos_) for (x, y) in (x for x in ngrams(doc, n))]

    def _ng_dep(doc, n):
        if n == 3:
            return [(x.dep_, y.dep_, z.dep_) for (x, y, z) in (x for x in ngrams(doc, n))]
        else:
            return [(x.dep_, y.dep_) for (x, y) in (x for x in ngrams(doc, n))]

    feats = {}

    feats['ngr-2-pos'] = get_intersection(_ng_pos(doc1, 2), _ng_pos(doc2, 2))
    feats['ngr-2-dep'] = get_intersection(_ng_dep(doc1, 2), _ng_dep(doc2, 2))
    feats['ngr-2-lemma'] = get_intersection(
        _ng_lemma(doc1, 2), _ng_lemma(doc2, 2))
    feats['ngr-2-text'] = get_intersection(
        _ng_text(doc1, 2), _ng_text(doc2, 2))
    feats['ngr-3-pos'] = get_intersection(_ng_pos(doc1, 3), _ng_pos(doc2, 3))
    feats['ngr-3-dep'] = get_intersection(_ng_dep(doc1, 3), _ng_dep(doc2, 3))
    feats['ngr-3-lemma'] = get_intersection(
        _ng_lemma(doc1, 3), _ng_lemma(doc2, 3))
    feats['ngr-3-text'] = get_intersection(
        _ng_text(doc1, 3), _ng_text(doc2, 3))

    return feats


@normalize_sent
def feature_extractor_stemm(doc1, doc2):
    stemmer = PorterStemmer()

    def _stem_v(doc):
        return [stemmer.stem(x.text) for x in doc if x.pos_ == 'VERB']

    def _stem_n(doc):
        return [stemmer.stem(x.text) for x in doc if x.pos_ == 'NOUN']

    feats = {}

    feats['stemm-v'] = get_intersection(_stem_v(doc1), _stem_v(doc2))
    feats['stemm-n'] = get_intersection(_stem_n(doc1), _stem_n(doc2))

    return feats


@normalize_sent
def feature_extractor_neg(doc1, doc2):
    def _get_neg(doc):
        neg = ['not', 'n\'t', 'neither', 'nor', 'never', 'none', 'nowhere']
        neg_processed = []
        neg_ind = 100500
        for tok in doc:
            if tok.lower_ in neg:
                neg_ind = tok.i
            elif tok.pos_ == 'PUNCT':
                neg_ind = 100500
                neg_processed.append(tok.lemma_)
            elif tok.i > neg_ind:
                neg_processed.append('NOT_' + tok.lemma_)
            else:
                neg_processed.append(tok.lemma_)
        return neg_processed

    feats = {}
    feats['neg'] = get_intersection(_get_neg(doc1), _get_neg(doc2))
    return feats


def feature_extractor_deps(doc1, doc2):
    def _inner_1(doc):
        return [(x.dep, x.head.dep) for x in doc]
    
    def _inner_2(doc):
        return [(x.dep, x.head.pos_) for x in doc]
    
    def _inner_3(doc):
        return [(x.pos_, x.head.pos_) for x in doc]
    
    def _inner_4(doc):
        return [(x.lemma_, x.head.lemma_) for x in doc]

    feats = {}

    feats['head-dep'] = get_intersection(_inner_1(doc1), _inner_1(doc2))
    feats['head-pos'] = get_intersection(_inner_2(doc1), _inner_2(doc2))
    feats['dep-pos'] = get_intersection(_inner_3(doc1), _inner_3(doc2))
    feats['dep-lemma'] = get_intersection(_inner_4(doc1), _inner_4(doc2))

    return feats


def feature_extractor_semant(cache):
    def inner(doc1, doc2):
        feats = {}

        def _get_rels(doc):
            for tok in doc:
                if tok.dep_ == 'ROOT':
                    if tok.lemma_ not in cache:
                        rels = get_rels(tok.lemma_)
                        cache[tok.lemma_] = rels
                        return rels
                    else:
                        return cache[tok.lemma_]

        rels1 = _get_rels(doc1)
        rels2 = _get_rels(doc2)
        root1_tok = [x for x in doc1 if x.dep_ == 'ROOT'][0]
        root2_tok = [x for x in doc2 if x.dep_ == 'ROOT'][0]
        root1 = root1_tok.lemma_
        root2 = root2_tok.lemma_

        neg = ['not', 'n\'t', 'neither', 'nor', 'never', 'none', 'nowhere']

        feats['syn'] = len([x for x in set(rels2['synonyms'])
                            if x == root1 and doc2[root2_tok.i - 1].text not in neg]) + \
            len([x for x in set(rels1['synonyms'])
                 if x == root2 and doc1[root1_tok.i - 1].text not in neg])
        feats['mean'] = len([x for x in set(rels2['meanings'])
                             if x == root1 or x in rels1['meanings']])
        feats['sim'] = len(
            [x for x in set(rels2['similarities']) if x == root1])
        feats['form'] = len([x for x in set(rels2['forms'])
                             if x == root1 or x in rels1['forms']])
        feats['ant'] = len(
            [x for x in set(rels2['antonyms']) if x in rels1['antonyms']])

        return feats
    return inner

In [None]:
''' Reporting '''


clf = get_classifier()


def get_data(docs, raw_data, feature_extractor):
    features = []
    labels = []

    for i, doc_pair in enumerate(docs):
        nlp1, nlp2 = doc_pair

        features.append(feature_extractor(nlp1, nlp2))
        labels.append(raw_data[i]['gold_label'])

    return features, labels


def print_result(train_docs, test_docs, train_raw_data, test_raw_data, feature_extractor):
    X_train, y_train = get_data(train_docs, train_raw_data, feature_extractor)
    X_dev, y_dev = get_data(test_docs, test_raw_data, feature_extractor)
    clf.fit(X_train, y_train)
    print(classification_report(y_dev, clf.predict(X_dev)))
    
    
''' Optimization helpers '''


def get_nlps(data):
    docs = []
    for i, sent in enumerate(data):
        docs.append((nlp(sent['sentence1']), nlp(sent['sentence2'])))
    return docs

In [None]:
train_docs = get_nlps(docs)

In [None]:
dev_docs = get_nlps(dev_data)

In [None]:
test_docs = get_nlps(test_data)

### Baseline (just simply use sentence similarity fn from spacy ¯\_(ツ)_/¯)

In [1634]:
print_result(train_docs, test_docs, train_data, test_data, feature_extractor_base)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 20 epochs took 8 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.3s finished


               precision    recall  f1-score   support

contradiction       0.42      0.51      0.46      3237
   entailment       0.43      0.62      0.51      3368
      neutral       0.35      0.11      0.17      3219

     accuracy                           0.42      9824
    macro avg       0.40      0.42      0.38      9824
 weighted avg       0.40      0.42      0.38      9824



### 1. With NER intersection (погіршення, викидаємо)

In [1635]:
feature_extractor = compose(feature_extractor_base, feature_extractor_ner)
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 21 epochs took 8 seconds
               precision    recall  f1-score   support

contradiction       0.42      0.53      0.47      3237
   entailment       0.42      0.68      0.52      3368
      neutral       0.60      0.07      0.13      3219

     accuracy                           0.43      9824
    macro avg       0.48      0.43      0.37      9824
 weighted avg       0.48      0.43      0.38      9824



[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.0s finished


### 2. With words intersection (покращення є)

In [1673]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word)
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.48      0.57      0.52      3237
   entailment       0.55      0.66      0.60      3368
      neutral       0.45      0.27      0.34      3219

     accuracy                           0.50      9824
    macro avg       0.49      0.50      0.49      9824
 weighted avg       0.49      0.50      0.49      9824



### 3. With ngrams (покращення невеличке)

In [1900]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.49      0.55      0.52      3237
   entailment       0.57      0.65      0.61      3368
      neutral       0.45      0.33      0.38      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.51      0.50      9824
 weighted avg       0.51      0.51      0.50      9824



### 4. Намагалась опрацювати заперечення. Не вийшло :(

In [1901]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_neg
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.49      0.55      0.52      3237
   entailment       0.57      0.65      0.61      3368
      neutral       0.45      0.33      0.38      3219

     accuracy                           0.51      9824
    macro avg       0.50      0.51      0.50      9824
 weighted avg       0.51      0.51      0.50      9824



### 5. With dependencies (схоже, я не зрозуміла, як їх використати, бо покращення мізерне)

In [1903]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.50      0.55      0.53      3237
   entailment       0.57      0.64      0.60      3368
      neutral       0.46      0.35      0.40      3219

     accuracy                           0.52      9824
    macro avg       0.51      0.51      0.51      9824
 weighted avg       0.51      0.52      0.51      9824



### 6. With stemms (не допомогло, тому викидаємо)

In [1904]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps,
                            feature_extractor_stemm
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.50      0.55      0.53      3237
   entailment       0.57      0.64      0.60      3368
      neutral       0.46      0.35      0.40      3219

     accuracy                           0.52      9824
    macro avg       0.51      0.52      0.51      9824
 weighted avg       0.51      0.52      0.51      9824



### 7. With semantic relations (моє найбільше розчарування...)

In [1905]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps,
                            feature_extractor_semant({})
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.50      0.55      0.53      3237
   entailment       0.57      0.65      0.60      3368
      neutral       0.46      0.35      0.40      3219

     accuracy                           0.52      9824
    macro avg       0.51      0.52      0.51      9824
 weighted avg       0.51      0.52      0.51      9824



### 8. With spacy similarity (остання надія - на штучний інтелект, раз бракує свого :))

In [1906]:
feature_extractor = compose(feature_extractor_base,
                            feature_extractor_word,
                            feature_extractor_ngrams,
                            feature_extractor_deps,
                            feature_extractor_spacy_sim,
                           )
print_result(train_docs, test_docs, train_data, test_data, feature_extractor)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


               precision    recall  f1-score   support

contradiction       0.51      0.55      0.53      3237
   entailment       0.58      0.63      0.60      3368
      neutral       0.47      0.38      0.42      3219

     accuracy                           0.52      9824
    macro avg       0.52      0.52      0.52      9824
 weighted avg       0.52      0.52      0.52      9824

