In [1]:
import json

def get_data (filename):
    path = 'data/snli_1.0/'
    with open(path + filename, "r") as f:
        data = [json.loads(line) for line in f.readlines() if '"gold_label": "-"' not in line]
            
    labels = [x['gold_label'] for x in data]
    for x in data:
        del x['gold_label']
        del x['annotator_labels']
        del x['captionID']
        del x['pairID']
        
    return data, labels

In [2]:
train_data, train_labels = get_data('snli_1.0_train.jsonl')
dev_data, dev_labels = get_data('snli_1.0_dev.jsonl')
test_data, test_labels = get_data('snli_1.0_test.jsonl')

In [3]:
def prepare_sents (data):
    sents = []
    for item in data:
        sents.append(item['sentence1'])
        sents.append(item['sentence2'])
    return sents

train_sents = prepare_sents(train_data)
dev_sents = prepare_sents(dev_data)
test_sents = prepare_sents(test_data)

In [6]:
print('Train sentences len:', len(train_sents))
print('Dev sentences len:', len(dev_sents))
print('Test sentences len:', len(test_sents))

Train sentences len: 1098734
Dev sentences len: 19684
Test sentences len: 19648


### Baseline

In [5]:
from scipy.spatial.distance import cosine
from gensim.models.wrappers import FastText
import gensim.models.wrappers.fasttext

model = gensim.models.KeyedVectors.load_word2vec_format('data/wiki_news.vec', binary=False, encoding='utf8')

In [7]:
import spacy
import numpy as np
nlp = spacy.load("en_core_web_lg")

def get_sent_vec (sentences):
    result = []
    default_vec = np.ones(300)

    for sent in nlp.pipe(sentences, disable=["ner", "textcat"]):
        word_vectors = []
        for token in sent:
            try:
                word_vector = model[token.lemma_]
                word_vectors.append(np.array(word_vector))
            except:
                word_vectors.append(np.array(default_vec))
            
        sentence_mean = np.mean(np.array(word_vectors), axis=0).reshape(-1,1)
        result.append(sentence_mean)

    return result

train_vectors = get_sent_vec(train_sents)
dev_vectors = get_sent_vec(dev_sents)
test_vectors = get_sent_vec(test_sents)

In [8]:
def get_similiarity (list_of_vecs):
    similarities = []
    for i in range(len(list_of_vecs)):
        if i % 2 == 0:
            similarities.append(cosine(list_of_vecs[i], list_of_vecs[i + 1]))
    return similarities

train_features_base = np.array(get_similiarity(train_vectors)).reshape(-1,1)
dev_features_base = np.array(get_similiarity(dev_vectors)).reshape(-1,1)
test_features_base = np.array(get_similiarity(test_vectors)).reshape(-1,1)

In [9]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

clf = LogisticRegression(solver = 'sag',random_state=0).fit(train_features_base, train_labels)
y_pred = clf.predict(dev_features_base)
print(classification_report(y_pred, dev_labels))




               precision    recall  f1-score   support

contradiction       0.01      0.36      0.02       110
   entailment       0.69      0.37      0.48      6248
      neutral       0.39      0.36      0.38      3484

     accuracy                           0.37      9842
    macro avg       0.36      0.36      0.29      9842
 weighted avg       0.58      0.37      0.44      9842



### Лексична схожість 

In [10]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [14]:
%%time

from collections import defaultdict
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams

stemmer = SnowballStemmer(language='english')


def get_linguistic_data (sentences):
    result = []
    for sent in nlp.pipe(sentences, disable=["textcat"]):
        sent_data = defaultdict(list)
        
        for i, token in enumerate(sent):
            sent_data['words'].append(token.text.lower())
            sent_data['lemmas'].append(token.lemma_)
            sent_data['stems'].append(stemmer.stem(token.text))
            sent_data['pos'].append(token.pos_)
            sent_data['tags'].append(token.tag_)
            sent_data['lemma->pos'].append((token.lemma_,token.pos_))
            if token.pos_ == 'VERB' and sent[i-1].lemma_ in  ['never', 'not']:
                sent_data['has_neg_verb'].append(1)
        for ent in sent.ents:
            sent_data['entities'].append(ent.text) 
            sent_data['ent_labels'].append(ent.label_)
            
        result.append(sent_data)

    return result

lingdata_train = get_linguistic_data(train_sents)
lingdata_dev = get_linguistic_data(dev_sents)
lingdata_test = get_linguistic_data(test_sents)

CPU times: user 23min 29s, sys: 5min 18s, total: 28min 48s
Wall time: 29min 3s


In [16]:
def extract_features (data, vec_similiarities):
    features = []
    pairs = []
    
    for i in range(len(data)):
        if i % 2 == 0:
            pairs.append((data[i], data[i + 1]))
            
    for i, p in enumerate(pairs):
        pair_feats = dict()
        pair_feats['jaccard_words'] = jaccard_similarity(p[0]['words'], p[1]['words'])
        pair_feats['jaccard_lemmas'] = jaccard_similarity(p[0]['lemmas'], p[1]['lemmas'])
        pair_feats['jaccard_stems'] = jaccard_similarity(p[0]['stems'], p[1]['stems'])
        pair_feats['jaccard_pos'] = jaccard_similarity(p[0]['pos'], p[1]['pos'])
        pair_feats['jaccard_tags'] = jaccard_similarity(p[0]['tags'], p[1]['tags'])
        if p[0]['jaccard_entities'] and p[1]['jaccard_entities']:
            pair_feats['jaccard_entities'] = jaccard_similarity(p[0]['entities'], p[1]['entities'])
            pair_feats['jaccard_ent_tags'] = jaccard_similarity(p[0]['ent_labels'], p[1]['ent_labels'])
        if p[0]['has_neg_verb']:
            pair_feats['has_neg_verb1'] = True
        if p[1]['has_neg_verb']:
            pair_feats['has_neg_verb2'] = True
        # додаємо cosine similarity з минулої ітераці
        pair_feats['cosine_sim'] = vec_similiarities[i][0]
        features.append(pair_feats)
        
    return features

train_features_lexic = extract_features(lingdata_train, train_features_base)
dev_features_lexic = extract_features(lingdata_dev, dev_features_base)
test_features_lexic = extract_features(lingdata_test, test_features_base)

#### Лексична схожість + LogisticRegression

In [18]:
vec = DictVectorizer()
X = vec.fit_transform(train_features_lexic)
clf = LogisticRegression(solver = 'sag', random_state = 0).fit(X, train_labels)
y_pred = clf.predict(vec.transform(dev_features_lexic))
print(classification_report(y_pred, dev_labels))



               precision    recall  f1-score   support

contradiction       0.62      0.46      0.53      4438
   entailment       0.52      0.52      0.52      3376
      neutral       0.23      0.37      0.29      2028

     accuracy                           0.46      9842
    macro avg       0.46      0.45      0.44      9842
 weighted avg       0.51      0.46      0.47      9842



#### Лексична схожість + CatBoost

In [19]:
from catboost import CatBoostClassifier 

def train_catboost(X_train, X_test, y_train, y_test):
    params = {
        "iterations": 2500,
        "learning_rate": 0.5,
        "random_seed": 1,
        "od_wait": 30,
        "od_type": "Iter",
        "thread_count": 8,
        "max_depth": 10,
    }
    model = CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=200,
        plot=False,
    )
    y_pred = model.predict(X_test)
    return model

In [21]:
import pandas as pd

train_lexic_df = pd.DataFrame(train_features_lexic)
train_lexic_df.fillna(train_lexic_df.mean(), inplace= True)
dev_lexic_df = pd.DataFrame(dev_features_lexic)
dev_lexic_df.fillna(dev_lexic_df.mean(), inplace= True)
ctb = train_catboost(train_lexic_df, dev_lexic_df, train_labels, dev_labels)

y_pred = ctb.predict(dev_lexic_df)
print(classification_report(y_pred, [str(x) for x in dev_labels]))

0:	learn: 1.0612842	test: 1.0585410	best: 1.0585410 (0)	total: 588ms	remaining: 24m 29s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 1.012979669
bestIteration = 94

Shrink model to first 95 iterations.
               precision    recall  f1-score   support

contradiction       0.62      0.48      0.54      4246
   entailment       0.56      0.53      0.55      3562
      neutral       0.27      0.43      0.33      2034

     accuracy                           0.48      9842
    macro avg       0.48      0.48      0.47      9842
 weighted avg       0.53      0.48      0.50      9842



### Семантична схожість

In [22]:
def get_words (sent):
    nouns, nums, verbs, advs, adjs = set(), set(), set(), set(), set()
    
    for word, tag in sent:
        w = word.lower()
        if tag == 'NOUN':
            nouns.add(w)
        if tag == 'NUM':
            nums.add(w)
        if tag == 'VERB':
            verbs.add(w)
        if tag == 'ADV':
            advs.add(w)
        if tag == 'ADJ':
            adjs.add(w)
            
    return nouns, nums, verbs, advs, adjs

def get_unique_words (l1, l2):
    nouns1, nums1, verbs1, advs1, adjs1 = get_words(l1)
    nouns2, nums2, verbs2, advs2, adjs2 = get_words(l2)

    return (nouns1 - nouns2, nouns2-nouns1), (nums1 - nums2,nums2-nums1), \
        (verbs1 - verbs2, verbs2 - verbs1),  (advs1 - advs2, advs2 - advs1),  (adjs1 - adjs2, adjs2 - adjs1)

In [23]:
import itertools  
from nltk.corpus import wordnet 

concepts = dict()

def calc_semantic_rels_wn (l1, l2):
    
    if len(l1) and len(l2):
        count_ant, count_syn, count_hypernym, count_hyponym  = 0, 0, 0, 0
        pairs = itertools.product(l1, l2)
            
        for p in pairs:
            word1, word2 = p[0], p[1]
            if word1 not in concepts:
                syn, ant, hypernym, hyponym = set(), set(), set(), set()
                synsets = wordnet.synsets(word1)
                
                for synset in synsets:
                    for lemma in synset.lemmas():
                        syn.add(lemma.name())   
                        for lemma in lemma.antonyms():
                            ant.add(lemma.name())
                    for x in synset.hypernyms():
                        for lemma in x.lemmas():
                            hypernym.add(lemma.name())
                    for x in synset.hyponyms():
                        for lemma in x.lemmas():
                            hyponym.add(lemma.name())
                            
                concepts[word1] = {
                    'synonym': syn,
                    'antonym': ant,
                    'hypernym': hypernym,
                    'hyponym': hyponym,
                }
            syn = concepts[word1]['synonym']
            ant = concepts[word1]['antonym']
            hypernym = concepts[word1]['hypernym']
            hyponym = concepts[word1]['hyponym']

            if word2 in syn:
                count_syn += 1
            if word2 in ant:
                count_ant += 1
            if word2 in hypernym:
                count_hypernym += 1
            if word2 in hyponym:
                count_hyponym += 1
        return count_syn, count_ant, count_hypernym, count_hyponym
    
    return None, None, None, None

In [24]:
%%time

def extract_features_wn (data):
    features_list = []
    pairs = []
    
    for i in range(len(data)):
        if i % 2 == 0:
            pairs.append((data[i], data[i + 1]))   
        
    for i, p in enumerate(pairs):
            
        (unique_nn1, unique_nn2), (unique_nums1, unique_nums2), \
        (unique_verbs1, unique_verbs2), (unique_advs1, unique_advs2), \
        (unique_adjs1, unique_adjs2) = get_unique_words(p[0]['lemma->pos'], p[1]['lemma->pos'])
        
        nn_synon, nn_anton, nn_hyper, nn_hypo = calc_semantic_rels_wn(unique_nn1, unique_nn2) 
        num_synon, num_anton, num_hyper, num_hypo  = calc_semantic_rels_wn(unique_nums1, unique_nums2)
        verb_synon, verb_anton, verb_hyper, verb_hypo  = calc_semantic_rels_wn(unique_verbs1, unique_verbs2)
        adv_synon, adv_anton, adv_hyper, adv_hypo = calc_semantic_rels_wn(unique_advs1, unique_advs2) 
        adj_synon, adj_anton, adj_hyper, adj_hypo  = calc_semantic_rels_wn(unique_adjs1, unique_adjs2) 
            
        features_list.append({
            "nn_synon": nn_synon,
            "nn_anton": nn_anton,
            "nn_hyper": nn_hyper,
            "nn_hypo": nn_hypo,
            "num_synon": num_synon,
            "num_anton": num_anton,
            "num_hyper": num_hyper,
            "num_hypo ": num_hypo ,
            "verb_synon": verb_synon,
            "verb_anton": verb_anton,
            "verb_hyper": verb_hyper,
            "verb_hypo": verb_hypo,
            "adv_synon": adv_synon,
            "adv_anton": adv_anton,
            "adv_hyper": adv_hyper,
            "adv_hypo": adv_hypo,
            "adj_synon": adj_synon,
            "adj_anton": adj_anton,
            "adj_hyper": adj_hyper,
            "adj_hypo": adj_hypo,
        })
    return features_list

train_features_wn = extract_features_wn(lingdata_train)
dev_features_wn = extract_features_wn(lingdata_dev)
test_features_wn = extract_features_wn(lingdata_test)

CPU times: user 21 s, sys: 1.89 s, total: 22.9 s
Wall time: 24.4 s


In [25]:
import copy

def merge_features (features1, features2):
    features = copy.deepcopy(features1)
    for i, f in enumerate(features2):
        for k,v in f.items():
            features[i][k] = v
    return features

train_features_final = merge_features(train_features_lexic, train_features_wn)
dev_features_final = merge_features(dev_features_lexic, dev_features_wn)
test_features_final = merge_features(test_features_lexic, test_features_wn)

In [34]:
import pandas as pd

train_df = pd.DataFrame(train_features_final)
train_df.fillna(float('inf'), inplace= True)
dev_df = pd.DataFrame(dev_features_final)
dev_df.fillna(float('inf'), inplace= True)
test_df = pd.DataFrame(test_features_final)
test_df.fillna(float('inf'), inplace= True)

ctb = train_catboost(train_df, dev_df, train_labels, dev_labels)

y_pred = ctb.predict(test_df)
print(classification_report(y_pred, [str(x) for x in test_labels]))

0:	learn: 0.9959192	test: 0.9914860	best: 0.9914860 (0)	total: 695ms	remaining: 28m 56s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9109542333
bestIteration = 56

Shrink model to first 57 iterations.
               precision    recall  f1-score   support

contradiction       0.57      0.57      0.57      3244
   entailment       0.60      0.65      0.62      3150
      neutral       0.49      0.46      0.47      3430

     accuracy                           0.56      9824
    macro avg       0.56      0.56      0.56      9824
 weighted avg       0.55      0.56      0.55      9824



In [35]:
from eli5.catboost import explain_weights_catboost

explain_weights_catboost(ctb)

Weight,Feature
0.1170,nn_anton
0.1167,jaccard_stems
0.0881,cosine_sim
0.0815,jaccard_pos
0.0749,verb_hyper
0.0747,nn_hyper
0.0651,jaccard_lemmas
0.0608,nn_synon
0.0503,jaccard_words
0.0491,jaccard_tags


### Висновки

Вектори не зовсім якісно справляють із задачею знайдення логічних зв'язків (мій бейзлайн). У мене не було великих сподівань на них, але минулої лекції Сєва сказав, що це може бути гарний бейзлайном, тому я вирішила спробувати.  


Щодо роботи WordNet, то на відміну від ConceptNet, де можно було відфільтрувати сумнівні зв'язки у частинах концепту, використовуючи оцінку (або ступінь довіри автору, не пам'ятаю точно як точно називається), то у першому дуже шумні дані. Ось наприклад гіпероніми для слова cat:

In [36]:
from IPython.display import Image


![title](screenshot.png)