In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_lg")

## Load data

In [3]:
import json

In [4]:
def read_jsonnl(file):
    with open(file, "r") as f:
        data = [json.loads(line) for line in f.readlines()]
        return data

In [5]:
snli_path = "snli_1.0/snli_1.0"

In [6]:
train_data = read_jsonnl(f"{snli_path}/snli_1.0_train.jsonl")

In [7]:
train_data[0]

{'annotator_labels': ['neutral'],
 'captionID': '3416050480.jpg#4',
 'gold_label': 'neutral',
 'pairID': '3416050480.jpg#4r1n',
 'sentence1': 'A person on a horse jumps over a broken down airplane.',
 'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )',
 'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))',
 'sentence2': 'A person is training his horse for a competition.',
 'sentence2_binary_parse': '( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )',
 'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))'}

In [8]:
dev_data = read_jsonnl(f"{snli_path}/snli_1.0_dev.jsonl")

In [9]:
test_data = read_jsonnl(f"{snli_path}/snli_1.0_test.jsonl")

In [10]:
def get_nlp_data(nlp, data):
    X, y = [], []
    for i in range(0, len(data)):
        yi = data[i]['gold_label']
        if yi not in ['entailment', 'contradiction', 'neutral']:
            continue
            
        text_sent = data[i]['sentence1']
        text_sent_tokens = nlp(text_sent)

        hypothesis_sent = data[i]['sentence2']
        hypothesis_sent_tokens = nlp(hypothesis_sent)
        
        xi = [text_sent_tokens, hypothesis_sent_tokens]
        X.append(xi)
        y.append(yi)
        
    return X, y

In [11]:
len(train_data)

550152

To save time, let's do the experiments on smaller trainig data, then run the best result on full data. 

In [12]:
%time X_train, y_train = get_nlp_data(nlp, train_data[:30000])

CPU times: user 5min 10s, sys: 408 ms, total: 5min 11s
Wall time: 5min 12s


In [13]:
%time X_dev, y_dev = get_nlp_data(nlp, dev_data)

CPU times: user 1min 45s, sys: 204 ms, total: 1min 45s
Wall time: 1min 45s


In [14]:
len(dev_data)

10000

In [15]:
len(test_data)

10000

## Baseline

In [16]:
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [17]:
def get_jaccard_sim_by_lemma(sent_tokens1, sent_tokens2): 
    lemmas1 = set([token.lemma_ for token in sent_tokens1])
    lemmas2 = set([token.lemma_ for token in sent_tokens2])

    matched = lemmas1.intersection(lemmas2)
    return float(len(matched)) / (len(lemmas1) + len(lemmas2) - len(matched))

In [18]:
def get_jaccard_sim_by_verb(sent_tokens1, sent_tokens2): 
    verbs1 = set([token.lemma_ for token in sent_tokens1 if token.pos_ == "VERB" and not token.lemma == "be"])
    verbs2 = set([token.lemma_ for token in sent_tokens2 if token.pos_ == "VERB" and not token.lemma == "be"])

    matched = verbs1.intersection(verbs2)
    if len(verbs1) + len(verbs2) - len(matched) == 0:
        return None
        
    return float(len(matched)) / (len(verbs1) + len(verbs2) - len(matched))    

In [19]:
def find_all_verbs_in_sent(sentence, nlp):
    doc = nlp(sentence)
    return set([token.lemma_ for token in doc if token.pos_ == "VERB" and not token.lemma == "be"])

In [20]:
def exctract_features(text_sent_tokens, hypothesis_sent_tokens, updaters):
    features = {}
    
    features['text-hyp-sim'] = get_jaccard_sim_by_lemma(text_sent_tokens, hypothesis_sent_tokens)
    text_hyp_sim_verb = get_jaccard_sim_by_verb(text_sent_tokens, hypothesis_sent_tokens)
    if text_hyp_sim_verb:
        features['text-hyp-sim-verb'] = text_hyp_sim_verb

    features['text-len'] = len(text_sent_tokens)
    features['hyp-len'] = len(hypothesis_sent_tokens)
    
    for updater in updaters:
        updater(features, text_sent_tokens, hypothesis_sent_tokens)
    
    return features

In [21]:
X_train_f = [exctract_features(item[0], item[1], []) for item in X_train]

In [22]:
X_dev_f = [exctract_features(item[0], item[1], []) for item in X_dev]

In [23]:
clf = Pipeline([
    ('vect', DictVectorizer()),
    ('svm', svm.SVC())
])

In [24]:
%time clf.fit(X_train_f, y_train)

CPU times: user 36.5 s, sys: 100 ms, total: 36.6 s
Wall time: 36.7 s


Pipeline(memory=None,
         steps=[('vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [25]:
nb_clf = Pipeline([
    ('vect', DictVectorizer()),
    ('svm', MultinomialNB())
])

In [26]:
%time nb_clf.fit(X_train_f, y_train)

CPU times: user 77.2 ms, sys: 5 Âµs, total: 77.2 ms
Wall time: 76.9 ms


Pipeline(memory=None,
         steps=[('vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('svm',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [27]:
y_dev_pred = nb_clf.predict(X_dev_f)

In [28]:
print(classification_report(y_dev, y_dev_pred))

               precision    recall  f1-score   support

contradiction       0.41      0.42      0.41      3278
   entailment       0.51      0.48      0.49      3329
      neutral       0.43      0.43      0.43      3235

     accuracy                           0.45      9842
    macro avg       0.45      0.45      0.45      9842
 weighted avg       0.45      0.45      0.45      9842



In [29]:
y_dev_pred = clf.predict(X_dev_f)

In [30]:
print(classification_report(y_dev, y_dev_pred))

               precision    recall  f1-score   support

contradiction       0.42      0.62      0.50      3278
   entailment       0.57      0.54      0.56      3329
      neutral       0.51      0.29      0.37      3235

     accuracy                           0.49      9842
    macro avg       0.50      0.48      0.48      9842
 weighted avg       0.50      0.49      0.48      9842



### Improve baseline

Inspired by paper, let's try to add some important features.

#### Lexical features

In [31]:
from nltk.corpus import stopwords

In [32]:
en_stop_words = stopwords.words('english')

In [33]:
def get_jaccard_sim_by_noun(sent_tokens1, sent_tokens2): 
    verbs1 = set([token.lemma_ for token in sent_tokens1 if token.pos_ == "NOUN" or token.pos_ == "PRON"])
    verbs2 = set([token.lemma_ for token in sent_tokens2 if token.pos_ == "NOUN" or token.pos_ == "PRON"])

    matched = verbs1.intersection(verbs2)
    if len(verbs1) + len(verbs2) - len(matched) == 0:
        return None
        
    return float(len(matched)) / (len(verbs1) + len(verbs2) - len(matched))    

In [34]:
def add_jaccard_sim_by_bigram_lemma(features, sent_tokens1, sent_tokens2): 
    lemmas1 = [token.lemma_ for token in sent_tokens1]
    bigrams1 = set([''.join(lemmas1[i:i+2:]) for i in range(len(lemmas1) - 1)])
    
    lemmas2 = [token.lemma_ for token in sent_tokens2]
    bigrams2 = set([''.join(lemmas2[i:i+2:]) for i in range(len(lemmas2) - 1)])

    matched = bigrams1.intersection(bigrams2)
    
    if len(bigrams1) + len(bigrams2) - len(matched) != 0:
        features['bigram-sim'] = float(len(matched)) / (len(bigrams1) + len(bigrams2) - len(matched))

In [35]:
def add_stop_words_features(features, text_sent_tokens, hypothesis_sent_tokens):
    text_stop_words = [token.lemma_ for token in text_sent_tokens if token.lemma_ in en_stop_words]
    hyp_stop_words = [token.lemma_ for token in hypothesis_sent_tokens if token.lemma_ in en_stop_words]
    
    features['text-num-stop'] = len(text_stop_words)
    features['hyp-num-stop'] = len(hyp_stop_words)
    
    text_stop_words = set(text_stop_words)
    hyp_stop_words = set(hyp_stop_words)
    
    matched = text_stop_words.intersection(hyp_stop_words)
    
    if len(hyp_stop_words) + len(text_stop_words) - len(matched) != 0:
        features['text-hyp-stop-sim'] = float(len(matched)) / (len(hyp_stop_words) + len(text_stop_words) - len(matched))

In [36]:
def add_more_lexical_features(features, text_sent_tokens, hypothesis_sent_tokens):
    text_hyp_sim_noun = get_jaccard_sim_by_verb(text_sent_tokens, hypothesis_sent_tokens)
    if text_hyp_sim_noun:
        features['text-hyp-sim-noun'] = text_hyp_sim_noun
        
    add_stop_words_features(features, text_sent_tokens, hypothesis_sent_tokens)
    add_jaccard_sim_by_bigram_lemma(features, text_sent_tokens, hypothesis_sent_tokens)


In [37]:
def exctract_features_with_lexical(text_sent_tokens, hypothesis_sent_tokens):
    lexical_updaters = [add_more_lexical_features]
    return exctract_features(text_sent_tokens, hypothesis_sent_tokens, lexical_updaters)

In [38]:
%time X_train_f = [exctract_features_with_lexical(item[0], item[1]) for item in X_train]

CPU times: user 2.39 s, sys: 12 ms, total: 2.41 s
Wall time: 2.42 s


In [39]:
%time X_dev_f = [exctract_features_with_lexical(item[0], item[1]) for item in X_dev]

CPU times: user 839 ms, sys: 3.96 ms, total: 843 ms
Wall time: 848 ms


In [40]:
clf = Pipeline([
    ('vect', DictVectorizer()),
    ('svm', svm.SVC())
])

In [41]:
clf.fit(X_train_f, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [42]:
y_dev_pred = clf.predict(X_dev_f)

In [43]:
print(classification_report(y_dev, y_dev_pred))

               precision    recall  f1-score   support

contradiction       0.42      0.64      0.51      3278
   entailment       0.57      0.53      0.55      3329
      neutral       0.52      0.29      0.37      3235

     accuracy                           0.49      9842
    macro avg       0.50      0.49      0.48      9842
 weighted avg       0.51      0.49      0.48      9842



##### semantic features

In [44]:
from nltk.corpus import wordnet

In [45]:
def get_synonyms_and_antonyms(word):
    synonyms = []
    antonyms = []

    for syn in wordnet.synsets(word):
        try:
            for l in syn.lemmas():
                synonyms.append(l.name())
                if l.antonyms():
                     antonyms.append(l.antonyms()[0].name())
        except:
            continue
    
    return set(synonyms), set(antonyms)

In [46]:
def get_entailments_for_word(word):
    entailments = set()        
    action_syn = wordnet.synsets(word, pos='v')
    if len(action_syn) == 0:
        return entailments
    action_syn = action_syn[0]
    for e in action_syn.entailments():
        entailments.update(set(e.lemma_names()))
        
    return entailments

In [47]:
def get_root_hypernyms(word):
    result = set()
    
    action_syn = wordnet.synsets(word, pos='v')
    if len(action_syn) == 0:
        return result
    
    action_syn = action_syn[0]
    for e in action_syn.root_hypernyms():
        result.update(set(e.lemma_names()))
        
    return result

In [48]:
wordnet.synsets('cut', pos='v')[0].root_hypernyms()[0].lemma_names()

['move', 'displace']

In [49]:
get_entailments_for_word('table')

{'call_off', 'cancel', 'reschedule', 'scratch', 'scrub'}

In [50]:
POS_FOR_SYNONYMS = ["VERB", "NOUN", "PROPN"]

In [51]:
def calc_num_synonyms_and_antonyms(text_sent_tokens, hypothesis_sent_tokens):
    text_synonyms = set()
    text_antonyms = set()
    
    
    for token in text_sent_tokens:
        if token.pos_ not in POS_FOR_SYNONYMS:
            continue
            
        synonyms, antonyms = get_synonyms_and_antonyms(token.lemma_)
        text_synonyms.update(synonyms)
        text_antonyms.update(antonyms)
        
    num_synonyms, num_antonyms = 0, 0
    for token in hypothesis_sent_tokens:
        if token.lemma_ in text_synonyms:
            num_synonyms += 1
            
        if token.lemma_ in text_antonyms:
            num_antonyms += 1
            
    return num_synonyms, num_antonyms


In [52]:
def add_synonyms_antonyms_features(features, text_sent_tokens, hypothesis_sent_tokens):
    num_synonyms, num_antonyms = calc_num_synonyms_and_antonyms(text_sent_tokens, hypothesis_sent_tokens)
    features['text-hyp-synonims'] = num_synonyms
    features['text-hyp-antonyms'] = num_antonyms
    
    num_synonyms, num_antonyms = calc_num_synonyms_and_antonyms(hypothesis_sent_tokens, text_sent_tokens)
    features['hyp-text-synonims'] = num_synonyms
    features['hyp-text-antonyms'] = num_antonyms


In [53]:
def add_synonyms_antonyms_inverse_features(features, text_sent_tokens, hypothesis_sent_tokens):
    text_synonyms = set()
    text_antonyms = set()
    
    
    for token in text_sent_tokens:
        if token.pos_ not in POS_FOR_SYNONYMS:
            continue
            
        synonyms, antonyms = get_synonyms_and_antonyms(token.lemma_)
        text_synonyms.update(synonyms)
        text_antonyms.update(antonyms)
        
    num_synonyms, num_antonyms = 0, 0
    for token in hypothesis_sent_tokens:
        if token.lemma_ in text_synonyms:
            num_synonyms += 1
            
        if token.lemma_ in text_antonyms:
            num_antonyms += 1
            
    features['text-hyp-synonims'] = num_synonyms
    features['text-hyp-antonyms'] = num_antonyms

In [54]:
def add_entailments_overlap_features(features, text_sent_tokens, hypothesis_sent_tokens, prefix=""):
    hyp_entailments = set()
    
    for token in hypothesis_sent_tokens:
#         if token.pos_ != "VERB":
#             continue
            
        entailments = get_entailments_for_word(token.lemma_)
        hyp_entailments.update(entailments)
        
    num_entailments = 0
    for token in text_sent_tokens:
        if token.lemma_ in hyp_entailments:
            num_entailments += 1
            
#     print(num_entailments)       
    features[f'hyp-text-entailments'] = num_entailments
    
    
    text_entailments = set()
    for token in text_sent_tokens:
#         if token.pos_ != "VERB":
#             continue
            
        entailments = get_entailments_for_word(token.lemma_)
        text_entailments.update(entailments)
        
    num_entailments = 0
    for token in hypothesis_sent_tokens:
        if token.lemma_ in text_entailments:
            num_entailments += 1
            
#     print(num_entailments)       
    features[f'text-hyp-entailments'] = num_entailments

In [55]:
def add_hypernyms_overlap_features(features, text_sent_tokens, hypothesis_sent_tokens):
    text_hypernyms = set()
    for token in text_sent_tokens:
#         if token.pos_ != "VERB":
#             continue
            
        hypernyms = get_root_hypernyms(token.lemma_)
        text_hypernyms.update(hypernyms)
        
    num = 0
    for token in hypothesis_sent_tokens:
        if token.lemma_ in text_hypernyms:
            num += 1
            
#     print(num_entailments)       
    features['text-hyp-hypernyms'] = num
    
    
    hyp_hypernyms = set()
    for token in hypothesis_sent_tokens:
#         if token.pos_ != "VERB":
#             continue
            
        hypernyms = get_root_hypernyms(token.lemma_)
        hyp_hypernyms.update(hypernyms)
        
    num = 0
    for token in text_sent_tokens:
        if token.lemma_ in hyp_hypernyms:
            num += 1
            
    features['hyp-text-hypernyms'] = num

    

In [56]:
def exctract_features_with_sematic(nlp, snli_item):
    updaters = [
                add_more_lexical_features,
                add_synonyms_antonyms_features, 
                add_entailments_overlap_features,
                add_hypernyms_overlap_features]
    return exctract_features(nlp, snli_item, updaters)


In [57]:
%time X_train_f = [exctract_features_with_sematic(item[0], item[1]) for item in X_train]

CPU times: user 20.7 s, sys: 152 ms, total: 20.8 s
Wall time: 21.2 s


In [58]:
%time X_dev_f = [exctract_features_with_sematic(item[0], item[1]) for item in X_dev]

CPU times: user 6.65 s, sys: 28 ms, total: 6.68 s
Wall time: 6.72 s


In [59]:
clf = Pipeline([
    ('vect', DictVectorizer()),
    ('svm', svm.SVC())
])

In [60]:
clf.fit(X_train_f, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [61]:
y_dev_pred = clf.predict(X_dev_f)

In [62]:
print(classification_report(y_dev, y_dev_pred))

               precision    recall  f1-score   support

contradiction       0.50      0.57      0.53      3278
   entailment       0.59      0.63      0.61      3329
      neutral       0.51      0.40      0.45      3235

     accuracy                           0.54      9842
    macro avg       0.53      0.53      0.53      9842
 weighted avg       0.53      0.54      0.53      9842



As we can see from the table above, adding semantic features improved F1 score, it is not significant improvement, as I expected after added entailments, hypernyms features. Why? It is need depper investigation.

### Train classificatior with the best features on full data

In [63]:
%time X_train, y_train = get_nlp_data(nlp, train_data)

CPU times: user 1h 37min 26s, sys: 11.1 s, total: 1h 37min 37s
Wall time: 1h 37min 50s


In [64]:
%time X_train_f = [exctract_features_with_sematic(item[0], item[1]) for item in X_train]

CPU times: user 5min 51s, sys: 148 ms, total: 5min 51s
Wall time: 5min 51s


In [65]:
clf = Pipeline([
    ('vect', DictVectorizer()),
    ('svm', svm.SVC())
])

In [66]:
clf.fit(X_train_f, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True)),
                ('svm',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [67]:
y_dev_pred = clf.predict(X_dev_f)

In [68]:
print(classification_report(y_dev, y_dev_pred))

               precision    recall  f1-score   support

contradiction       0.51      0.58      0.54      3278
   entailment       0.63      0.63      0.63      3329
      neutral       0.51      0.44      0.47      3235

     accuracy                           0.55      9842
    macro avg       0.55      0.55      0.55      9842
 weighted avg       0.55      0.55      0.55      9842



## Measure the result on test data

In [69]:
%time X_test, y_test = get_nlp_data(nlp, test_data)

CPU times: user 1min 45s, sys: 168 ms, total: 1min 46s
Wall time: 1min 46s


In [70]:
%time X_test_f = [exctract_features_with_sematic(item[0], item[1]) for item in X_test]

CPU times: user 6.61 s, sys: 12 ms, total: 6.62 s
Wall time: 6.62 s


In [71]:
y_test_pred = clf.predict(X_test_f)

In [72]:
print(classification_report(y_test, y_test_pred))

               precision    recall  f1-score   support

contradiction       0.52      0.58      0.55      3237
   entailment       0.63      0.63      0.63      3368
      neutral       0.52      0.45      0.48      3219

     accuracy                           0.56      9824
    macro avg       0.55      0.55      0.55      9824
 weighted avg       0.56      0.56      0.55      9824

