In [1]:
import os

from IPython.display import display, Markdown, Image

In [2]:
REPO_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
TASK_PATH = os.path.join(REPO_PATH, "tasks", "09-textual-entailment.md")
DATA_PATH = '/home/dima/Projects/snli_1.0'

In [3]:
def show_markdown(path):
    with open(path, 'r') as fh:
        content = fh.read()
    display(Markdown(content))

In [4]:
show_markdown(TASK_PATH)

# Логічне слідування

## Завдання

Розробіть класифікатор, який приймає на вхід текст та гіпотезу і визначає зв'язок між ними за трьома класами:
- entailment (гіпотеза логічно слідує з тексту)
- contradiction (гіпотеза суперечить тексту)
- neutral (гіпотеза і текст не пов'язані)

Побудуйте базове рішення та ітеративно покращуйте його, додаючи ознаки. Обов'язково випробуйте ознаки лексичної, граматичної та семантичної схожості:
* До ознак *лексичної схожості* належить частка сутностей, слів, енграмів, іменників, дієслів, числівників тощо, які перетинаються в тексті та гіпотезі. Спробуйте лематизацію чи стемінг, опрацюйте заперечення, нормалізуйте дані.
* До ознак *граматичної схожості* належить частка синтаксичних структур чи залежностей, які перетинаються в тексті та гіпотезі. Спробуйте або дерева складників, або дерева залежностей, або і те, і друге.
* До ознак *семантичної схожості* належить:
  1. Наявність лексико-семантичних зв'язків між словами в тексті та в гіпотезі. Спробуйте виявити наявність синонімів, антонімів, гіперонімів, гіпонімів, пов'язаних слів, логічного слідування тощо. Ви можете використати будь-яку онтологію ([WordNet](https://wordnet.princeton.edu/), [ConceptNet](http://conceptnet.io/), [BabelNet](https://babelnet.org/) тощо) та будь-яку бібліотеку для роботи з нею.
  2. *[Опційно]* Схожість семантичних ролей в тексті та гіпотезі. Спробуйте готові рішення для маркування семантичних ролей у тексті та гіпотезі (наприклад, [AllenNLP SRL](https://github.com/masrb/allenNLP-SRL) чи [AMR Eager](https://cohort.inf.ed.ac.uk/amreager.html)).

Корисні статті, у яких можна підглянути ознаки:
- [Feature Analysis for Paraphrase Recognition and Textual Entailment](https://pdfs.semanticscholar.org/2d7d/f0b5ac15cdaa50928031f5bb2fc63a0a1f68.pdf), 2013
- [Machine Learning Experiments for Textual Entailment](http://u.cs.biu.ac.il/~nlp/RTE2/Proceedings/02.pdf), 2006
- [A large annotated corpus for learning natural language inference](https://nlp.stanford.edu/pubs/snli_paper.pdf), 2015
- [Learning to recognize features of valid textual entailments](https://nlp.stanford.edu/pubs/rte-naacl06.pdf), 2006
- [Textual entailment](http://www.lsi.upc.edu/~ageno/anlp/textualEntailment.pdf), 2014

Для тренування та тестування використайте **train** та **dev** частини з [The Stanford Natural Language Inference (SNLI) Corpus](https://nlp.stanford.edu/projects/snli/). Протестуйте фінальне рішення на **test**-частині корпусу.

Запишіть ваші спостереження та результати в окремий файл.

## Оцінювання

100% за завдання.

## Крайній термін

09.05.2020


In [5]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [6]:
train = pd.read_csv(os.path.join(DATA_PATH, "snli_1.0_train.txt"), sep='\t')
dev = pd.read_csv(os.path.join(DATA_PATH, "snli_1.0_dev.txt"), sep='\t')
test = pd.read_csv(os.path.join(DATA_PATH, "snli_1.0_test.txt"), sep='\t')

columns_old = ['sentence1', 'sentence2', 'gold_label']
columns_new = ['premise', 'hypothesis', 'target']
name_mapping = dict(zip(columns_old, columns_new))

train = train.loc[train.gold_label != '-'][columns_old].rename(columns=name_mapping)
dev = dev.loc[dev.gold_label != '-'][columns_old].rename(columns=name_mapping)
test = test.loc[test.gold_label != '-'][columns_old].rename(columns=name_mapping)


print(train.shape, dev.shape, test.shape)

(549367, 3) (9842, 3) (9824, 3)


In [7]:
train.head()

Unnamed: 0,premise,hypothesis,target
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,Children smiling and waving at camera,There are children present,entailment


In [8]:
train.target.value_counts(normalize=True)

entailment       0.333868
contradiction    0.333451
neutral          0.332681
Name: target, dtype: float64

In [9]:
dev.target.value_counts(normalize=True)

entailment       0.338244
contradiction    0.333062
neutral          0.328693
Name: target, dtype: float64

In [10]:
test.target.value_counts(normalize=True)

entailment       0.342834
contradiction    0.329499
neutral          0.327667
Name: target, dtype: float64

### Preprocessing

In [11]:
import spacy
import pickle

from multiprocessing import cpu_count
from sklearn.metrics import *

In [12]:
nlp = spacy.load('en_core_web_md')

In [13]:
sample = train.sample()
trgt = sample.target.values[0]
prem = sample.premise.values[0]
hyp = sample.hypothesis.values[0]

In [14]:
print(trgt)
print(prem)
print(hyp)

contradiction
Woman in red shirt walking back from bowling lane.
A woman is walking in the right-turn-only lane.


In [15]:
def spacy_pipe(df, save_pkl=None, batch_size=4096):
    
    prems = nlp.pipe(df["premise"].values, n_threads=4, batch_size=batch_size)
    hyps = nlp.pipe(df["hypothesis"].values, n_threads=4, batch_size=batch_size)
    res = zip(prems, hyps, df["target"].values)
    res = pd.DataFrame(res, columns=columns_new)
    
    if save_pkl:
        res.to_pickle(save_pkl)
    
    return res

In [16]:
print("Pickling:", pd.__version__, spacy.__version__, sep='\n')

Pickling:
0.25.3
2.2.4


In [17]:
%%time

try:
    train = pd.read_pickle("train.pkl")
except FileNotFoundError:
    train = spacy_pipe(train, save_pkl="train.pkl")
    train.to_pickle('train.pkl')

print(train.shape)

(549361, 3)
CPU times: user 3min 23s, sys: 7.18 s, total: 3min 30s
Wall time: 3min 30s


In [18]:
%%time

try:
    dev = pd.read_pickle("dev.pkl")
except FileNotFoundError:
    dev = spacy_pipe(train, save_pkl="dev.pkl")
    dev.to_pickle('dev.pkl')

print(dev.shape)

(9842, 3)
CPU times: user 11.5 s, sys: 491 ms, total: 12 s
Wall time: 12.2 s


In [19]:
%%time

try:
    test = pd.read_pickle("test.pkl")
except FileNotFoundError:
    test = spacy_pipe(test)
    test.to_pickle('test.pkl')

print(test.shape)

(9824, 3)
CPU times: user 11.8 s, sys: 492 ms, total: 12.3 s
Wall time: 12.4 s


10k sample for testing hypothesises

In [20]:
# data = train.sample(10000)
# dev_sample = dev.sample(1000)
# test_sample = test.sample(1000)

data = train#.sample(100000)
dev_sample = dev
test_sample = test

data.target.value_counts()

entailment       183414
contradiction    183185
neutral          182762
Name: target, dtype: int64

### Simple baseline

In [22]:
data.groupby('target')['hypothesis', 'premise'].agg([lambda x: x.apply(len).mean()])

Unnamed: 0_level_0,hypothesis,premise
Unnamed: 0_level_1,<lambda>,<lambda>
target,Unnamed: 1_level_2,Unnamed: 2_level_2
contradiction,8.226279,14.144821
entailment,7.470973,14.143626
neutral,9.131039,14.144067


In [23]:
def tkn_num_baseline(prem, hyp):
    prem_len = len(prem)
    hyp_len = len(hyp)
    if prem_len - hyp_len < 5:
        return 'neutral'
    if prem_len - hyp_len < 7:
        return 'contradiction'
    else:
        return 'entailment'

In [24]:
pred = test.apply(lambda x: tkn_num_baseline(x.premise, x.hypothesis), 1)

print(classification_report(test['target'], pred))

               precision    recall  f1-score   support

contradiction       0.31      0.12      0.18      3237
   entailment       0.37      0.49      0.42      3368
      neutral       0.37      0.48      0.42      3219

    micro avg       0.37      0.37      0.37      9824
    macro avg       0.35      0.36      0.34      9824
 weighted avg       0.35      0.37      0.34      9824



### Features

In [25]:
import warnings
warnings.simplefilter("ignore", UserWarning)

from nltk import bigrams
from nltk.corpus import wordnet as wn
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

from rouge import Rouge
from jiwer import wer

In [26]:
def syn_overlap(prem_tokens, hyp_tokens):
    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB,
               'ADJ': wn.ADJ, 'ADV': wn.ADV}
    
    common_tokens = token_pos_intersect(prem_tokens, hyp_tokens)
    overlap_count = 0
    for pair in common_tokens:
        synsets = wn.synsets(pair[0][0], pos = pos_map.get(pair[0][1], None))
        if len(synsets) == 0:
            continue
        synonyms = sum([item.lemma_names()for item in synsets], [])
        if pair[1][0] in synonyms:
            overlap_count += 1
    return overlap_count, overlap_count/len(prem_tokens)


def token_pos_intersect(prem_tokens, hyp_tokens):
    res = []
    for i in prem_tokens:
        for j in hyp_tokens:
            if i[1] == j[1]:
                res.append((i, j))
    return list(set(res))


def token_pos_filter(tokens, pos):
    if pos == 'NOUN':
        return [token[0] for token in tokens if token[1] in ('PROPN', 'NOUN')]
    else:
        return [token[0] for token in tokens if token[1] == pos]

In [27]:
rouge = Rouge()

def get_features(prem, hyp):   
    if isinstance(prem, str):
        prem = nlp(prem)
    if isinstance(hyp, str):
        hyp = nlp(hyp)
        
    prem_lem_pos = [(t.lemma_, t.pos_) for t in prem if not t.is_punct]
    hyp_lem_pos = [(t.lemma_, t.pos_) for t in hyp if not t.is_punct]
    
    prem_lem = [t.lemma_ for t in prem if not t.is_punct]
    hyp_lem = [t.lemma_ for t in hyp if not t.is_punct]
    bi_prem_lem = list(bigrams(prem_lem))
    bi_hyp_lem = list(bigrams(hyp_lem))
    
    prem_lemma_txt = " ".join(prem_lem)
    hyp_lemma_txt = " ".join(hyp_lem)
    
    lemma_overlap = list(set(prem_lem).intersection(set(hyp_lem)))
    lemma_overlap_count = len(lemma_overlap)
    lemma_overlap_ratio = lemma_overlap_count/len(set(prem_lem))
    bigram_overlap = list(set(list(bi_prem_lem)).intersection(set(list(bi_hyp_lem))))
    
    syn_overlap_count, syn_overlap_ratio = syn_overlap(prem_lem_pos, hyp_lem_pos)
            
    prem_nouns = set(token_pos_filter(prem_lem_pos, 'NOUN'))
    noun_overlap = prem_nouns.intersection(set(token_pos_filter(hyp_lem_pos, 'NOUN')))
    noun_overlap_count = len(noun_overlap)
    noun_overlap_ratio = noun_overlap_count / len(prem_nouns) if len(prem_nouns) > 0 else 0
    
    prem_verbs = set(token_pos_filter(prem_lem_pos, 'VERB'))
    verb_overlap = prem_verbs.intersection(set(token_pos_filter(hyp_lem_pos, 'VERB')))
    verb_overlap_count = len(verb_overlap)
    verb_overlap_ratio = verb_overlap_count / len(prem_verbs) if len(prem_verbs) > 0 else 0
    
    prem_adjs = set(token_pos_filter(prem_lem_pos, 'ADJ'))
    adj_overlap = prem_adjs.intersection(set(token_pos_filter(hyp_lem_pos, 'ADJ')))
    adj_overlap_count = len(adj_overlap)
    adj_overlap_ratio = adj_overlap_count / len(prem_adjs) if len(prem_adjs) > 0 else 0
    
    prem_advs = set(token_pos_filter(prem_lem_pos, 'ADV'))
    adv_overlap = prem_advs.intersection(set(token_pos_filter(hyp_lem_pos, 'ADV')))
    adv_overlap_count = len(adv_overlap)
    adv_overlap_ratio = adv_overlap_count / len(prem_advs) if len(prem_advs) > 0 else 0
    
    rouge_scores = rouge.get_scores(hyps=hyp_lemma_txt, refs=prem_lemma_txt)[0]

    features = {
        # base features
        'prem_lem_len': len(prem_lem),
        'hyp_lem_len': len(hyp_lem),
        # overlap features
        'lemma_overlap_count': lemma_overlap_count,
        'lemma_overlap_ratio': lemma_overlap_ratio,
        'noun_overlap_count': noun_overlap_count,
        'noun_overlap_ratio': noun_overlap_ratio,
        'verb_overlap_count': verb_overlap_count,
        'verb_overlap_ratio': verb_overlap_ratio,
        'adj_overlap_count': adj_overlap_count,
        'adj_overlap_ratio': adj_overlap_ratio,
        'adv_overlap_count': adv_overlap_count,
        'adv_overlap_ratio': adv_overlap_ratio,
        # WordNet synonyms intersection
        'syn_overlap_count': syn_overlap_count,
        'syn_overlap_ratio': syn_overlap_ratio,
        # BLEU
        'bleu1': sentence_bleu([prem_lem], hyp_lem, weights=(1, 0, 0, 0)),
        'bleu2': sentence_bleu([prem_lem], hyp_lem, weights=(0, 1, 0, 0)),
        'bleu3': sentence_bleu([prem_lem], hyp_lem, weights=(0, 0, 1, 0)),
        'bleu4': sentence_bleu([prem_lem], hyp_lem, weights=(0, 0, 0, 1)),
        'bleu_cum': sentence_bleu([prem_lem], hyp_lem, weights=(0.25, 0.25, 0.25, 0.25)),
        # Rouge
        "rouge-1-f": rouge_scores.get('rouge-1', {}).get('f', 0),
        "rouge-2-f": rouge_scores.get('rouge-2', {}).get('f', 0),
        "rouge-l-f": rouge_scores.get('rouge-l', {}).get('f', 0),
        # Word accuracy
        "wacc": 1 - wer(prem_lemma_txt, hyp_lemma_txt),
        # METEOR
        "meteor": single_meteor_score(reference=prem_lemma_txt, hypothesis=hyp_lemma_txt),
    }
    # unigrams and bigrams for premise
    for gram in prem_lem:
        features[f'prem_{gram}'] = features.get(f'prem_{gram}', 0) + 1
    for gram in bi_prem_lem:
        features[f'prem_{gram[0]}_{gram[1]}'] = features.get(f'prem_{gram[0]}_{gram[1]}', 0) + 1
    # unigrams and bigrams for hypothesis
    for gram in hyp_lem:
        features[f'hyp_{gram}'] = features.get(f'hyp_{gram}', 0) + 1
    for gram in bi_hyp_lem:
        features[f'hyp_{gram[0]}_{gram[1]}'] = features.get(f'hyp_{gram[0]}_{gram[1]}', 0) + 1
    # unigrams and bigrams for overlaping
    for gram in lemma_overlap:
        features[f'common_{gram}'] = features.get(f'common_{gram}', 0) + 1
    for gram in bigram_overlap:
        features[f'common_{gram[0]}_{gram[1]}'] = features.get(f'common_{gram[0]}_{gram[1]}', 0) + 1

    return features

In [28]:
print(prem)
print(hyp)

get_features(prem, hyp)

Woman in red shirt walking back from bowling lane.
A woman is walking in the right-turn-only lane.


{'prem_lem_len': 9,
 'hyp_lem_len': 10,
 'lemma_overlap_count': 4,
 'lemma_overlap_ratio': 0.4444444444444444,
 'noun_overlap_count': 2,
 'noun_overlap_ratio': 0.5,
 'verb_overlap_count': 1,
 'verb_overlap_ratio': 1.0,
 'adj_overlap_count': 0,
 'adj_overlap_ratio': 0.0,
 'adv_overlap_count': 0,
 'adv_overlap_ratio': 0.0,
 'syn_overlap_count': 4,
 'syn_overlap_ratio': 0.4444444444444444,
 'bleu1': 0.4,
 'bleu2': 2.2250738585072626e-308,
 'bleu3': 2.2250738585072626e-308,
 'bleu4': 2.2250738585072626e-308,
 'bleu_cum': 1.4488496539373276e-231,
 'rouge-1-f': 0.42105262659279785,
 'rouge-2-f': 0.0,
 'rouge-l-f': 0.31578946869806096,
 'wacc': 0.11111111111111116,
 'meteor': 0.21978021978021975,
 'prem_woman': 1,
 'prem_in': 1,
 'prem_red': 1,
 'prem_shirt': 1,
 'prem_walk': 1,
 'prem_back': 1,
 'prem_from': 1,
 'prem_bowling': 1,
 'prem_lane': 1,
 'prem_woman_in': 1,
 'prem_in_red': 1,
 'prem_red_shirt': 1,
 'prem_shirt_walk': 1,
 'prem_walk_back': 1,
 'prem_back_from': 1,
 'prem_from_bowli

In [29]:
train_features, train_labels = [], []

for i, row in tqdm(data.iterrows()):
    train_features.append(get_features(row['premise'], row['hypothesis']))
    train_labels.append(row['target'])

549361it [17:03, 536.89it/s]


In [30]:
dev_features, dev_labels = [], []

for i, row in tqdm(dev_sample.iterrows()):
    dev_features.append(get_features(row['premise'], row['hypothesis']))
    dev_labels.append(row['target'])

9842it [00:19, 515.81it/s]


In [31]:
test_features, test_labels = [], []

for i, row in tqdm(test_sample.iterrows()):
    test_features.append(get_features(row['premise'], row['hypothesis']))
    test_labels.append(row['target'])

9824it [00:18, 523.99it/s]


### Modeling

In [32]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [33]:
RANDOM_STATE = 0

In [34]:
vec = DictVectorizer()

In [35]:
%%time

train_vec = vec.fit_transform(train_features)
dev_vec = vec.transform(dev_features)
test_vec = vec.transform(test_features)

print(f"Number of features: {len(vec.vocabulary_)}")

Number of features: 564659
CPU times: user 20 s, sys: 586 ms, total: 20.6 s
Wall time: 20.6 s


#### Logistic regression

In [36]:
lr_clf = LogisticRegression(penalty='l1', random_state=RANDOM_STATE)

lr_clf.fit(train_vec, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
pred = lr_clf.predict(dev_vec)

print(classification_report(pred, dev_labels))

               precision    recall  f1-score   support

contradiction       0.78      0.77      0.78      3303
   entailment       0.85      0.80      0.82      3555
      neutral       0.69      0.75      0.72      2984

    micro avg       0.78      0.78      0.78      9842
    macro avg       0.77      0.77      0.77      9842
 weighted avg       0.78      0.78      0.78      9842



In [38]:
pred = lr_clf.predict(test_vec)

print(classification_report(pred, test_labels))

               precision    recall  f1-score   support

contradiction       0.76      0.77      0.77      3217
   entailment       0.84      0.78      0.81      3616
      neutral       0.69      0.75      0.72      2991

    micro avg       0.77      0.77      0.77      9824
    macro avg       0.77      0.77      0.77      9824
 weighted avg       0.77      0.77      0.77      9824



In [39]:
import eli5
eli5.show_weights(lr_clf, vec=vec)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+6.845,hyp_nobody,
+5.737,hyp_noone,
+5.037,hyp_xbox,
+4.952,hyp_family_lose,
+4.869,hyp_a_career,
+4.868,hyp_Hitler,
+4.754,hyp_duet_to,
… 26095 more positive …,… 26095 more positive …,
… 24087 more negative …,… 24087 more negative …,
-4.810,hyp_be_ther,

Weight?,Feature
+6.845,hyp_nobody
+5.737,hyp_noone
+5.037,hyp_xbox
+4.952,hyp_family_lose
+4.869,hyp_a_career
+4.868,hyp_Hitler
+4.754,hyp_duet_to
… 26095 more positive …,… 26095 more positive …
… 24087 more negative …,… 24087 more negative …
-4.810,hyp_be_ther

Weight?,Feature
+9.257,hyp_coat_-PRON-
+8.963,hyp_red_white
+8.213,hyp_not_empty
+7.573,hyp_proximity
+7.546,hyp_turn_red
+7.522,hyp_not_naked
+7.391,hyp_collar_worker
+6.742,hyp_not_all
+6.421,hyp_after_bike
+5.907,hyp_not_alone

Weight?,Feature
+7.495,hyp_joyously
+5.726,hyp_tall_human
+5.066,hyp_slim_human
+4.816,hyp_huge_person
+4.301,hyp_large_human
+4.260,hyp_tall_person
+4.247,hyp_not_enough
+4.198,hyp_light_black
+4.186,hyp_funny_human
… 29016 more positive …,… 29016 more positive …


#### LigthGBM

In [40]:
from lightgbm import LGBMClassifier
from sklearn.decomposition import TruncatedSVD

In [48]:
N_COMP = 400

In [49]:
def lgb_fscore(y_true, y_pred):
    y_pred = y_pred.reshape(len(np.unique(y_true)), -1)
    y_pred = y_pred.argmax(axis=0)
    res = f1_score(y_true, y_pred, average='macro')
    return 'macro_f1', res, True

In [50]:
%%time

t_svd = TruncatedSVD(n_components=N_COMP)

train_trunc = t_svd.fit_transform(train_vec)
dev_trunc = t_svd.transform(dev_vec)
test_trunc = t_svd.transform(test_vec)

CPU times: user 8min 21s, sys: 18.5 s, total: 8min 39s
Wall time: 4min 48s


In [53]:
params = {
    'num_class': 3,
    'num_rounds': 10000,
    'max_depth': -1,
    'learning_rate': 0.01,
    'num_leaves': 31,
    'verbose': 100,
    'early_stopping_rounds': 300,
    'min_data_in_leaf': 30,
    'lambda_l2': 0.7,
    'feature_fraction': 0.7,
    'metric': 'custom',
    'random_state': RANDOM_STATE
}

lgb_clf = LGBMClassifier(**params)

In [54]:
lgb_clf.fit(
    X=train_trunc,
    y=train_labels,
    eval_set=[(dev_trunc, dev_labels)],
    verbose=params['verbose'],
    eval_metric=lgb_fscore,
)

Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.552371
[200]	valid_0's macro_f1: 0.563179
[300]	valid_0's macro_f1: 0.568266
[400]	valid_0's macro_f1: 0.574213
[500]	valid_0's macro_f1: 0.581709
[600]	valid_0's macro_f1: 0.5854
[700]	valid_0's macro_f1: 0.590595
[800]	valid_0's macro_f1: 0.594405
[900]	valid_0's macro_f1: 0.600819
[1000]	valid_0's macro_f1: 0.606922
[1100]	valid_0's macro_f1: 0.610817
[1200]	valid_0's macro_f1: 0.613504
[1300]	valid_0's macro_f1: 0.616452
[1400]	valid_0's macro_f1: 0.616313
[1500]	valid_0's macro_f1: 0.61826
[1600]	valid_0's macro_f1: 0.620515
[1700]	valid_0's macro_f1: 0.622336
[1800]	valid_0's macro_f1: 0.624225
[1900]	valid_0's macro_f1: 0.624635
[2000]	valid_0's macro_f1: 0.626886
[2100]	valid_0's macro_f1: 0.628148
[2200]	valid_0's macro_f1: 0.62941
[2300]	valid_0's macro_f1: 0.631259
[2400]	valid_0's macro_f1: 0.63311
[2500]	valid_0's macro_f1: 0.634187
[2600]	valid_0's macro_f1: 0.635844
[2700]	valid_0's

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        early_stopping_rounds=300, feature_fraction=0.7,
        importance_type='split', lambda_l2=0.7, learning_rate=0.01,
        max_depth=-1, metric='custom', min_child_samples=20,
        min_child_weight=0.001, min_data_in_leaf=30, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=3, num_leaves=31,
        num_rounds=10000, objective=None, random_state=0, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0, verbose=100)

In [55]:
pred = lgb_clf.predict(dev_trunc)

print(classification_report(pred, dev_labels))

               precision    recall  f1-score   support

contradiction       0.63      0.64      0.63      3249
   entailment       0.72      0.69      0.71      3464
      neutral       0.60      0.62      0.61      3129

    micro avg       0.65      0.65      0.65      9842
    macro avg       0.65      0.65      0.65      9842
 weighted avg       0.65      0.65      0.65      9842



In [56]:
pred = lgb_clf.predict(test_trunc)

print(classification_report(pred, test_labels))

               precision    recall  f1-score   support

contradiction       0.64      0.65      0.65      3228
   entailment       0.73      0.70      0.71      3524
      neutral       0.59      0.62      0.60      3072

    micro avg       0.66      0.66      0.66      9824
    macro avg       0.65      0.65      0.65      9824
 weighted avg       0.66      0.66      0.66      9824

