In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import fbeta_score,make_scorer

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords


In [16]:
train = pd.read_json('snli_1.0/snli_1.0_train.jsonl', lines = True)
test = pd.read_json('snli_1.0/snli_1.0_test.jsonl', lines = True)
val = pd.read_json('snli_1.0/snli_1.0_dev.jsonl', lines = True)
full = {'train': train, 'val':val, 'test': test}

In [8]:
train.head()

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550152 entries, 0 to 550151
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   annotator_labels        550152 non-null  object
 1   captionID               550152 non-null  object
 2   gold_label              550152 non-null  object
 3   pairID                  550152 non-null  object
 4   sentence1               550152 non-null  object
 5   sentence1_binary_parse  550152 non-null  object
 6   sentence1_parse         550152 non-null  object
 7   sentence2               550152 non-null  object
 8   sentence2_binary_parse  550152 non-null  object
 9   sentence2_parse         550152 non-null  object
dtypes: object(10)
memory usage: 42.0+ MB


In [65]:
def label_distribution(data, frame):
    print(frame, data.groupby('gold_label')['gold_label'].count())
    print()
    

for frame in full:
    label_distribution(full[frame], frame)

train gold_label
contradiction    183187
entailment       183416
neutral          182764
Name: gold_label, dtype: int64

val gold_label
contradiction    3278
entailment       3329
neutral          3235
Name: gold_label, dtype: int64

test gold_label
contradiction    3237
entailment       3368
neutral          3219
Name: gold_label, dtype: int64



In [61]:
def delete_unsure_labels(data):
    data = data.drop(index = data[data.gold_label == '-'].index)
    return data


for frame in full:
    full[frame] = delete_unsure_labels(full[frame])

In [67]:
train = full['train']
val = full['val']
test = full['test']

In [68]:
corpus = train.sentence1 +" "+ train.sentence2

In [69]:
corpus[0]

'A person on a horse jumps over a broken down airplane. A person is training his horse for a competition.'

In [70]:
stop = set(stopwords.words('english'))
tfidf = TfidfVectorizer(stop_words=stop)
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [71]:
X_train_sentence1 = tfidf.transform(train.sentence1)
X_train_sentence2 = tfidf.transform(train.sentence2)

In [72]:
#разность между предложениями
X_train_distance = X_train_sentence1 - X_train_sentence2
y_train = train.gold_label

In [73]:
X_val_sentence1 = tfidf.transform(val.sentence1)
X_val_sentence2 = tfidf.transform(val.sentence2)
#разность между предложениями
X_val_distance = X_val_sentence1 - X_val_sentence2
y_val = val.gold_label

In [99]:
clf = LogisticRegression(max_iter=500, solver = 'saga')
ftwo_scorer = make_scorer(fbeta_score, beta=3, average='weighted')
cross_val_score(clf, X_train_distance , y_train, cv=3, scoring= ftwo_scorer)

array([0.61596138, 0.6171218 , 0.6146658 ])

In [101]:
clf.fit(X_train_distance , y_train)
y_val_preds = clf.predict(X_val_distance)

In [116]:
fbeta_score(y_val, y_val_preds, beta=7, average=None,)

array([0.5934564 , 0.69128567, 0.61256075])

In [119]:
prfs = precision_recall_fscore_support(y_val, y_val_preds, beta=7, average=None, labels=['contradiction', 'neutral', 'entailment'])
matrix_report = pd.DataFrame(prfs, columns = ['contradiction', 'neutral', 'entailment'], index = ['precision', 'recall', 'fscore', 'support'])
matrix_report['Weighted*'] = 0.5 * matrix_report['contradiction'] +  0.25 * matrix_report['neutral'] +  0.25 * matrix_report['entailment']
matrix_report.T

Unnamed: 0,precision,recall,fscore,support
contradiction,0.630844,0.592739,0.593456,3278.0
neutral,0.655298,0.611747,0.612561,3235.0
entailment,0.616515,0.693001,0.691286,3329.0
Weighted*,0.633375,0.622557,0.62269,3280.0
