In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import fbeta_score,make_scorer

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import paired_distances
from scipy.sparse import hstack

In [3]:
# Read the three SNLI 1.0 splits (JSON-lines files) into one dict keyed by
# split name, then expose each split under its own notebook-level name.
full = {
    'train': pd.read_json('snli_1.0/snli_1.0_train.jsonl', lines=True),
    'val': pd.read_json('snli_1.0/snli_1.0_dev.jsonl', lines=True),
    'test': pd.read_json('snli_1.0/snli_1.0_test.jsonl', lines=True),
}
train, val, test = full['train'], full['val'], full['test']

In [4]:
# Peek at the first five training rows to see the available columns.
train.head(n=5)

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...


## EDA

In [5]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550152 entries, 0 to 550151
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   annotator_labels        550152 non-null  object
 1   captionID               550152 non-null  object
 2   gold_label              550152 non-null  object
 3   pairID                  550152 non-null  object
 4   sentence1               550152 non-null  object
 5   sentence1_binary_parse  550152 non-null  object
 6   sentence1_parse         550152 non-null  object
 7   sentence2               550152 non-null  object
 8   sentence2_binary_parse  550152 non-null  object
 9   sentence2_parse         550152 non-null  object
dtypes: object(10)
memory usage: 42.0+ MB


In [6]:
def label_distribution(data, frame):
    """Print how many rows of ``data`` carry each ``gold_label`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``gold_label`` column.
    frame : str
        Name of the split; printed in front of the counts.

    The counts are followed by one blank line so consecutive calls stay
    visually separated.
    """
    per_label = data.groupby('gold_label')['gold_label'].count()
    print(frame, per_label, end='\n\n')
    

# Report the label balance of every split (train / val / test).
for frame, frame_data in full.items():
    label_distribution(frame_data, frame)

train gold_label
-                   785
contradiction    183187
entailment       183416
neutral          182764
Name: gold_label, dtype: int64

val gold_label
-                 158
contradiction    3278
entailment       3329
neutral          3235
Name: gold_label, dtype: int64

test gold_label
-                 176
contradiction    3237
entailment       3368
neutral          3219
Name: gold_label, dtype: int64



In [7]:
def delete_unsure_labels(data):
    """Return ``data`` without the rows whose ``gold_label`` is ``'-'``.

    A ``'-'`` label marks pairs whose relation type is undetermined.
    The input frame is left untouched; a new frame is returned.
    """
    is_unsure = data.gold_label == '-'
    unsure_index = data.loc[is_unsure].index
    return data.drop(index=unsure_index)


# Replace every split stored in `full` with its filtered version
# (rows labelled '-' removed); the dict object itself is updated in place.
full.update({
    split_name: delete_unsure_labels(split_df)
    for split_name, split_df in full.items()
})

In [8]:
# Re-point the per-split names at the filtered frames held in `full`.
train, val, test = (full[split_name] for split_name in ('train', 'val', 'test'))

### TF-IDF

In [9]:
# One document per training pair: both sentences joined by a single space.
corpus = train['sentence1'] + " " + train['sentence2']

In [10]:
# Show the combined text for the row labelled 0 as a sanity check.
corpus.loc[0]

'A person on a horse jumps over a broken down airplane. A person is training his horse for a competition.'

In [11]:
# NLTK's English stop-word list, de-duplicated into a set.
stop = set(stopwords.words('english'))
# TfidfVectorizer documents `stop_words` as 'english', a list, or None.
# Recent scikit-learn releases validate the parameter type and reject a set,
# so hand over a (sorted, hence reproducible) list instead.
tfidf = TfidfVectorizer(stop_words=sorted(stop))
# Learn the vocabulary and IDF weights from the combined training sentences.
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [12]:
def tfidf_distance(data):
    """Build the feature matrix for a frame of sentence pairs.

    Both sentences are vectorised with the notebook-level ``tfidf``
    vectorizer. The result stacks, column-wise:

    * three row-wise distances between the two TF-IDF vectors
      (euclidean, manhattan, cosine, in that order), and
    * the element-wise difference ``sentence1 - sentence2`` of the vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``sentence1`` and ``sentence2`` text columns.

    Returns
    -------
    scipy sparse matrix
        One row per pair: 3 distance columns followed by the difference.
    """
    first_vecs = tfidf.transform(data.sentence1)
    second_vecs = tfidf.transform(data.sentence2)

    # One 1-D array of per-pair distances for each metric, in a fixed order.
    per_metric = [
        paired_distances(first_vecs, second_vecs, metric=metric_name)
        for metric_name in ('euclidean', 'manhattan', 'cosine')
    ]
    # Transpose so that each metric becomes a column.
    distance_columns = np.array(per_metric).T

    return hstack((distance_columns, first_vecs - second_vecs))

In [13]:
# Features come from the TF-IDF distance/difference transform; targets are
# the gold labels of the corresponding split.
X_train, X_val = (tfidf_distance(split_df) for split_df in (train, val))
y_train, y_val = train.gold_label, val.gold_label

### LogisticRegression

In [49]:
# NOTE: despite its name, `ftwo_scorer` is an F-beta scorer with beta=3,
# averaged over the three labels weighted by support.
ftwo_scorer = make_scorer(
    fbeta_score,
    beta=3,
    average='weighted',
    labels=['contradiction', 'neutral', 'entailment'],
)
clf = LogisticRegression(solver='saga', max_iter=200)
# 3-fold cross-validation on the training features; the scores are displayed.
cross_val_score(clf, X_train, y_train, scoring=ftwo_scorer, cv=3)

array([0.64774937, 0.64994823, 0.64645655])

In [134]:
# Fit the logistic regression on the full training split.
clf.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [1]:
# Validation outputs of the logistic regression: hard labels and the
# per-class probabilities they are derived from.
y_val_preds = clf.predict(X_val)
y_val_probas = clf.predict_proba(X_val)

NameError: name 'clf' is not defined

In [137]:
def print_metric(y_true, y_preds):
    """Print per-class precision, recall and F-beta (beta=3) plus a blended row.

    Parameters
    ----------
    y_true : array-like of str
        Ground-truth labels.
    y_preds : array-like of str
        Predicted labels, aligned with ``y_true``.

    The printed table has one row per label (contradiction, neutral,
    entailment) and a final ``Weighted*`` row that blends the per-class
    scores with fixed weights: 0.5 contradiction, 0.25 neutral,
    0.25 entailment. Nothing is returned.
    """
    labels = ['contradiction', 'neutral', 'entailment']
    # Score the arguments that were passed in. The previous version read the
    # notebook-level y_val / y_val_preds and silently ignored its parameters.
    prfs = precision_recall_fscore_support(
        y_true, y_preds, beta=3, average=None, labels=labels
    )
    matrix_report = pd.DataFrame(
        prfs, columns=labels, index=['precision', 'recall', 'fscore', 'support']
    )
    # Support counts are not reported; drop them without mutating in place.
    matrix_report = matrix_report.drop(index='support')
    matrix_report['Weighted*'] = (
        0.5 * matrix_report['contradiction']
        + 0.25 * matrix_report['neutral']
        + 0.25 * matrix_report['entailment']
    )
    print(matrix_report.T)

In [142]:
# Validation metrics for the plain (argmax) logistic-regression predictions.
print_metric(y_true=y_val, y_preds=y_val_preds)

               precision    recall    fscore
contradiction   0.661084  0.662294  0.662173
neutral         0.674443  0.607728  0.613800
entailment      0.688169  0.753079  0.746042
Weighted*       0.671195  0.671349  0.671047


In [151]:
# Favour recall on 'contradiction': any pair whose predicted contradiction
# probability exceeds the threshold is relabelled as 'contradiction'.
CONTRADICTION_THRESHOLD = 0.3
# predict_proba columns follow the order of clf.classes_, so look the column
# up by label instead of hard-coding index 0.
# NOTE(review): assumes y_val_probas was produced by `clf` — confirm when
# running cells out of order.
contradiction_col = list(clf.classes_).index('contradiction')
y_val_preds = np.where(
    y_val_probas[:, contradiction_col] > CONTRADICTION_THRESHOLD,
    'contradiction',
    y_val_preds,
)

In [152]:
# Validation metrics after the contradiction-probability override.
print_metric(y_true=y_val, y_preds=y_val_preds)

               precision    recall    fscore
contradiction   0.573234  0.774863  0.748534
neutral         0.721377  0.498609  0.514497
entailment      0.708976  0.676179  0.679322
Weighted*       0.644205  0.681128  0.672722


### GradientBoosting

In [129]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting over depth-1 trees, fitted on the training split with a
# fixed random seed.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=31,
)
gbc = gbc.fit(X_train, y_train)

NameError: name 'X_test' is not defined

In [153]:
# Validation predictions and metrics for the gradient-boosting model.
# No probability-threshold override is applied here.
y_val_probas = gbc.predict_proba(X_val)
y_val_preds = gbc.predict(X_val)
print_metric(y_val, y_val_preds)

               precision    recall    fscore
contradiction   0.629988  0.616534  0.617854
neutral         0.587302  0.548995  0.552600
entailment      0.643767  0.698108  0.692264
Weighted*       0.622761  0.620043  0.620143


### XGBoost

In [154]:
from xgboost import XGBClassifier

# XGBoost classifier with default hyper-parameters, fitted on the training split.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

               precision    recall    fscore
contradiction   0.629988  0.616534  0.617854
neutral         0.587302  0.548995  0.552600
entailment      0.643767  0.698108  0.692264
Weighted*       0.622761  0.620043  0.620143


In [157]:
# Validation predictions and metrics for the XGBoost model.
y_val_probas = xgb.predict_proba(X_val)
y_val_preds = xgb.predict(X_val)
print_metric(y_val, y_val_preds)

               precision    recall    fscore
contradiction   0.670187  0.677547  0.676804
neutral         0.614149  0.614529  0.614491
entailment      0.708295  0.700210  0.701010
Weighted*       0.665705  0.667458  0.667277
