In [2]:
import gensim

In [3]:
from scipy import spatial
from sklearn.metrics.pairwise import paired_distances
from scipy.sparse import hstack

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import fbeta_score,make_scorer

In [5]:
import nltk
# Make sure the WordNet corpus is available locally before building the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
# NOTE(review): `lemmatizer` is instantiated here but is not referenced by any
# of the cells visible in this notebook - confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer built from the pattern \w+ (runs of word characters);
# it is used by add_avg_vec to split the lower-cased sentences.
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
# Read the three SNLI splits (JSON Lines, one record per line) and keep them
# both in a dict keyed by split name and under individual names.
full = {
    'train': pd.read_json('snli_1.0_train.jsonl', lines=True),
    'val': pd.read_json('snli_1.0_dev.jsonl', lines=True),
    'test': pd.read_json('snli_1.0_test.jsonl', lines=True),
}
train, val, test = full['train'], full['val'], full['test']

In [32]:
def delete_unsure_labels(data):
    """Return a copy of ``data`` without the rows whose ``gold_label`` is ``'-'``.

    Rows are selected with a boolean mask instead of being dropped by index
    label, so a frame whose index contains repeated labels only loses the rows
    that actually carry the ``'-'`` label (dropping by label would also remove
    every other row sharing that label).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``gold_label`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the remaining rows with their original index
        labels. The input frame is left untouched.
    """
    keep = data.gold_label != '-'
    # Explicit copy: callers add columns to the result afterwards, which
    # should never write through to (or warn about) the original frame.
    return data.loc[keep].copy()


# Replace every split stored in `full` with its filtered version
# (rows labelled '-' removed by delete_unsure_labels).
for frame, split_data in full.items():
    full[frame] = delete_unsure_labels(split_data)

In [34]:
# Point the convenience names at the filtered frames held in `full`.
train, val, test = full['train'], full['val'], full['test']

Загружаем предобученную модель векторов слов из локального файла

In [7]:
# Load pretrained word vectors from a text-format (binary=False) word2vec file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'wiki-news-300d-1M.vec',
    binary=False,
)

Добавляем в данные столбцы с токенизированными предложениями (в нижнем регистре, без лемматизации) и со средними векторами слов каждого предложения

In [35]:
def avg_vec(words, model, num_features):
    """Average the vectors of the words that are present in ``model``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one sentence.
    model : mapping-like
        Object supporting ``word in model`` and ``model[word]``.
    num_features : int
        Length of the vectors; used for the zero vector that starts the sum.

    Returns
    -------
    numpy.ndarray
        Mean of the found word vectors, or a float32 zero vector of length
        ``num_features`` when none of the words is in ``model``.
    """
    total = np.zeros((num_features,), dtype='float32')
    known_words = [token for token in words if token in model]
    for token in known_words:
        total = total + model[token]
    if not known_words:
        # Nothing to average: return the untouched zero vector.
        return total
    return total / len(known_words)

In [36]:
def add_avg_vec(data, sent, model, num_features=None):
    """Add token and mean-vector columns for one sentence column, in place.

    Two columns are written to ``data``:

    * ``f'{sent}_lemmas'`` - the lower-cased sentence split by the module-level
      ``tokenizer``. Despite the column name, no lemmatization is applied.
    * ``f'{sent}_avgvec'`` - the result of ``avg_vec`` for those tokens.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the sentence column; it is modified in place.
    sent : str
        Name of the column with the raw sentence strings.
    model : gensim KeyedVectors
        Word-vector model passed through to ``avg_vec``.
    num_features : int, optional
        Dimensionality of the word vectors. Defaults to ``model.vector_size``
        so the function is not tied to 300-dimensional models.
    """
    if num_features is None:
        # Take the dimensionality from the model instead of hard-coding it.
        num_features = model.vector_size
    data[f'{sent}_lemmas'] = data[sent].apply(lambda text: tokenizer.tokenize(text.lower()))
    data[f'{sent}_avgvec'] = data[f'{sent}_lemmas'].apply(
        lambda tokens: avg_vec(tokens, model, num_features)
    )

In [37]:
# Enrich every split with token and mean-vector columns for both sentences.
# add_avg_vec assigns the new columns on the frame it receives, so the three
# frames are modified in place.
for data in [train, val, test]:
    for sent in ['sentence1', 'sentence2']:
        add_avg_vec(data, sent, model)

In [24]:
def count_distances(data):
    """Build distance features between the two sentence vectors of every row.

    Reads the ``sentence1_avgvec`` and ``sentence2_avgvec`` columns, stacks
    them into two 2-D arrays and computes the row-wise euclidean, manhattan
    and cosine distances between them.

    Returns
    -------
    numpy.ndarray
        One row per row of ``data`` and three columns, in the order
        euclidean, manhattan, cosine.
    """
    first = np.array(data['sentence1_avgvec'].tolist())
    second = np.array(data['sentence2_avgvec'].tolist())
    per_metric = [
        paired_distances(first, second, metric=metric_name)
        for metric_name in ('euclidean', 'manhattan', 'cosine')
    ]
    # Metrics were collected as rows; transpose so each metric is a column.
    return np.array(per_metric).T

In [38]:
# Distance features (X_*) and gold labels (y_*) for each split.
X_train, X_val, X_test = (count_distances(split) for split in (train, val, test))
y_train, y_val, y_test = (np.array(split['gold_label']) for split in (train, val, test))

### Обучение

In [40]:
# Fit a logistic regression on the three distance features.
clf = LogisticRegression(solver='saga', max_iter=100)
clf.fit(X_train, y_train)

# Predictions for both splits; y_val_preds is reused by the report cell below.
y_train_preds, y_val_preds = clf.predict(X_train), clf.predict(X_val)

# Per-class F-beta (beta=7); average=None returns one score per class.
f_train = fbeta_score(y_train, y_train_preds, average=None, beta=7)
f_val = fbeta_score(y_val, y_val_preds, average=None, beta=7)

print(f'train: {f_train}')
print(f'val: {f_val}')

train: [0.5664823  0.64554178 0.13013437]
val: [0.57963358 0.64230574 0.1359036 ]


In [39]:
# 3-fold cross-validation on the training split with a weighted F-beta scorer.
# NOTE(review): the scorer is named `ftwo_scorer` but uses beta=3, while the
# other cells score with beta=7 - confirm which beta is intended.
clf = LogisticRegression(solver='saga', max_iter=500)
ftwo_scorer = make_scorer(fbeta_score, average='weighted', beta=3)
cross_val_score(clf, X_train, y_train, scoring=ftwo_scorer, cv=3)

array([0.4381281 , 0.44300944, 0.44016212])

In [29]:
# Per-class precision / recall / F-beta (beta=7) / support on the validation
# split, using the predictions produced by the training cell (y_val_preds).
class_names = ['contradiction', 'neutral', 'entailment']
prfs = precision_recall_fscore_support(
    y_val, y_val_preds, beta=7, average=None, labels=class_names
)
matrix_report = pd.DataFrame(
    prfs,
    columns=class_names,
    index=['precision', 'recall', 'fscore', 'support'],
)
# Custom mix: contradiction counts 0.5, the other two classes 0.25 each.
# The same formula is applied to the 'support' row as well.
matrix_report['Weighted*'] = (
    0.5 * matrix_report['contradiction']
    + 0.25 * matrix_report['neutral']
    + 0.25 * matrix_report['entailment']
)
matrix_report.T

Unnamed: 0,precision,recall,fscore,support
contradiction,0.464008,0.582062,0.579115,3278.0
neutral,0.368421,0.134158,0.135886,3235.0
entailment,0.457113,0.646741,0.641419,3329.0
Weighted*,0.438387,0.486256,0.483884,3280.0
