In [1]:
import gensim

In [2]:
from scipy import spatial

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import fbeta_score,make_scorer

In [4]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

Скачиваем модель

In [6]:
model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec', binary=False)

In [7]:
train = pd.read_json('snli_1.0_train.jsonl', lines = True)
test = pd.read_json('snli_1.0_test.jsonl', lines = True)
val = pd.read_json('snli_1.0_dev.jsonl', lines = True)
full = {'train': train, 'val':val, 'test': test}

In [8]:
def delete_unsure_labels(data):
    data = data.drop(index = data[data.gold_label == '-'].index)
    return data


for frame in full:
    full[frame] = delete_unsure_labels(full[frame])

In [9]:
train = full['train']
val = full['val']
test = full['test']

Добавляем в данные столбцы с лемматизированными предложениями и со средними векторами каждого предложения

In [11]:
def avg_vec(words, model, num_features):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in model:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [12]:
def add_avg_vec(data, sent, model):
    data[f'{sent}_lemmas'] = data[sent].apply(lambda x: tokenizer.tokenize(x.lower()))
    data[f'{sent}_avgvec'] = data[f'{sent}_lemmas'].apply(lambda x: avg_vec(x, model, 300))

In [13]:
for data in [train, val, test]:
    for sent in ['sentence1', 'sentence2']:
        add_avg_vec(data, sent, model)

Считаем схожесть двух предложений (одно число)

In [14]:
def count_similarity(data):
    data['similarity'] = data[['sentence1_avgvec', 'sentence2_avgvec']].apply(lambda x: 1 - spatial.distance.cosine(x['sentence1_avgvec'], x['sentence2_avgvec']), axis=1)

Вычитаем один вектор из другого (получаем вектор)

In [15]:
def count_distance(data):
    data['distance'] = data[['sentence1_avgvec', 'sentence2_avgvec']].apply(lambda x: x['sentence1_avgvec'] - x['sentence2_avgvec'], axis=1)

Можно добавить еще какие-нибудь метрики

In [16]:
for data in [train, val, test]:
    count_distance(data)

In [17]:
for data in [train, val, test]:
    count_similarity(data)

  dist = 1.0 - uv / np.sqrt(uu * vv)


### Будем обучать на расстоянии

In [20]:
X_train = np.array(train['distance'].tolist())
y_train = np.array(train['gold_label'])
X_val = np.array(val['distance'].tolist())
y_val = np.array(val['gold_label'])
X_test = np.array(test['distance'].tolist())
y_test = np.array(test['gold_label'])

### Будем обучать на схожести

In [84]:
X_train = train['similarity'].values.reshape(-1,1)
y_train = np.array(train['gold_label'])
X_val = val['similarity'].values.reshape(-1,1)
y_val = np.array(val['gold_label'])
X_test = test['similarity'].values.reshape(-1,1)
y_test = np.array(test['gold_label'])

[[0.90185082]
 [0.84060919]
 [0.91352427]
 ...
 [0.95085871]
 [0.88993967]
 [0.98977244]]
['neutral' 'contradiction' 'entailment' ... 'neutral' 'contradiction'
 'entailment']


В тренировочных данных появились откуда-то пропущенные значения, я не поняла, откуда, поэтому просто удалила их

In [86]:
indeces = np.argwhere(np.isnan(X_train))
X_train = np.delete(X_train, indeces).reshape(-1,1)
y_train = np.delete(y_train, indeces)
X_train

array([[0.84060919],
       [0.91352427],
       [0.83683312],
       ...,
       [0.95085871],
       [0.88993967],
       [0.98977244]])

In [22]:
clf = LogisticRegression(max_iter=100, solver = 'saga')
clf.fit(X_train, y_train)
y_train_preds = clf.predict(X_train)
f_train = fbeta_score(y_train, y_train_preds, beta=7, average=None,)
y_val_preds = clf.predict(X_val)
f_val = fbeta_score(y_val, y_val_preds, beta=7, average=None,)
print(f'train: {f_train}')
print(f'val: {f_val}')



train: [0.53739805 0.62706275 0.56522514]
val: [0.52235116 0.62097606 0.57441872]


In [90]:
clf = LogisticRegression(max_iter=500, solver = 'saga')
ftwo_scorer = make_scorer(fbeta_score, beta=3, average='weighted')
cross_val_score(clf, X_train, y_train, cv=3, scoring= ftwo_scorer)

array([0.39209549, 0.39414138, 0.39151485])

In [23]:
prfs = precision_recall_fscore_support(y_val, y_val_preds, beta=7, average=None, labels=['contradiction', 'neutral', 'entailment'])
matrix_report = pd.DataFrame(prfs, columns = ['contradiction', 'neutral', 'entailment'], index = ['precision', 'recall', 'fscore', 'support'])
matrix_report['Weighted*'] = 0.5 * matrix_report['contradiction'] +  0.25 * matrix_report['neutral'] +  0.25 * matrix_report['entailment']
matrix_report.T

Unnamed: 0,precision,recall,fscore,support
contradiction,0.564128,0.521965,0.522746,3278.0
neutral,0.58208,0.574343,0.574496,3235.0
entailment,0.572021,0.621508,0.620434,3329.0
Weighted*,0.570589,0.559945,0.560106,3280.0
