# **Задание 1**

In [23]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [25]:
# предобработка текста
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
from IPython.display import Image
from IPython.core.display import HTML
import matplotlib.pyplot as plt
%matplotlib inline
from datasets import load_dataset
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('lenta_40k.csv.zip')

def preprocess(text):
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return tokens

vocab = Counter()

for text in data.text:
    vocab.update(preprocess(text))

filtered_vocab = set()

for word in vocab:
    if vocab[word] > 30:
        filtered_vocab.add(word)

word2id = {'PAD':0, 'UNK':1}

for word in filtered_vocab:
    word2id[word] = len(word2id)

id2word = {i:word for word, i in word2id.items()}

X = []

for text in data.text:
    tokens = preprocess(text)
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)

MAX_LEN = max(len(x) for x in X)

MEAN_LEN = np.median([len(x) for x in X])

MAX_LEN = int(MEAN_LEN + 30)

X = keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

id2label = {i:label for i, label in enumerate(set(data.topic.values))}
label2id = {l:i for i, l in id2label.items()}

y = keras.utils.to_categorical([label2id[label] for label in data.topic.values])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, stratify=y)

In [3]:
# callback-параметры

checkpoint = keras.callbacks.ModelCheckpoint('model.weights.h5',
                                                monitor='recall@precision',
                                                verbose=1,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                mode='max',
                                                save_freq='epoch'
                                               )

# для остановки
early_stop = keras.callbacks.EarlyStopping(monitor='val_rec@prec',
                                              min_delta=0.01,
                                              patience=5,
                                           # взяла побольше; пусть терпит,
                                           # потому что господь с ним еще не закончил
                                              verbose=1,
                                              mode='max',
                                              )

In [5]:
# модель 1
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=30)(inputs)

# 2 реккурентных слоя; во втором случае передается только последнее состояние
# RNN
rnn_LSTM = keras.layers.LSTM(128, return_sequences=True)(embeddings)
rnn_GPU = keras.layers.GRU(128, return_sequences=True)(rnn_LSTM)

# пулинг
ap_layer = keras.layers.GlobalAveragePooling1D()(rnn_GPU)
mp_layer = keras.layers.GlobalMaxPool1D()(rnn_GPU)
concat = keras.layers.concatenate([rnn_GPU[:,-1,:], ap_layer, mp_layer])

outputs = keras.layers.Dense(len(label2id), activation='softmax')(concat)

model1 = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model1.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=[keras.metrics.RecallAtPrecision(0.8, name='rec@prec')])

model1.summary()

In [6]:
# обучение модели 1
model1.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=200,
          epochs=10,
          callbacks=[checkpoint, early_stop])

Epoch 1/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 615ms/step - loss: 2.3422 - rec@prec: 0.0072 - val_loss: 1.4641 - val_rec@prec: 0.2723
Epoch 2/10


  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 582ms/step - loss: 1.2390 - rec@prec: 0.4088 - val_loss: 1.1577 - val_rec@prec: 0.4883
Epoch 3/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 571ms/step - loss: 0.7936 - rec@prec: 0.7043 - val_loss: 1.0517 - val_rec@prec: 0.5487
Epoch 4/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 583ms/step - loss: 0.5593 - rec@prec: 0.8538 - val_loss: 1.0750 - val_rec@prec: 0.5798
Epoch 5/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 574ms/step - loss: 0.4191 - rec@prec: 0.9187 - val_loss: 1.1507 - val_rec@prec: 0.5654
Epoch 6/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 570ms/step - loss: 0.3167 - rec@prec: 0.9558 - val_loss: 1.2930 - val_rec@prec: 0.5410
Epoch 7/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 569ms/step - loss: 0.2437 - rec@prec: 0.9728 - val_loss: 1.4728 - val_rec@prec: 0.5329
Epoch 8/10
[1m

<keras.src.callbacks.history.History at 0x7a03afd54e50>

In [17]:
# результаты модели 1

pred_model1 = model1.predict(X_valid)
print(classification_report(y_valid.argmax(1),
                             pred_model1.argmax(1),
                             zero_division=0))

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 331ms/step
              precision    recall  f1-score   support

           0       0.37      0.37      0.37        84
           1       0.82      0.68      0.74       239
           2       0.19      0.27      0.23        22
           3       0.70      0.71      0.71       410
           4       0.43      0.37      0.40       132
           5       0.00      0.00      0.00         2
           6       0.66      0.66      0.66       160
           7       0.40      0.29      0.33        21
           8       0.56      0.62      0.59        66
          10       0.32      0.30      0.31        60
          11       0.00      0.00      0.00         1
          13       0.65      0.77      0.70       481
          14       0.83      0.67      0.74       159
          15       0.00      0.00      0.00         4
          16       0.93      0.91      0.92       195
          17       0.59      0.60      0.60       159
     

In [12]:
# модель 2
inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=30)(inputs)

# 2 реккурентных слоя; во втором случае передается только последнее состояние
# RNN
rnn_LSTM_bi = keras.layers.Bidirectional(keras.layers.LSTM(128,
                                                        return_sequences=True))(embeddings)
rnn_GPU_bi = keras.layers.Bidirectional(keras.layers.GRU(128,
                                                         return_sequences=True))(rnn_LSTM_bi)

# пулинг
ap_layer = keras.layers.GlobalAveragePooling1D()(rnn_GPU_bi)
mp_layer = keras.layers.GlobalMaxPool1D()(rnn_GPU_bi)
concat = keras.layers.concatenate([rnn_GPU_bi[:,-1,:], ap_layer, mp_layer])

outputs = keras.layers.Dense(len(label2id), activation='softmax')(concat)

model2 = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model2.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=[keras.metrics.RecallAtPrecision(0.8, name='rec@prec')])

model2.summary()

In [13]:
# обучение модели 2
model2.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=200,
          epochs=10,
          callbacks=[checkpoint, early_stop])

Epoch 1/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 1s/step - loss: 2.1930 - rec@prec: 0.0338 - val_loss: 1.2964 - val_rec@prec: 0.3490
Epoch 2/10


  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 1s/step - loss: 1.0435 - rec@prec: 0.5415 - val_loss: 0.9854 - val_rec@prec: 0.6082
Epoch 3/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 1s/step - loss: 0.6395 - rec@prec: 0.8153 - val_loss: 0.9883 - val_rec@prec: 0.5902
Epoch 4/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 1s/step - loss: 0.4307 - rec@prec: 0.9110 - val_loss: 1.0234 - val_rec@prec: 0.6294
Epoch 5/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 1s/step - loss: 0.2733 - rec@prec: 0.9668 - val_loss: 1.1560 - val_rec@prec: 0.5974
Epoch 6/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 1s/step - loss: 0.1732 - rec@prec: 0.9866 - val_loss: 1.3268 - val_rec@prec: 0.5794
Epoch 7/10
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 1s/step - loss: 0.1149 - rec@prec: 0.9935 - val_loss: 1.5513 - val_rec@prec: 0.5816
Epoch 8/10
[1m211/211[0m [32m━

<keras.src.callbacks.history.History at 0x7a03ae2e0dd0>

In [14]:
# результаты модели 2

pred_model2 = model2.predict(X_valid)
print(classification_report(y_valid.argmax(1),
                             pred_model2.argmax(1),
                             zero_division=0))

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 579ms/step
              precision    recall  f1-score   support

           0       0.38      0.32      0.35        84
           1       0.81      0.78      0.79       239
           2       0.40      0.27      0.32        22
           3       0.70      0.71      0.70       410
           4       0.48      0.58      0.53       132
           5       0.00      0.00      0.00         2
           6       0.81      0.69      0.75       160
           7       0.20      0.29      0.24        21
           8       0.68      0.64      0.66        66
          10       0.50      0.28      0.36        60
          11       0.00      0.00      0.00         1
          13       0.69      0.74      0.71       481
          14       0.74      0.78      0.76       159
          15       1.00      0.25      0.40         4
          16       0.90      0.93      0.91       195
          17       0.72      0.65      0.68       159
     

# **Задание 2**

## Подготовка данных

In [27]:
dataset = load_dataset("tner/wikiann", 'ru')

The repository for tner/wikiann contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/tner/wikiann.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


test.jsonl:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/5.26M [00:00<?, ?B/s]

dev.jsonl:   0%|          | 0.00/2.63M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [44]:
# подготовка данных для обучения
vocab = Counter()

for sent in dataset['train']['tokens']:
    vocab.update([x.lower() for x in sent])

word2id = {'PAD':0, 'UNK':1}

for word in vocab:
    word2id[word] = len(word2id)

id2word = {i:word for word, i in word2id.items()}

X = []

for sent in dataset['train']['tokens']:
    tokens = [w.lower() for w in sent]
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)

X_test = []

for sent in dataset['test']['tokens']:
    tokens = [w.lower() for w in sent]
    ids = [word2id.get(token, 1) for token in tokens]
    X_test.append(ids)


MAX_LEN = max(len(x) for x in X)
# паддинг
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN, padding='post')
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_LEN, padding='post')

label2id = {
    "B-LOC": 0,
    "B-ORG": 1,
    "B-PER": 2,
    "I-LOC": 3,
    "I-ORG": 4,
    "I-PER": 5,
    "O": 6,
    "PAD": 7
}
id2labels = {v:k for k,v in label2id.items()}

y = keras.preprocessing.sequence.pad_sequences(dataset['train']['tags'], value=7,
                                                  maxlen=MAX_LEN,  padding='post')
y_test = keras.preprocessing.sequence.pad_sequences(dataset['test']['tags'], value=7,
                                                       maxlen=MAX_LEN,  padding='post')

X.shape, X_test.shape
y.shape, y_test.shape

((20000, 54), (10000, 54))

In [52]:
# secret tool we'll use later

import re

def tokenize(text, word2id):
    # токенизирует и переводит в индексы
    tokens = re.findall('\w+|[^\w\s]+', text)
    ids = [word2id.get(token.lower(), 1) for token in tokens]
    return tokens, ids

def pred2tags(pred, id2label, length):
    # декодирует индексы в части речи
    # length нужно чтобы откидывать паддинги или некорректные предсказания
    pred = pred.argmax(2)[0, :length]
    labels = [id2label[l] for l in pred]
    return labels

def label_seq(text, word2id, id2label, max_len, model):
    tokens, ids = tokenize(text, word2id)
    pred = model.predict(keras.preprocessing.sequence.pad_sequences([ids],
                                                                       maxlen=max_len,
                                                                       padding='post'))
    labels = pred2tags(pred, id2label, len(ids))

    return list(zip(tokens, labels))

## Модель 1

In [45]:
# модель wk1

inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs)

lstm_wk1 = keras.layers.LSTM(128, return_sequences=True)(embeddings)
gru_wk1 = keras.layers.GRU(128, return_sequences=True)(lstm_wk1)

outputs = keras.layers.Dense(len(label2id), activation='softmax')(gru_wk1)

model_wk1 = keras.Model(inputs=inputs, outputs=outputs)
model_wk1.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

model_wk1.summary()

In [46]:
model_wk1.fit(X, y,
              validation_data=(X_test, y_test),
              batch_size=1028,
              epochs=10,
              callbacks=[checkpoint, early_stop])

Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.7217 - loss: 1.4058 - val_accuracy: 0.8680 - val_loss: 0.4104
Epoch 2/10
[1m 1/20[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 160ms/step - accuracy: 0.8694 - loss: 0.4133

  self._save_model(epoch=epoch, batch=None, logs=logs)
  current = self.get_monitor_value(logs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.8694 - loss: 0.3478 - val_accuracy: 0.8689 - val_loss: 0.2936
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 240ms/step - accuracy: 0.8707 - loss: 0.2855 - val_accuracy: 0.8735 - val_loss: 0.2717
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 209ms/step - accuracy: 0.8974 - loss: 0.2616 - val_accuracy: 0.9258 - val_loss: 0.2502
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 213ms/step - accuracy: 0.9282 - loss: 0.2387 - val_accuracy: 0.9336 - val_loss: 0.2327
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 253ms/step - accuracy: 0.9351 - loss: 0.2199 - val_accuracy: 0.9366 - val_loss: 0.2204
Epoch 7/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 210ms/step - accuracy: 0.9384 - loss: 0.2035 - val_accuracy: 0.9384 - val_loss: 0.2105
Epoch 8/10
[1m20/20[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a03ad41b510>

In [62]:
# результаты модели wk1
pred_model_wk1 = model_wk1.predict(X_test).argmax(2)

print(classification_report(y_test.reshape(-1), pred_model_wk1.reshape(-1),
                            labels=list(id2labels.keys()),
                            target_names=list(id2labels.values()),
                            zero_division=0))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 114ms/step
              precision    recall  f1-score   support

       B-LOC       0.25      0.00      0.00      4560
       B-ORG       0.00      0.00      0.00      4074
       B-PER       0.00      0.00      0.00      3542
       I-LOC       0.00      0.00      0.00      3060
       I-ORG       0.33      0.35      0.34      8008
       I-PER       0.84      0.04      0.08      7544
           O       0.65      0.96      0.77     40480
         PAD       0.99      1.00      1.00    468732

    accuracy                           0.95    540000
   macro avg       0.38      0.29      0.27    540000
weighted avg       0.93      0.95      0.93    540000



### Предсказания модели 1

In [54]:
label_seq('Лада припарковалась возле торгового центра.', word2id, id2labels,
          MAX_LEN, model_wk1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step


[('Лада', 'O'),
 ('припарковалась', 'O'),
 ('возле', 'O'),
 ('торгового', 'O'),
 ('центра', 'O'),
 ('.', 'O')]

In [67]:
label_seq('Том Пушкина был поставлен на самое видное место', word2id, id2labels,
          MAX_LEN, model_wk1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


[('Том', 'O'),
 ('Пушкина', 'PAD'),
 ('был', 'PAD'),
 ('поставлен', 'O'),
 ('на', 'O'),
 ('самое', 'O'),
 ('видное', 'O'),
 ('место', 'O')]

In [56]:
label_seq('Вышка должна была столкнуться с рядом неприятностей.', word2id, id2labels,
          MAX_LEN, model_wk1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step


[('Вышка', 'O'),
 ('должна', 'O'),
 ('была', 'O'),
 ('столкнуться', 'O'),
 ('с', 'O'),
 ('рядом', 'O'),
 ('неприятностей', 'O'),
 ('.', 'O')]

Либо я что-то сделала не так, либо оно, эээ, не очень работает (точнее, вообще не работает).

## Модель 2

In [60]:
# модель wk2

inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs)

lstm_wk1 = keras.layers.LSTM(128, return_sequences=True)(embeddings)
gru_wk1 = keras.layers.GRU(128, return_sequences=True)(lstm_wk1)


lstm_wk2_1 = keras.layers.LSTM(100, return_sequences=True)(embeddings)
lstm_wk2_2 = keras.layers.LSTM(100, return_sequences=True)(lstm_wk2_1)
lstm_wk2_12 = keras.layers.concatenate([lstm_wk2_2, embeddings])
lstm_wk2_3 = keras.layers.LSTM(100, return_sequences=True)(lstm_wk2_12)
lstm_wk2_123 = keras.layers.concatenate([lstm_wk2_3, embeddings])

outputs = keras.layers.Dense(len(label2id), activation='softmax')(gru_wk1)

model_wk2 = keras.Model(inputs=inputs, outputs=outputs)
model_wk2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

model_wk2.summary()

In [61]:
model_wk2.fit(X, y,
              validation_data=(X_test, y_test),
              batch_size=1028,
              epochs=10,
              callbacks=[checkpoint, early_stop])

Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 210ms/step - accuracy: 0.7213 - loss: 1.3410 - val_accuracy: 0.8680 - val_loss: 0.3972
Epoch 2/10
[1m 1/20[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 166ms/step - accuracy: 0.8682 - loss: 0.3879

  self._save_model(epoch=epoch, batch=None, logs=logs)
  current = self.get_monitor_value(logs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 206ms/step - accuracy: 0.8697 - loss: 0.3412 - val_accuracy: 0.8689 - val_loss: 0.2925
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 236ms/step - accuracy: 0.8702 - loss: 0.2852 - val_accuracy: 0.8995 - val_loss: 0.2669
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 206ms/step - accuracy: 0.9104 - loss: 0.2576 - val_accuracy: 0.9318 - val_loss: 0.2439
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 216ms/step - accuracy: 0.9316 - loss: 0.2352 - val_accuracy: 0.9348 - val_loss: 0.2278
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 231ms/step - accuracy: 0.9365 - loss: 0.2164 - val_accuracy: 0.9360 - val_loss: 0.2163
Epoch 7/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.9386 - loss: 0.2027 - val_accuracy: 0.9383 - val_loss: 0.2079
Epoch 8/10
[1m20/20[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a02d137ccd0>

In [63]:
# результаты модели wk1
pred_model_wk2 = model_wk2.predict(X_test).argmax(2)

print(classification_report(y_test.reshape(-1), pred_model_wk2.reshape(-1),
                            labels=list(id2labels.keys()),
                            target_names=list(id2labels.values()),
                            zero_division=0))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 83ms/step
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00      4560
       B-ORG       0.00      0.00      0.00      4074
       B-PER       0.00      0.00      0.00      3542
       I-LOC       0.00      0.00      0.00      3060
       I-ORG       0.34      0.38      0.36      8008
       I-PER       0.59      0.01      0.01      7544
           O       0.63      0.97      0.77     40480
         PAD       1.00      1.00      1.00    468732

    accuracy                           0.95    540000
   macro avg       0.32      0.29      0.27    540000
weighted avg       0.93      0.95      0.93    540000



### Предсказания модели 2

In [65]:
label_seq('Лада припарковалась возле торгового центра.', word2id, id2labels,
          MAX_LEN, model_wk2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step


[('Лада', 'O'),
 ('припарковалась', 'O'),
 ('возле', 'O'),
 ('торгового', 'O'),
 ('центра', 'O'),
 ('.', 'O')]

In [66]:
label_seq('Том Пушкина был поставлен на самое видное место', word2id, id2labels,
          MAX_LEN, model_wk2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step


[('Том', 'O'),
 ('Пушкина', 'PAD'),
 ('был', 'PAD'),
 ('поставлен', 'PAD'),
 ('на', 'O'),
 ('самое', 'O'),
 ('видное', 'PAD'),
 ('место', 'O')]

In [68]:
label_seq('Вышка должна была столкнуться с рядом неприятностей.', word2id, id2labels,
          MAX_LEN, model_wk1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step


[('Вышка', 'O'),
 ('должна', 'O'),
 ('была', 'O'),
 ('столкнуться', 'O'),
 ('с', 'O'),
 ('рядом', 'O'),
 ('неприятностей', 'O'),
 ('.', 'O')]

Ну ээээ... Тоже так себе определяет

## Модель 3

In [74]:
# модель wk3

inputs = keras.layers.Input(shape=(MAX_LEN,))
embeddings = keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs)

lstm_wk3 = keras.layers.LSTM(128, return_sequences=True)(embeddings)
gru_wk3 = keras.layers.GRU(128, return_sequences=True)(lstm_wk3)

conv1_wk3 = keras.layers.Conv1D(kernel_size=2, filters=128, strides=1, activation='relu',
                                padding='same')(lstm_wk3)
conv2_wk3 = keras.layers.Conv1D(kernel_size=2, filters=128, strides=1, activation='relu',
                                padding='same')(conv1_wk3)


outputs = keras.layers.Dense(len(label2id), activation='softmax')(conv2_wk3)

model_wk3 = keras.Model(inputs=inputs, outputs=outputs)
model_wk3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

model_wk3.summary()

In [75]:
model_wk3.fit(X, y,
              validation_data=(X_test, y_test),
              batch_size=1028,
              epochs=10,
              callbacks=[checkpoint, early_stop])

Epoch 1/10


  outputs = tnn.conv1d(


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.7337 - loss: 1.5487 - val_accuracy: 0.8680 - val_loss: 0.3818
Epoch 2/10
[1m 2/20[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 83ms/step - accuracy: 0.8688 - loss: 0.3661

  self._save_model(epoch=epoch, batch=None, logs=logs)
  current = self.get_monitor_value(logs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 110ms/step - accuracy: 0.8896 - loss: 0.3194 - val_accuracy: 0.9257 - val_loss: 0.2537
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 108ms/step - accuracy: 0.9301 - loss: 0.2364 - val_accuracy: 0.9371 - val_loss: 0.2045
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 123ms/step - accuracy: 0.9392 - loss: 0.1944 - val_accuracy: 0.9406 - val_loss: 0.1801
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 125ms/step - accuracy: 0.9412 - loss: 0.1730 - val_accuracy: 0.9419 - val_loss: 0.1639
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 107ms/step - accuracy: 0.9428 - loss: 0.1525 - val_accuracy: 0.9481 - val_loss: 0.1489
Epoch 7/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 107ms/step - accuracy: 0.9558 - loss: 0.1300 - val_accuracy: 0.9584 - val_loss: 0.1298
Epoch 8/10
[1m20/20[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a02d04f8a10>

In [76]:
# результаты модели wk3
pred_model_wk3 = model_wk3.predict(X_test).argmax(2)

print(classification_report(y_test.reshape(-1), pred_model_wk3.reshape(-1),
                            labels=list(id2labels.keys()),
                            target_names=list(id2labels.values()),
                            zero_division=0))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step
              precision    recall  f1-score   support

       B-LOC       0.63      0.44      0.52      4560
       B-ORG       0.37      0.45      0.41      4074
       B-PER       0.70      0.60      0.65      3542
       I-LOC       0.69      0.40      0.50      3060
       I-ORG       0.64      0.70      0.67      8008
       I-PER       0.88      0.70      0.78      7544
           O       0.87      0.93      0.90     40480
         PAD       1.00      1.00      1.00    468732

    accuracy                           0.97    540000
   macro avg       0.72      0.65      0.68    540000
weighted avg       0.97      0.97      0.97    540000



### Предсказания модели 3

In [77]:
label_seq('Лада припарковалась возле торгового центра.', word2id, id2labels,
          MAX_LEN, model_wk3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step


[('Лада', 'I-ORG'),
 ('припарковалась', 'I-ORG'),
 ('возле', 'B-ORG'),
 ('торгового', 'I-ORG'),
 ('центра', 'I-ORG'),
 ('.', 'O')]

In [78]:
label_seq('Том Пушкина был поставлен на самое видное место', word2id, id2labels,
          MAX_LEN, model_wk3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step


[('Том', 'I-PER'),
 ('Пушкина', 'O'),
 ('был', 'O'),
 ('поставлен', 'O'),
 ('на', 'O'),
 ('самое', 'O'),
 ('видное', 'I-LOC'),
 ('место', 'O')]

In [80]:
label_seq('Вышка должна была столкнуться с рядом неприятностей.', word2id, id2labels,
          MAX_LEN, model_wk3)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step


[('Вышка', 'O'),
 ('должна', 'O'),
 ('была', 'O'),
 ('столкнуться', 'O'),
 ('с', 'O'),
 ('рядом', 'O'),
 ('неприятностей', 'O'),
 ('.', 'O')]

В целом и по метрикам, и по итоговым результатам model_wk3 лучше (она хотя бы что-то определила). Однако ряд слов, которые вообще ни по каким признаком не могут являться NE, определяются как NE (например, "припарковалась"). Удивило, что "Том" определилось как персона, хотя я была уверена, что модель справится с разграничением по смыслу.
В целом проблема либо в выбранных мною архитектурах (хотя со сверточными слоями обучение произошло лучше), либо в самом датасете (нужно бы его поизучать детальнее, чтобы понять, какие могут быть проблемы).