Датасет:
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv

### Загружаем данные

In [None]:
!unzip archive.zip

Archive:  archive.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [None]:
import pandas as pd
# Load the token-level table; later cells rely on its 'Sentence #', 'Word',
# 'POS' and 'Tag' columns.
data = pd.read_csv('ner_dataset.csv', encoding= 'unicode_escape')
# Preview the first rows.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
# Number of rows per distinct value of the 'POS' column.
data['POS'].value_counts()

In [None]:
# Share of rows (as fractions) per distinct value of the 'Tag' column.
data['Tag'].value_counts(normalize=True)

O        0.846776
B-geo    0.035900
B-tim    0.019391
B-org    0.019210
I-per    0.016452
B-per    0.016203
I-org    0.016006
B-gpe    0.015135
I-geo    0.007071
I-tim    0.006226
B-art    0.000383
B-eve    0.000294
I-art    0.000283
I-eve    0.000241
B-nat    0.000192
I-gpe    0.000189
I-nat    0.000049
Name: Tag, dtype: float64

### Конвертируем данные в числовой вид

- {token} -> {token id}: построим embeddings
- {tag} -> {tag id}.

In [None]:
def get_dict_map(data, token_or_tag):
    """Build forward and reverse integer-id lookups for words or for tags.

    Args:
        data: DataFrame with 'Word' and 'Tag' columns.
        token_or_tag: 'token' selects the 'Word' column; any other value
            selects the 'Tag' column.

    Returns:
        (tok2idx, idx2tok): ``tok2idx`` maps every distinct column value to an
        integer id in ``0..n-1``; ``idx2tok`` is the inverse mapping.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates by hash/equality exactly like set(), but keeps
    # first-appearance order. Iterating a set of strings depends on the
    # per-process hash seed, so ids derived from it changed between kernel
    # restarts and made trained models / saved outputs non-reproducible.
    vocab = list(dict.fromkeys(data[column].to_list()))

    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    idx2tok = dict(enumerate(vocab))
    return tok2idx, idx2tok


# Id lookups (value -> id and id -> value) for the 'Word' and 'Tag' columns.
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [None]:
# Show the id -> tag lookup (a cell only displays its last expression, so the
# former bare `tag2idx` line above it produced no output).
idx2tag

{0: 'B-nat',
 1: 'O',
 2: 'B-per',
 3: 'I-geo',
 4: 'I-art',
 5: 'B-gpe',
 6: 'I-nat',
 7: 'B-geo',
 8: 'I-gpe',
 9: 'B-art',
 10: 'I-tim',
 11: 'I-eve',
 12: 'I-per',
 13: 'B-eve',
 14: 'I-org',
 15: 'B-org',
 16: 'B-tim'}

In [None]:
# Append the integer id of every word and tag as two new columns.
data = data.assign(
    Word_idx=data['Word'].map(token2idx),
    Tag_idx=data['Tag'].map(tag2idx),
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,19809,1
1,,of,IN,O,31466,1
2,,demonstrators,NNS,O,33809,1
3,,have,VBP,O,35043,1
4,,marched,VBN,O,27336,1


### Выполняем трансформацию датасета для получения данных в виде строк

In [None]:
# Forward-fill missing cells: 'Sentence #' is only set on the first row of a
# sentence, so this propagates the sentence label to the rest of its rows.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data_fillna = data.ffill(axis=0)
# Collapse every sentence into one row whose cells are lists of per-token values.
# Columns are selected with a list: selecting with a bare tuple of names
# (['Sentence #'])['Word', 'POS', ...] is rejected by pandas 2.x.
data_group = data_fillna.groupby(
    ['Sentence #'], as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(list)
# Preview the grouped frame.
data_group.head()

  


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[19809, 31466, 33809, 35043, 27336, 29489, 148...","[1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 7, 1, 1, ..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[25507, 4346, 27575, 28781, 3853, 28485, 10136...","[5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[33353, 9778, 13110, 5362, 26819, 8512, 6381, ...","[1, 1, 16, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 15..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[3469, 21759, 14035, 6589, 16702, 24405, 11326...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[21539, 15913, 3340, 3943, 22486, 34615, 11346...","[7, 1, 1, 2, 12, 1, 16, 1, 7, 1, 5, 1, 5, 1, 1..."


### Разбиваем данные на выборки

In [None]:
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
def get_pad_train_test_val(data_group, data):
    """Pad the encoded sentences and split them into train/validation/test sets.

    Relies on the notebook-level ``tag2idx`` mapping for the id of the "O" tag
    and for the number of tag classes.

    Args:
        data_group: per-sentence frame with list-valued 'Word_idx' and
            'Tag_idx' columns.
        data: token-level frame; the number of distinct values in its 'Word'
            column is the vocabulary size.

    Returns:
        (train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags).
        Token sets are padded int32 id sequences of length ``maxlen``; tag sets
        hold one one-hot matrix of shape (maxlen, n_tags) per sentence.
    """
    # Vocabulary size; real word ids are 0..n_token-1.
    n_token = len(set(data['Word'].to_list()))

    # Pad tokens (X) to the longest sentence. The pad id is n_token, one past
    # the last real word id. The previous value, n_token - 1, was itself a real
    # word id, so padding was indistinguishable from that word. The notebook's
    # embedding is sized as vocabulary + 1, so this extra index is valid.
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    print('maxlen = {}'.format(maxlen))
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token)

    # Pad tags (y) with the "O" tag, then one-hot encode every sentence.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Hold out 10% for test, then split the remaining 90% 75/25 into
    # train/validation. Fixed random_state keeps the split repeatable.
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Build the padded train / validation / test splits.
(
    train_tokens, val_tokens, test_tokens,
    train_tags, val_tags, test_tags,
) = get_pad_train_test_val(data_group, data)

maxlen = 104
train_tokens length: 32372 
train_tags length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [None]:
# (number of training sentences, padded sentence length)
train_tokens.shape

(32372, 104)

### Строим нейронную сеть

In [None]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [None]:
# Fix the NumPy and TensorFlow random states so experiments are reproducible
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [None]:
# Embedding vocabulary size: number of distinct words plus one extra index.
input_dim = len(set(data['Word'].to_list())) + 1
# Embedding width; the model also uses it as the LSTM unit count.
output_dim = 64
# Length of the longest sentence, in tokens.
input_length = max(len(s) for s in data_group['Word_idx'].tolist())
# Number of distinct tags, i.e. the size of the output layer.
n_tags = len(tag2idx)
print(
    'input_dim: ', input_dim,
    '\noutput_dim: ', output_dim,
    '\ninput_length: ', input_length,
    '\nn_tags: ', n_tags,
)

input_dim:  35179 
output_dim:  64 
input_length:  104 
n_tags:  17


In [None]:

def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> Dense tagging model.

    Reads the notebook-level ``input_dim``, ``output_dim``, ``input_length`` and
    ``n_tags`` settings. Prints the model summary as a side effect.

    Returns:
        The compiled ``Sequential`` model, which emits ``n_tags`` class
        probabilities for every position of the input sequence.
    """
    model = Sequential()

    # Embedding layer: word id -> dense vector of size output_dim.
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Bidirectional LSTM; forward and backward outputs are concatenated.
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

    # Second, unidirectional LSTM over the full sequence.
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Per-timestep classifier (many-to-many output). The activation must be
    # softmax: categorical_crossentropy expects a probability distribution per
    # token, whereas relu can emit all-zero or unbounded scores.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    # Compile with the default Adam optimizer.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
# Build the model (this call also prints its summary).
model_bilstm_lstm = get_bilstm_lstm_model()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_4 (Bidirectio  (None, 104, 128)         66048     
 nal)                                                            
                                                                 
 lstm_9 (LSTM)               (None, 104, 64)           49408     
                                                                 
 dense_4 (Dense)             (None, 104, 17)           1105      
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________


#### Обучение

In [None]:
def train_model(X, y, model, epochs=3, batch_size=128, validation_split=0.2, validation_data=None):
    """Fit ``model`` on ``(X, y)`` and return the training loss of every epoch.

    Args:
        X: model inputs, passed straight to ``model.fit``.
        y: training targets, passed straight to ``model.fit``.
        model: object exposing a Keras-style ``fit`` method.
        epochs: number of training epochs (default 3, the former fixed value).
        batch_size: mini-batch size (default 128).
        validation_split: fraction of the training data held out for
            validation; ignored when ``validation_data`` is given.
        validation_data: optional explicit validation set, e.g. ``(X_val, y_val)``.

    Returns:
        list of per-epoch training-loss values, in epoch order.
    """
    fit_kwargs = dict(batch_size=batch_size, verbose=1, epochs=epochs)
    if validation_data is not None:
        fit_kwargs['validation_data'] = validation_data
    else:
        fit_kwargs['validation_split'] = validation_split

    # One fit call over all epochs replaces the former loop of one-epoch fits.
    hist = model.fit(X, y, **fit_kwargs)
    # No 'loss' entry is recorded when zero epochs were run.
    return list(hist.history.get('loss', []))

In [None]:
# Build a fresh model, render its architecture diagram, then train it and
# store the per-epoch training loss in the results table.
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_3 (Bidirectio  (None, 104, 128)         66048     
 nal)                                                            
                                                                 
 lstm_7 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed_2 (TimeDis  (None, 104, 17)          1105      
 tributed)                                                       
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________


### Смотрим результат работы сети 

In [None]:
# Per-position class scores for every test sentence.
predict = model_bilstm_lstm.predict(test_tokens)

In [None]:
# Predicted tag id at every position of one test sentence.
num = 700
predict[num].argmax(axis=1)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Reference tag id at every position of the test sentence selected by `num`.
test_tags[num].argmax(axis=1)

array([7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])