# Загрузка библиотек

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer

from razdel import tokenize, sentenize
import re
import string
from rnnmorph.predictor import RNNMorphPredictor
import pymorphy2
import spacy
import pymystem3
"""
pyemestem3 работает быстрее при обработке
большого текста в моменте, по сравнению с
pymorphy2. Но если запускать pymystem3
много раз на кусках текста, то работа
будет медленной
"""

nltk.download('punkt')
nltk.download('stopwords')

# in terminal:
# python -m spacy download en_core_web_sm

2024-04-21 16:12:03.695467: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-21 16:12:03.695859: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-21 16:12:03.698337: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-21 16:12:03.730654: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/vitalii/nltk_data...
[nltk_

True

# Примеры

In [2]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens, type(tokens))
print(sent_tokenize("I was going home when whe runs. It was a surprise."))

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play'] <class 'list'>
['I was going home when whe runs.', 'It was a surprise.']


In [3]:
# razdel
text = "Кружка-термос на 0.5л (50/64 см², 516;...)"
list(tokenize(text))

[Substring(0, 13, 'Кружка-термос'),
 Substring(14, 16, 'на'),
 Substring(17, 20, '0.5'),
 Substring(20, 21, 'л'),
 Substring(22, 23, '('),
 Substring(23, 28, '50/64'),
 Substring(29, 32, 'см²'),
 Substring(32, 33, ','),
 Substring(34, 37, '516'),
 Substring(37, 38, ';'),
 Substring(38, 41, '...'),
 Substring(41, 42, ')')]

In [4]:
# nltk
tokens = word_tokenize(text.lower())
print(tokens)

['кружка-термос', 'на', '0.5л', '(', '50/64', 'см²', ',', '516', ';', '...', ')']


### N-gramm

In [5]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
unigram = list(nltk.ngrams(tokens, 1))
bigram = list(nltk.ngrams(tokens, 2))
trigram = list(nltk.ngrams(tokens, 3))
print(unigram[:5])
print(bigram[:5])
print(trigram[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]
[('all', 'work', 'and'), ('work', 'and', 'no'), ('and', 'no', 'play'), ('no', 'play', 'makes'), ('play', 'makes', 'jack')]


### Стоп-слова

In [6]:
stop_words = set(stopwords.words('russian'))
print(stop_words)
print(len(stop_words))

{'то', 'чего', 'иногда', 'где', 'совсем', 'хорошо', 'как', 'чем', 'под', 'три', 'все', 'в', 'если', 'при', 'за', 'себя', 'ей', 'такой', 'на', 'всегда', 'тут', 'ж', 'мой', 'что', 'так', 'этот', 'эту', 'со', 'куда', 'тебя', 'можно', 'его', 'был', 'ну', 'всю', 'после', 'кто', 'им', 'они', 'моя', 'ее', 'есть', 'их', 'бы', 'эти', 'этом', 'даже', 'теперь', 'об', 'но', 'ни', 'нибудь', 'не', 'да', 'было', 'себе', 'ней', 'тем', 'чтобы', 'же', 'до', 'впрочем', 'у', 'быть', 'были', 'более', 'ты', 'свою', 'много', 'этой', 'никогда', 'чуть', 'уже', 'будто', 'тоже', 'всех', 'там', 'раз', 'потом', 'надо', 'хоть', 'сам', 'том', 'здесь', 'вы', 'только', 'уж', 'нет', 'он', 'всего', 'вам', 'без', 'между', 'к', 'разве', 'нельзя', 'по', 'меня', 'для', 'ему', 'нее', 'из', 'конечно', 'потому', 'опять', 'сейчас', 'через', 'во', 'я', 'вот', 'мы', 'чтоб', 'ним', 'больше', 'лучше', 'еще', 'была', 'этого', 'один', 'или', 'другой', 'над', 'может', 'тот', 'ничего', 'какой', 'перед', 'вдруг', 'зачем', 'ли', 'а', 'и'

In [7]:
stop_words = set(stopwords.words('english'))
print(stop_words)
print(len(stop_words))

{'who', 'do', 'her', 'own', 'are', 'down', 't', "won't", 'some', 'was', 'all', 'can', 'does', 'out', 'under', 'ma', 'over', 'through', 'the', 'don', 'too', 'that', 'doing', 'whom', 'll', 'to', 'those', 'i', 'no', 'above', "weren't", 'isn', 'themselves', "you've", "wasn't", 'won', 've', 'him', 'its', 'again', 'in', "she's", 'both', 'his', 'but', 'ain', 'himself', 'hadn', 'needn', "it's", 'aren', "shan't", 'couldn', 'than', 'd', 'where', 'during', 'have', "should've", 'until', 'an', 're', 'now', 'it', 'they', 'only', 'shan', 'about', "you'd", 'should', 'been', 'when', 'these', 'yourself', 'didn', 'had', 'and', 'is', 'which', 'did', 'yourselves', 'further', 'below', "wouldn't", 'or', 'from', 'why', "you'll", "doesn't", 'myself', 'once', 'weren', 'with', "needn't", 'same', "mustn't", 'as', 'ourselves', 'itself', 'on', 'your', "aren't", 'mightn', 'at', 'their', 'hasn', 'if', 'because', 'doesn', 'we', 'hers', 'not', 'for', 'haven', 'each', 'he', 'you', 'there', 'between', 'ours', 'theirs', "

In [8]:
print([word for word in tokens if word not in stop_words])

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


In [9]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### Стемминг и лемматизация

In [10]:
words_en = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ["корова", "мальчики", "мужчины", "столом", "убежала"]

In [11]:
ps = PorterStemmer()
print(list(map(ps.stem, words_en)))
print(list(map(ps.stem, words_ru)))  # Старый метод не работает с русским языком

['game', 'game', 'game', 'game', 'compact']
['корова', 'мальчики', 'мужчины', 'столом', 'убежала']


In [12]:
ss = SnowballStemmer(language="english")
print(list(map(ss.stem, words_en)))
ss = SnowballStemmer(language="russian")
print(list(map(ss.stem, words_ru)))

['game', 'game', 'game', 'game', 'compact']
['коров', 'мальчик', 'мужчин', 'стол', 'убежа']


In [13]:
text_en = """DENNIS: Listen, strange women lying in ponds 
distributing swords is no basis for a system of
government. Supreme executive power derives from a
mandate from the masses, not from some farcical aquatic ceremony.
"""

text_ru = """Не существует научных доказательств в пользу
эффективности НЛП, оно призвано псевдонаукой. Систематические
обзоры указывают, что НЛП основано на устаревших
представлениях об устройстве мозга, несовместимо с современной
неврологией и содержит ряд фактических ошибок.
"""

In [14]:
morph = pymorphy2.MorphAnalyzer()
pymorphy_results = list(map(lambda x: morph.parse(x), text_ru.split(' ')))
print(' '.join([res[0].normal_form for res in pymorphy_results]))

не существовать научный доказательство в пользу
эффективность нлп, оно призвать псевдонаукой. систематические
обзор указывают, что нлп основать на устаревших
представление о устройство мозга, несовместимый с современной
неврология и содержать ряд фактический ошибок.



##### Разборы слов

In [15]:
pymorphy_results[0]

[Parse(word='не', tag=OpencorporaTag('PRCL'), normal_form='не', score=1.0, methods_stack=((DictionaryAnalyzer(), 'не', 22, 0),))]

In [16]:
pymorphy_results[2]

[Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,gent'), normal_form='научный', score=0.774193, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 21),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,loct'), normal_form='научный', score=0.209677, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 26),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual anim,plur,accs'), normal_form='научный', score=0.016129, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 23),))]

In [17]:
nlp = spacy.load("en_core_web_sm")
spacy_results = nlp(text_en)
print(' '.join([token.lemma_ for token in spacy_results]))


DENNIS : listen , strange woman lie in pond 
 distribute sword be no basis for a system of 
 government . Supreme executive power derive from a 
 mandate from the masse , not from some farcical aquatic ceremony . 



# Задачи

### Part-of-speech (определение части речи)

In [18]:
# 1.
[(res[0].normal_form, res[0].tag) for res in pymorphy_results[:9]]

[('не', OpencorporaTag('PRCL')),
 ('существовать', OpencorporaTag('VERB,impf,intr sing,3per,pres,indc')),
 ('научный', OpencorporaTag('ADJF,Qual plur,gent')),
 ('доказательство', OpencorporaTag('NOUN,inan,neut plur,gent')),
 ('в', OpencorporaTag('PREP')),
 ('пользу\nэффективность', OpencorporaTag('NOUN,inan,femn sing,gent')),
 ('нлп,', OpencorporaTag('UNKN')),
 ('оно', OpencorporaTag('NPRO,neut,3per,Anph sing,nomn')),
 ('призвать', OpencorporaTag('PRTS,perf,past,pssv neut,sing'))]

In [19]:
# 2.
[(token.lemma_, token.pos_) for token in spacy_results[:7]]

[('DENNIS', 'PROPN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

In [20]:
# 3.
predictor = RNNMorphPredictor(language="ru")
rnnmorph_result = predictor.predict(text_ru.split(' '))
[(token.normal_form, token.posm, token.tag) for token in rnnmorph_result[:7]]

TypeError: Could not locate class 'Functional'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'class_name': 'Functional', 'config': {'name': 'model_1', 'layers': [{'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, None, 56], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'grammemes'}, 'name': 'grammemes', 'inbound_nodes': []}, {'class_name': 'Dropout', 'config': {'name': 'dropout_1', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_1', 'inbound_nodes': [[['grammemes', 0, 0, {}]]]}, {'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, None, 32], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'chars'}, 'name': 'chars', 'inbound_nodes': []}, {'class_name': 'Dense', 'config': {'name': 'dense_1', 'trainable': True, 'dtype': 'float32', 'units': 30, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_1', 'inbound_nodes': [[['dropout_1', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_1', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Functional', 'config': {'name': 'model_1', 'layers': [{'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, 32], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'chars'}, 'name': 'chars', 'inbound_nodes': []}, {'class_name': 'Embedding', 'config': {'name': 'chars_embeddings', 'trainable': True, 'batch_input_shape': [None, None], 'dtype': 'float32', 'input_dim': 101, 'output_dim': 24, 'embeddings_initializer': {'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False, 'input_length': None}, 'name': 'chars_embeddings', 'inbound_nodes': [[['chars', 0, 0, {}]]]}, {'class_name': 'Reshape', 'config': {'name': 'reshape_1', 'trainable': True, 'dtype': 'float32', 'target_shape': [768]}, 'name': 'reshape_1', 'inbound_nodes': [[['chars_embeddings', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_2', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_2', 'inbound_nodes': [[['reshape_1', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_2', 'trainable': True, 'dtype': 'float32', 'units': 500, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_2', 'inbound_nodes': [[['dropout_2', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_3', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_3', 'inbound_nodes': [[['dense_2', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_3', 'trainable': True, 'dtype': 'float32', 'units': 200, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_3', 'inbound_nodes': [[['dropout_3', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_4', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_4', 'inbound_nodes': [[['dense_3', 0, 0, {}]]]}], 'input_layers': [['chars', 0, 0]], 'output_layers': [['dropout_4', 0, 0]]}}}, 'name': 'time_distributed_1', 'inbound_nodes': [[['chars', 0, 0, {}]]]}, {'class_name': 'Concatenate', 'config': {'name': 'LSTM_input', 'trainable': True, 'dtype': 'float32', 'axis': -1}, 'name': 'LSTM_input', 'inbound_nodes': [[['dense_1', 0, 0, {}], ['time_distributed_1', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_4', 'trainable': True, 'dtype': 'float32', 'units': 200, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_4', 'inbound_nodes': [[['LSTM_input', 0, 0, {}]]]}, {'class_name': 'LSTM', 'config': {'name': 'LSTM_1_forward', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 25}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 26}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 27}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}, 'name': 'LSTM_1_forward', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'class_name': 'ReversedLSTM', 'config': {'name': 'LSTM_1_backward', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': True, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 30}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 31}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 32}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}, 'name': 'LSTM_1_backward', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'class_name': 'Concatenate', 'config': {'name': 'BiLSTM_input', 'trainable': True, 'dtype': 'float32', 'axis': -1}, 'name': 'BiLSTM_input', 'inbound_nodes': [[['LSTM_1_forward', 0, 0, {}], ['LSTM_1_backward', 0, 0, {}]]]}, {'class_name': 'Bidirectional', 'config': {'name': 'bidirectional_1', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'LSTM', 'config': {'name': 'LSTM_0', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 36}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 37}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 38}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}}, 'merge_mode': 'concat'}, 'name': 'bidirectional_1', 'inbound_nodes': [[['BiLSTM_input', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_2', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Dense', 'config': {'name': 'dense_5', 'trainable': True, 'dtype': 'float32', 'units': 128, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}}}, 'name': 'time_distributed_2', 'inbound_nodes': [[['bidirectional_1', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_3', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Dropout', 'config': {'name': 'dropout_2', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}}}, 'name': 'time_distributed_3', 'inbound_nodes': [[['time_distributed_2', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_4', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'BatchNormalization', 'config': {'name': 'batch_normalization_1', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'class_name': 'Zeros', 'config': {}}, 'gamma_initializer': {'class_name': 'Ones', 'config': {}}, 'moving_mean_initializer': {'class_name': 'Zeros', 'config': {}}, 'moving_variance_initializer': {'class_name': 'Ones', 'config': {}}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}}}, 'name': 'time_distributed_4', 'inbound_nodes': [[['time_distributed_3', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_5', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Activation', 'config': {'name': 'activation_1', 'trainable': True, 'dtype': 'float32', 'activation': 'relu'}}}, 'name': 'time_distributed_5', 'inbound_nodes': [[['time_distributed_4', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'main_pred', 'trainable': True, 'dtype': 'float32', 'units': 253, 'activation': 'softmax', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'main_pred', 'inbound_nodes': [[['time_distributed_5', 0, 0, {}]]]}], 'input_layers': [['grammemes', 0, 0], ['chars', 0, 0]], 'output_layers': [['main_pred', 0, 0]]}, 'keras_version': '2.8.0', 'backend': 'tensorflow'}

### Named entities recognition (Поиск именованых сущностей)

In [21]:
doc = nlp("Apple is looking at buing U.K. startup for $1 bullion")

for ent in doc.ents:
    print(ent.text,
          ent.start_char,
          ent.end_char,
          ent.label_)

Apple 0 5 ORG
U.K. 26 30 GPE
$1 bullion 43 53 MONEY
