# Загрузка библиотек

In [41]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

import numpy as np
from numpy.linalg import norm

import gensim.downloader as api
from gensim.models import Word2Vec

from razdel import tokenize, sentenize
import re
import string
from rnnmorph.predictor import RNNMorphPredictor
import pymorphy2
import spacy
import pymystem3
"""
pyemestem3 работает быстрее при обработке
большого текста в моменте, по сравнению с
pymorphy2. Но если запускать pymystem3
много раз на кусках текста, то работа
будет медленной
"""

nltk.download('punkt')
nltk.download('stopwords')

# in terminal:
# python -m spacy download en_core_web_sm
# pip install scipy==1.10.1

[nltk_data] Downloading package punkt to /home/vitalii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vitalii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Примеры

In [2]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens, type(tokens))
print(sent_tokenize("I was going home when whe runs. It was a surprise."))

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play'] <class 'list'>
['I was going home when whe runs.', 'It was a surprise.']


In [3]:
# razdel
text = "Кружка-термос на 0.5л (50/64 см², 516;...)"
list(tokenize(text))

[Substring(0, 13, 'Кружка-термос'),
 Substring(14, 16, 'на'),
 Substring(17, 20, '0.5'),
 Substring(20, 21, 'л'),
 Substring(22, 23, '('),
 Substring(23, 28, '50/64'),
 Substring(29, 32, 'см²'),
 Substring(32, 33, ','),
 Substring(34, 37, '516'),
 Substring(37, 38, ';'),
 Substring(38, 41, '...'),
 Substring(41, 42, ')')]

In [4]:
# nltk
tokens = word_tokenize(text.lower())
print(tokens)

['кружка-термос', 'на', '0.5л', '(', '50/64', 'см²', ',', '516', ';', '...', ')']


### N-gramm

In [5]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
unigram = list(nltk.ngrams(tokens, 1))
bigram = list(nltk.ngrams(tokens, 2))
trigram = list(nltk.ngrams(tokens, 3))
print(unigram[:5])
print(bigram[:5])
print(trigram[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]
[('all', 'work', 'and'), ('work', 'and', 'no'), ('and', 'no', 'play'), ('no', 'play', 'makes'), ('play', 'makes', 'jack')]


### Стоп-слова

In [6]:
stop_words = set(stopwords.words('russian'))
print(stop_words)
print(len(stop_words))

{'то', 'чего', 'иногда', 'где', 'совсем', 'хорошо', 'как', 'чем', 'под', 'три', 'все', 'в', 'если', 'при', 'за', 'себя', 'ей', 'такой', 'на', 'всегда', 'тут', 'ж', 'мой', 'что', 'так', 'этот', 'эту', 'со', 'куда', 'тебя', 'можно', 'его', 'был', 'ну', 'всю', 'после', 'кто', 'им', 'они', 'моя', 'ее', 'есть', 'их', 'бы', 'эти', 'этом', 'даже', 'теперь', 'об', 'но', 'ни', 'нибудь', 'не', 'да', 'было', 'себе', 'ней', 'тем', 'чтобы', 'же', 'до', 'впрочем', 'у', 'быть', 'были', 'более', 'ты', 'свою', 'много', 'этой', 'никогда', 'чуть', 'уже', 'будто', 'тоже', 'всех', 'там', 'раз', 'потом', 'надо', 'хоть', 'сам', 'том', 'здесь', 'вы', 'только', 'уж', 'нет', 'он', 'всего', 'вам', 'без', 'между', 'к', 'разве', 'нельзя', 'по', 'меня', 'для', 'ему', 'нее', 'из', 'конечно', 'потому', 'опять', 'сейчас', 'через', 'во', 'я', 'вот', 'мы', 'чтоб', 'ним', 'больше', 'лучше', 'еще', 'была', 'этого', 'один', 'или', 'другой', 'над', 'может', 'тот', 'ничего', 'какой', 'перед', 'вдруг', 'зачем', 'ли', 'а', 'и'

In [7]:
stop_words = set(stopwords.words('english'))
print(stop_words)
print(len(stop_words))

{'who', 'do', 'her', 'own', 'are', 'down', 't', "won't", 'some', 'was', 'all', 'can', 'does', 'out', 'under', 'ma', 'over', 'through', 'the', 'don', 'too', 'that', 'doing', 'whom', 'll', 'to', 'those', 'i', 'no', 'above', "weren't", 'isn', 'themselves', "you've", "wasn't", 'won', 've', 'him', 'its', 'again', 'in', "she's", 'both', 'his', 'but', 'ain', 'himself', 'hadn', 'needn', "it's", 'aren', "shan't", 'couldn', 'than', 'd', 'where', 'during', 'have', "should've", 'until', 'an', 're', 'now', 'it', 'they', 'only', 'shan', 'about', "you'd", 'should', 'been', 'when', 'these', 'yourself', 'didn', 'had', 'and', 'is', 'which', 'did', 'yourselves', 'further', 'below', "wouldn't", 'or', 'from', 'why', "you'll", "doesn't", 'myself', 'once', 'weren', 'with', "needn't", 'same', "mustn't", 'as', 'ourselves', 'itself', 'on', 'your', "aren't", 'mightn', 'at', 'their', 'hasn', 'if', 'because', 'doesn', 'we', 'hers', 'not', 'for', 'haven', 'each', 'he', 'you', 'there', 'between', 'ours', 'theirs', "

In [8]:
print([word for word in tokens if word not in stop_words])

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


In [9]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### Стемминг и лемматизация

In [10]:
words_en = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ["корова", "мальчики", "мужчины", "столом", "убежала"]

In [11]:
ps = PorterStemmer()
print(list(map(ps.stem, words_en)))
print(list(map(ps.stem, words_ru)))  # Старый метод не работает с русским языком

['game', 'game', 'game', 'game', 'compact']
['корова', 'мальчики', 'мужчины', 'столом', 'убежала']


In [12]:
ss = SnowballStemmer(language="english")
print(list(map(ss.stem, words_en)))
ss = SnowballStemmer(language="russian")
print(list(map(ss.stem, words_ru)))

['game', 'game', 'game', 'game', 'compact']
['коров', 'мальчик', 'мужчин', 'стол', 'убежа']


In [13]:
text_en = """DENNIS: Listen, strange women lying in ponds 
distributing swords is no basis for a system of
government. Supreme executive power derives from a
mandate from the masses, not from some farcical aquatic ceremony.
"""

text_ru = """Не существует научных доказательств в пользу
эффективности НЛП, оно призвано псевдонаукой. Систематические
обзоры указывают, что НЛП основано на устаревших
представлениях об устройстве мозга, несовместимо с современной
неврологией и содержит ряд фактических ошибок.
"""

In [14]:
morph = pymorphy2.MorphAnalyzer()
pymorphy_results = list(map(lambda x: morph.parse(x), text_ru.split(' ')))
print(' '.join([res[0].normal_form for res in pymorphy_results]))

не существовать научный доказательство в пользу
эффективность нлп, оно призвать псевдонаукой. систематические
обзор указывают, что нлп основать на устаревших
представление о устройство мозга, несовместимый с современной
неврология и содержать ряд фактический ошибок.



##### Разборы слов

In [15]:
pymorphy_results[0]

[Parse(word='не', tag=OpencorporaTag('PRCL'), normal_form='не', score=1.0, methods_stack=((DictionaryAnalyzer(), 'не', 22, 0),))]

In [16]:
pymorphy_results[2]

[Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,gent'), normal_form='научный', score=0.774193, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 21),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,loct'), normal_form='научный', score=0.209677, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 26),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual anim,plur,accs'), normal_form='научный', score=0.016129, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 23),))]

In [17]:
nlp = spacy.load("en_core_web_sm")
spacy_results = nlp(text_en)
print(' '.join([token.lemma_ for token in spacy_results]))


DENNIS : listen , strange woman lie in pond 
 distribute sword be no basis for a system of 
 government . Supreme executive power derive from a 
 mandate from the masse , not from some farcical aquatic ceremony . 



# Задачи

### Part-of-speech (определение части речи)

In [18]:
# 1.
[(res[0].normal_form, res[0].tag) for res in pymorphy_results[:9]]

[('не', OpencorporaTag('PRCL')),
 ('существовать', OpencorporaTag('VERB,impf,intr sing,3per,pres,indc')),
 ('научный', OpencorporaTag('ADJF,Qual plur,gent')),
 ('доказательство', OpencorporaTag('NOUN,inan,neut plur,gent')),
 ('в', OpencorporaTag('PREP')),
 ('пользу\nэффективность', OpencorporaTag('NOUN,inan,femn sing,gent')),
 ('нлп,', OpencorporaTag('UNKN')),
 ('оно', OpencorporaTag('NPRO,neut,3per,Anph sing,nomn')),
 ('призвать', OpencorporaTag('PRTS,perf,past,pssv neut,sing'))]

In [19]:
# 2.
[(token.lemma_, token.pos_) for token in spacy_results[:7]]

[('DENNIS', 'PROPN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

In [20]:
# 3.
predictor = RNNMorphPredictor(language="ru")
rnnmorph_result = predictor.predict(text_ru.split(' '))
[(token.normal_form, token.posm, token.tag) for token in rnnmorph_result[:7]]

TypeError: Could not locate class 'Functional'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'class_name': 'Functional', 'config': {'name': 'model_1', 'layers': [{'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, None, 56], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'grammemes'}, 'name': 'grammemes', 'inbound_nodes': []}, {'class_name': 'Dropout', 'config': {'name': 'dropout_1', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_1', 'inbound_nodes': [[['grammemes', 0, 0, {}]]]}, {'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, None, 32], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'chars'}, 'name': 'chars', 'inbound_nodes': []}, {'class_name': 'Dense', 'config': {'name': 'dense_1', 'trainable': True, 'dtype': 'float32', 'units': 30, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_1', 'inbound_nodes': [[['dropout_1', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_1', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Functional', 'config': {'name': 'model_1', 'layers': [{'class_name': 'InputLayer', 'config': {'batch_input_shape': [None, 32], 'dtype': 'float32', 'sparse': False, 'ragged': False, 'name': 'chars'}, 'name': 'chars', 'inbound_nodes': []}, {'class_name': 'Embedding', 'config': {'name': 'chars_embeddings', 'trainable': True, 'batch_input_shape': [None, None], 'dtype': 'float32', 'input_dim': 101, 'output_dim': 24, 'embeddings_initializer': {'class_name': 'RandomUniform', 'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}}, 'embeddings_regularizer': None, 'activity_regularizer': None, 'embeddings_constraint': None, 'mask_zero': False, 'input_length': None}, 'name': 'chars_embeddings', 'inbound_nodes': [[['chars', 0, 0, {}]]]}, {'class_name': 'Reshape', 'config': {'name': 'reshape_1', 'trainable': True, 'dtype': 'float32', 'target_shape': [768]}, 'name': 'reshape_1', 'inbound_nodes': [[['chars_embeddings', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_2', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_2', 'inbound_nodes': [[['reshape_1', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_2', 'trainable': True, 'dtype': 'float32', 'units': 500, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_2', 'inbound_nodes': [[['dropout_2', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_3', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_3', 'inbound_nodes': [[['dense_2', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_3', 'trainable': True, 'dtype': 'float32', 'units': 200, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_3', 'inbound_nodes': [[['dropout_3', 0, 0, {}]]]}, {'class_name': 'Dropout', 'config': {'name': 'dropout_4', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}, 'name': 'dropout_4', 'inbound_nodes': [[['dense_3', 0, 0, {}]]]}], 'input_layers': [['chars', 0, 0]], 'output_layers': [['dropout_4', 0, 0]]}}}, 'name': 'time_distributed_1', 'inbound_nodes': [[['chars', 0, 0, {}]]]}, {'class_name': 'Concatenate', 'config': {'name': 'LSTM_input', 'trainable': True, 'dtype': 'float32', 'axis': -1}, 'name': 'LSTM_input', 'inbound_nodes': [[['dense_1', 0, 0, {}], ['time_distributed_1', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'dense_4', 'trainable': True, 'dtype': 'float32', 'units': 200, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'dense_4', 'inbound_nodes': [[['LSTM_input', 0, 0, {}]]]}, {'class_name': 'LSTM', 'config': {'name': 'LSTM_1_forward', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 25}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 26}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 27}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}, 'name': 'LSTM_1_forward', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'class_name': 'ReversedLSTM', 'config': {'name': 'LSTM_1_backward', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': True, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 30}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 31}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 32}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}, 'name': 'LSTM_1_backward', 'inbound_nodes': [[['dense_4', 0, 0, {}]]]}, {'class_name': 'Concatenate', 'config': {'name': 'BiLSTM_input', 'trainable': True, 'dtype': 'float32', 'axis': -1}, 'name': 'BiLSTM_input', 'inbound_nodes': [[['LSTM_1_forward', 0, 0, {}], ['LSTM_1_backward', 0, 0, {}]]]}, {'class_name': 'Bidirectional', 'config': {'name': 'bidirectional_1', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'LSTM', 'config': {'name': 'LSTM_0', 'trainable': True, 'dtype': 'float32', 'return_sequences': True, 'return_state': False, 'go_backwards': False, 'stateful': False, 'unroll': False, 'time_major': False, 'units': 128, 'activation': 'tanh', 'recurrent_activation': 'hard_sigmoid', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}, 'shared_object_id': 36}, 'recurrent_initializer': {'class_name': 'Orthogonal', 'config': {'gain': 1.0, 'seed': None}, 'shared_object_id': 37}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}, 'shared_object_id': 38}, 'unit_forget_bias': True, 'kernel_regularizer': None, 'recurrent_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'recurrent_constraint': None, 'bias_constraint': None, 'dropout': 0.3, 'recurrent_dropout': 0.3, 'implementation': 1}}, 'merge_mode': 'concat'}, 'name': 'bidirectional_1', 'inbound_nodes': [[['BiLSTM_input', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_2', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Dense', 'config': {'name': 'dense_5', 'trainable': True, 'dtype': 'float32', 'units': 128, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}}}, 'name': 'time_distributed_2', 'inbound_nodes': [[['bidirectional_1', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_3', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Dropout', 'config': {'name': 'dropout_2', 'trainable': True, 'dtype': 'float32', 'rate': 0.2, 'noise_shape': None, 'seed': None}}}, 'name': 'time_distributed_3', 'inbound_nodes': [[['time_distributed_2', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_4', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'BatchNormalization', 'config': {'name': 'batch_normalization_1', 'trainable': True, 'dtype': 'float32', 'axis': [1], 'momentum': 0.99, 'epsilon': 0.001, 'center': True, 'scale': True, 'beta_initializer': {'class_name': 'Zeros', 'config': {}}, 'gamma_initializer': {'class_name': 'Ones', 'config': {}}, 'moving_mean_initializer': {'class_name': 'Zeros', 'config': {}}, 'moving_variance_initializer': {'class_name': 'Ones', 'config': {}}, 'beta_regularizer': None, 'gamma_regularizer': None, 'beta_constraint': None, 'gamma_constraint': None}}}, 'name': 'time_distributed_4', 'inbound_nodes': [[['time_distributed_3', 0, 0, {}]]]}, {'class_name': 'TimeDistributed', 'config': {'name': 'time_distributed_5', 'trainable': True, 'dtype': 'float32', 'layer': {'class_name': 'Activation', 'config': {'name': 'activation_1', 'trainable': True, 'dtype': 'float32', 'activation': 'relu'}}}, 'name': 'time_distributed_5', 'inbound_nodes': [[['time_distributed_4', 0, 0, {}]]]}, {'class_name': 'Dense', 'config': {'name': 'main_pred', 'trainable': True, 'dtype': 'float32', 'units': 253, 'activation': 'softmax', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}, 'name': 'main_pred', 'inbound_nodes': [[['time_distributed_5', 0, 0, {}]]]}], 'input_layers': [['grammemes', 0, 0], ['chars', 0, 0]], 'output_layers': [['main_pred', 0, 0]]}, 'keras_version': '2.8.0', 'backend': 'tensorflow'}

### Named entities recognition (Поиск именованых сущностей)

In [21]:
doc = nlp("Apple is looking at buing U.K. startup for $1 bullion")

for ent in doc.ents:
    print(ent.text,
          ent.start_char,
          ent.end_char,
          ent.label_)

Apple 0 5 ORG
U.K. 26 30 GPE
$1 bullion 43 53 MONEY


# Численное представление текста

### Смотрим набор данных fetch_20newsgroups в задаче классификации темы по новости

In [2]:
newsgroups_train = fetch_20newsgroups(subset="train")
print(type(newsgroups_train))

<class 'sklearn.utils._bunch.Bunch'>


In [3]:
# Количество новостей
newsgroups_train.filenames.shape

(11314,)

In [4]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

##### Рассмотрим подвыборку

In [5]:
categories = ["alt.atheism", "talk.religion.misc",
              "comp.graphics", "sci.space"]
newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
newsgroups_train.filenames.shape

(2034,)

In [6]:
print(type(newsgroups_train.data), newsgroups_train.data[0])

<class 'list'> From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



##### Первые 10 меток

In [7]:
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

##### Векторизация текстов с помощью TF-IDF

Некоторые параметры:  
- input: str ('filename', 'file', 'content')
- lowercase: bool, default True
- preprocessor: callable, default None
- stop_words: str ('english'), list, default None
- ngram_range: tuple (min_n, max_n)
- max_df: float in range [0., 1.] or int, default 1.
- min_df: floaf in range [0., 1.] or int, default 1
- max_features: int, default None

In [8]:
vectorizer = TfidfVectorizer(lowercase=False)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 42307)

In [9]:
# lowercase
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

##### Посмотрим некоторые слова, встречающиеся в тексте

In [10]:
vectorizer.get_feature_names_out()[:10]

array(['00', '000', '0000', '00000', '000000', '000005102000', '000021',
       '000062david42', '0000vec', '0001'], dtype=object)

##### Посмотрим на слова, которые встречаются в 80% случаях

In [11]:
vectorizer = TfidfVectorizer(min_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 9)

##### Посмотрим на самые часто встречающиеся слова
Как мы видим, у нас есть много слов, типа from, lines, the, to, которые не несут в себе смысловой информации. Мы хотим от них избавиться

In [12]:
vectorizer.get_feature_names_out()

array(['and', 'from', 'in', 'lines', 'of', 'organization', 'subject',
       'the', 'to'], dtype=object)

##### Посмотрим на то, сколько у нас будет слов, если мы будем работать со словами, встречающиеся в диапазоне от 1% до 80%

In [13]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 2391)

##### Рассмотрим ngram-range для отлавливания взаимодействия между токенами, а не токены по отдельности

In [14]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=0.03, max_df=0.9)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 1236)

##### Используем Стоп-слова

In [15]:
# Стоп-слова и preprocessor
stop_words = set(stopwords.words("english"))
nltk.download("wordnet")
wnl = nltk.WordNetLemmatizer()

def preproc_nltk(text):
    # text = re.sub(f"[{string.punctuation}]", ' ', text)
    return ' '.join([wnl.lemmatize(word) for word in word_tokenize(text.lower()) if word not in stop_words])

st = "Oh, I think ve landed where there are miracles at work, For the thirst and for the hunger Come the conference of birds"
preproc_nltk(st)

[nltk_data] Downloading package wordnet to /home/vitalii/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'oh , think landed miracle work , thirst hunger come conference bird'

In [16]:
%%time
vectorizer = TfidfVectorizer(preprocessor=preproc_nltk)
vectors = vectorizer.fit_transform(newsgroups_train.data)

CPU times: user 7.02 s, sys: 0 ns, total: 7.02 s
Wall time: 7.03 s


In [17]:
# preprocessor_spacy
nlp = spacy.load("en_core_web_sm")
texts = newsgroups_train.data.copy()

def preproc_spacy(text):
    spacy_results = nlp(text)
    return ' '.join([token.lemma_ for token in spacy_results if token.lemma_ not in stop_words])

preproc_spacy(st)

'oh , I think land miracle work , thirst hunger come conference bird'

In [18]:
%%time
new_texts = []
for doc in nlp.pipe(texts, batch_size=32, n_process=3, disable=["parser", "ner"]):
    new_texts.append(' '.join([tok.lemma_ for tok in doc if tok.lemma not in stop_words]))

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(new_texts)

CPU times: user 8.23 s, sys: 385 ms, total: 8.62 s
Wall time: 29.6 s


In [19]:
print(newsgroups_train.data[0])

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



In [20]:
print(new_texts[0])

from : rych@festival.ed.ac.uk ( R Hawkes ) 
 subject : 3ds : where do all the texture rule go ? 
 line : 21 

 Hi , 

 I have notice that if you only save a model ( with all your mapping plane 
 position carefully ) to a .3ds file that when you reload it after restart 
 3ds , they be give a default position and orientation .   but if you save 
 to a .prj file their position / orientation be preserve .   do anyone 
 know why this information be not store in the .3ds file ?   nothing be 
 explicitly say in the manual about save texture rule in the .prj file . 
 I would like to be able to read the texture rule information , do anyone have 
 the format for the .PRJ file ? 

 be the .cel file format available from somewhere ? 

 rych 

 = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
 Rycharde Hawkes 				 email : rych@festival.ed.ac.uk 
 Virtual Environment Laboratory 
 Dept . of psychology 			 Tel 

##### Итоговая модель

In [21]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, max_features=1000)
vectors = vectorizer.fit_transform(new_texts)
vectorizer.get_feature_names_out()[::100]

array(['000', 'au', 'christ', 'engineering', 'human', 'look', 'of this',
       'report', 'technology', 'understand'], dtype=object)

In [22]:
vectors.shape

(2034, 1000)

In [23]:
vectorizer.get_feature_names_out()[::-100]

array(['zoo toronto edu', 'under', 'technical', 'reply to', 'of these',
       'long', 'however', 'end', 'choose', 'attempt'], dtype=object)

In [24]:
vectorizer.get_feature_names_out()[::100]

array(['000', 'au', 'christ', 'engineering', 'human', 'look', 'of this',
       'report', 'technology', 'understand'], dtype=object)

##### Посмотрим на косинусную меру между словами
- Косинусное расстояние - это 1 - косинусная схожесть (чем меньше, тем лучше)
- Косинусная мера близость (схожесть) (чем больше, тем лучше)

In [25]:
type(vectors)

scipy.sparse._csr.csr_matrix

In [26]:
vector = vectors.todense()[0]
(vector != 0).sum()

52

In [27]:
dense_vectors = vectors.todense()
dense_vectors.shape

(2034, 1000)

In [28]:
def cosine_sim(v1, v2):
    # v1, v2 (1 x dim)
    return np.array(v1 @ v2.T / norm(v1) / norm(v2))[0][0]

In [29]:
cosine_sim(dense_vectors[0], dense_vectors[0])

1.0000000000000002

In [30]:
cosines = []
for i in range(10):
    cosines.append(cosine_sim(dense_vectors[0], dense_vectors[i]))

Схожесть между 0 и первыми 10-ю новостями

In [31]:
# Классы для первых 10 новостей: [1, 3, 2, 0, 2, 0, 2, 1, 2, 1]
cosines

[1.0000000000000002,
 0.04191279776414236,
 0.005868383611019931,
 0.09771238093526102,
 0.07060916453270281,
 0.06745764842966309,
 0.0267141823627476,
 0.22853760897260958,
 0.031636420124663965,
 0.06928662593161493]

##### Обучим любую известную модель на полученных признаках

In [32]:
X_train, X_test, y_train, y_test = train_test_split(dense_vectors,
                                                    newsgroups_train.target,
                                                    test_size=0.2,
                                                    random_state=0)

y_train.shape, y_test.shape

((1627,), (407,))

In [33]:
%%time
svc = svm.SVC()
X_train = np.asarray(X_train)
svc.fit(X_train, y_train)

CPU times: user 1.08 s, sys: 437 µs, total: 1.08 s
Wall time: 1.07 s


In [34]:
X_test = np.asarray(X_test)
accuracy_score(y_test, svc.predict(X_test))

0.9238329238329238

In [35]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
accuracy_score(y_test, sgd.predict(X_test))

0.9164619164619164

##### Embeddings
Качество для эмбеддингов хуже, чем для TF-IDF обработки. Дело в том, что мы использовали малый размер для эмбеддингов (25, 32 по сравнению с 1000)  
Достаточный размер эмбеддинга должен быть около 200-300

In [36]:
embeddings_pretrained = api.load("glove-twitter-25")

In [47]:
embeddings_pretrained.index_to_key[:10]

['<user>',
 '.',
 ':',
 'rt',
 ',',
 '<repeat>',
 '<hashtag>',
 '<number>',
 '<url>',
 '!']

In [49]:
print(len(embeddings_pretrained.index_to_key))

1193514


In [37]:
proc_words = [preproc_nltk(text).split() for text in newsgroups_train.data]
embeddings_trained = Word2Vec(proc_words,  # data for model to train on
                              vector_size=32,  # embedding vector size
                              min_count=3,  # consider words that occured at least 5 times
                              window=3).wv

In [38]:
def vectorize_sum(comment, embeddings):
    """
    Implement a function that converts preprocessed comment to a sum of token vectors
    """
    embedding_dim = embeddings.vectors.shape[1]
    features = np.zeros([embedding_dim], dtype="float32")

    for word in preproc_nltk(comment).split():
        if word in embeddings:
            features += embeddings[f'{word}']

    return features

In [39]:
X_wv = np.stack([vectorize_sum(text, embeddings_pretrained) for text in newsgroups_train.data])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, newsgroups_train.target, test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((1627, 25), (407, 25))

In [42]:
clf = LogisticRegression(max_iter=5000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7027027027027027

In [43]:
X_wv = np.stack([vectorize_sum(text, embeddings_trained) for text in newsgroups_train.data])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, newsgroups_train.target, test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((1627, 32), (407, 32))

In [50]:
len(embeddings_trained.index_to_key)

13566

##### Качество для обученных эмбеддингов на своём тексте выше, чем для предобученных

In [44]:
clf = LogisticRegression(max_iter=5000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7321867321867321

##### Используем более длинные эмбеддинги

In [51]:
proc_words = [preproc_nltk(text).split() for text in newsgroups_train.data]
embeddings_trained = Word2Vec(proc_words,  # data for model to train on
                              vector_size=200,  # embedding vector size
                              min_count=3,  # consider words that occured at least 5 times
                              window=3).wv

X_wv = np.stack([vectorize_sum(text, embeddings_trained) for text in newsgroups_train.data])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, newsgroups_train.target, test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

clf = LogisticRegression(max_iter=5000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8476658476658476

##### Увеличим количество итераций для логистической регрессии

In [52]:
clf = LogisticRegression(max_iter=10000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8427518427518428