### Тема «POS-tagger и NER»

Задание 1. Написать теггер на данных с русским языком

- проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации
- написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов
- сравнить все реализованные методы, сделать выводы
 
Задание 2. Проверить, насколько хорошо работает NER

Данные брать из http://www.labinform.ru/pub/named_entities/
- проверить NER из nltk/spacy/deeppavlov.
- написать свой NER, попробовать разные подходы.
- передаём в сетку токен и его соседей.
- передаём в сетку только токен.
- свой вариант.
- сравнить свои реализованные подходы на качество — вывести precision/recall/f1_score.


!wget -O ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu
!wget -O ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

!pip install pyconll

In [4]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import pyconll
import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

import warnings
warnings.filterwarnings("ignore")

In [5]:
data_train = pyconll.load_from_file('ru_syntagrus-ud-train.conllu')
data_test = pyconll.load_from_file('ru_syntagrus-ud-dev.conllu')

In [6]:
fdata_train = []
for sent in data_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in data_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in data_test[:]:
    fdata_sent_test.append([token.form for token in sent])

In [7]:
len(fdata_train), len(fdata_test), len(fdata_sent_test)

(24516, 8906, 8906)

In [9]:
unigram_tagger = UnigramTagger(fdata_train)
unigram_acc = unigram_tagger.evaluate(fdata_test)

bigram_tagger = BigramTagger(fdata_train)
bigram_acc = bigram_tagger.evaluate(fdata_test)

trigram_tagger = TrigramTagger(fdata_train)
trigram_acc = trigram_tagger.evaluate(fdata_test)

bigram_tagger = BigramTagger(fdata_train, backoff=unigram_tagger)
bigram_unigram_acc = bigram_tagger.evaluate(fdata_test)

trigram_tagger = TrigramTagger(fdata_train, backoff=bigram_tagger)
trigram_bigram_unigram_acc = trigram_tagger.evaluate(fdata_test)

print(f'Accuracy:\nUnigram Tagger: {round(unigram_acc, 3)},\nBigram Tagger: {round(bigram_acc, 5)},\n'
      f'Trigram Tagger: {round(trigram_acc, 3)},\nBigram and Unigram Tagger: {round(bigram_unigram_acc, 5)},\n'
      f'Trigram, Bigram and Unigram Tagger: {round(trigram_bigram_unigram_acc, 5)},\n')

Accuracy:
Unigram Tagger: 0.824,
Bigram Tagger: 0.60939,
Trigram Tagger: 0.178,
Bigram and Unigram Tagger: 0.82928,
Trigram, Bigram and Unigram Tagger: 0.82914,



Наблюдаем лучший результат у комбинации "Bigram and Unigram Tagger"

Сделаем свой вариант тагера

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [11]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(' ' if tok[0] is None else tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [13]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)
test_enc_labels = le.transform(test_label)
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [14]:
%time

vectorizers = [CountVectorizer(ngram_range=(1, 3), analyzer='char'), 
               TfidfVectorizer(ngram_range=(1, 3), analyzer='char'), 
               HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=1000)] 
vectorizers_word = [CountVectorizer(ngram_range=(1, 3), analyzer='word'), 
               TfidfVectorizer(ngram_range=(1, 3), analyzer='word'), 
               HashingVectorizer(ngram_range=(1, 3), analyzer='word', n_features=1000)] 
n_features = [2000, 3000, 5000]
vectorizers_hash = [HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=feat) for feat in n_features]
vectorizers_hash_word = [HashingVectorizer(ngram_range=(1, 3), analyzer='word', n_features=feat) for feat in n_features]
f1_scores = []
accuracy_scores = []

for vectorizer in vectorizers + vectorizers_word + vectorizers_hash + vectorizers_hash_word:
    X_train = vectorizer.fit_transform(train_tok)
    X_test = vectorizer.transform(test_tok)
    
    lr = LogisticRegression(random_state=0, max_iter=100)
    lr.fit(X_train, train_enc_labels)
    pred = lr.predict(X_test)
    f1 = f1_score(test_enc_labels, pred, average='weighted')
    f1_scores.append(f1)
    acc = accuracy_score(test_enc_labels, pred)
    accuracy_scores.append(acc)
    
    print(vectorizer)
    print(classification_report(test_enc_labels, pred, target_names=le.classes_))

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.72 µs
CountVectorizer(analyzer='char', ngram_range=(1, 3))
              precision    recall  f1-score   support

         ADJ       0.91      0.91      0.91     15103
         ADP       0.98      1.00      0.99     13717
         ADV       0.90      0.90      0.90      7783
         AUX       0.81      0.97      0.88      1390
       CCONJ       0.88      0.98      0.93      5672
         DET       0.89      0.73      0.80      4265
        INTJ       0.35      0.29      0.32        24
        NOUN       0.92      0.95      0.93     36238
      NO_TAG       1.00      0.77      0.87       265
         NUM       0.84      0.90      0.87      1734
        PART       0.95      0.76      0.85      5125
        PRON       0.83      0.90      0.86      7444
       PROPN       0.75      0.58      0.66      5473
       PUNCT       1.00      1.00      1.00     29186
       SCONJ       0.80      0.91      0.85      2865
         SYM  

In [15]:
result_model = pd.DataFrame({'Vectorizer': vectorizers + vectorizers_word + vectorizers_hash + vectorizers_hash_word,
                            'f1_score': f1_scores})
result_model.sort_values('f1_score', ascending=False)

Unnamed: 0,Vectorizer,f1_score
0,"CountVectorizer(analyzer='char', ngram_range=(...",0.924069
1,"TfidfVectorizer(analyzer='char', ngram_range=(...",0.9175
8,"HashingVectorizer(analyzer='char', n_features=...",0.897959
7,"HashingVectorizer(analyzer='char', n_features=...",0.892972
6,"HashingVectorizer(analyzer='char', n_features=...",0.89091
2,"HashingVectorizer(analyzer='char', n_features=...",0.876592
3,"CountVectorizer(ngram_range=(1, 3))",0.662481
4,"TfidfVectorizer(ngram_range=(1, 3))",0.650386
11,"HashingVectorizer(n_features=5000, ngram_range...",0.585894
10,"HashingVectorizer(n_features=3000, ngram_range...",0.568223


In [16]:
result_model_acc = pd.DataFrame({'Vectorizer': vectorizers + vectorizers_word + vectorizers_hash + vectorizers_hash_word,
                            'Accuracy': accuracy_scores})
result_model_acc.sort_values('Accuracy', ascending=False)

Unnamed: 0,Vectorizer,Accuracy
0,"CountVectorizer(analyzer='char', ngram_range=(...",0.925822
1,"TfidfVectorizer(analyzer='char', ngram_range=(...",0.919917
8,"HashingVectorizer(analyzer='char', n_features=...",0.901524
7,"HashingVectorizer(analyzer='char', n_features=...",0.897057
6,"HashingVectorizer(analyzer='char', n_features=...",0.894511
2,"HashingVectorizer(analyzer='char', n_features=...",0.87951
3,"CountVectorizer(ngram_range=(1, 3))",0.653936
4,"TfidfVectorizer(ngram_range=(1, 3))",0.640152
11,"HashingVectorizer(n_features=5000, ngram_range...",0.604213
10,"HashingVectorizer(n_features=3000, ngram_range...",0.592747


In [40]:
!python3 -m venv .env
!source .env/bin/activate
!pip3 install -U pip setuptools wheel
!pip3 install -U spacy

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbr

In [42]:

!/usr/local/bin/spacy download ru_core_news_sm

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting ru-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.4.0/ru_core_news_sm-3.4.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: ru-core-news-sm
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files

!wget http://www.labinform.ru/pub/named_entities/collection5.zip

!unzip collection5.zip

In [17]:
import corus
from corus import load_ne5

In [18]:
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/dv/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /Users/dv/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /Users/dv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dv/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [21]:
records = load_ne5('Collection5/')
document = next(records).text
document

'Жириновский предлагает обменять с США Сноудена на Бута\r\n\r\nЛидер ЛДПР Владимир Жириновский предложил обменять бывшего сотрудника ЦРУ США Эдварда Сноудена, который прибыл в Москву, на осужденного в Америке бизнесмена Виктора Бута.\r\n\r\n"Сноудена ни в коем случае не высылать в США, а обменять на Виктора Бута и Константина Ярошенко. В идеале — добавить генерала Олега Калугина", — написал он в своем микроблоге в Twitter.\r\n\r\nСноуден, работавший на компанию Booz Allen Hamilton — подрядчика Центрального разведывательного управления США, в начале июня распространил секретный ордер суда, по которому спецслужбы получили доступ ко всем звонкам крупнейшего сотового оператора Verizon, а также данные о сверхсекретной программе агентства национальной безопасности PRISM, позволяющей отслеживать электронные коммуникации на крупнейших сайтах. В воскресенье стало известно, что Сноуден прибыл из Гонконга в Москву и запросил убежища в Эквадоре.\r\n\r\nЧто ждет Эдварда Сноудена\r\n\r\nЭдвард Сноуд

In [22]:
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

{('Hamilton', 'PERSON'),
 ('Америке', 'PERSON'),
 ('Виктора Бута', 'PERSON'),
 ('Москву', 'PERSON'),
 ('Сноуден', 'PERSON'),
 ('Эдварда Сноудена', 'PERSON'),
 ('Эдварда Сноудена Эдвард Сноуден', 'PERSON')}

In [43]:
import spacy
from spacy import displacy

In [45]:
nlp = spacy.load('ru_core_news_sm')

In [46]:
ny_bb = document
article = nlp(ny_bb)
displacy.render(article, jupyter=True, style='ent')

In [47]:
for token in article:
    print(token.text, token.pos_, token.dep_)

Жириновский PROPN nsubj
предлагает VERB ROOT
обменять VERB xcomp
с ADP case
США PROPN obl
Сноудена PROPN obj
на ADP case
Бута PROPN obl


 SPACE dep
Лидер NOUN nsubj
ЛДПР PROPN nmod
Владимир PROPN appos
Жириновский PROPN flat:name
предложил VERB conj
обменять VERB xcomp
бывшего ADJ amod
сотрудника NOUN obj
ЦРУ PROPN nmod
США PROPN nmod
Эдварда PROPN appos
Сноудена PROPN flat:name
, PUNCT punct
который PRON nsubj
прибыл VERB acl:relcl
в ADP case
Москву PROPN obl
, PUNCT punct
на ADP case
осужденного NOUN acl
в ADP case
Америке PROPN obl
бизнесмена NOUN appos
Виктора PROPN appos
Бута PROPN flat:name
. PUNCT punct


 SPACE dep
" PUNCT punct
Сноудена PROPN obj
ни PART advmod
в ADP fixed
коем DET fixed
случае NOUN fixed
не PART advmod
высылать VERB ROOT
в ADP case
США PROPN obl
, PUNCT punct
а CCONJ cc
обменять VERB conj
на ADP case
Виктора PROPN obl
Бута PROPN flat:name
и CCONJ cc
Константина PROPN conj
Ярошенко PROPN flat:name
. PUNCT punct
В ADP case
идеале NOUN ROOT
— PUNCT punct
добави

!pip install razdel

In [49]:
from razdel import tokenize

In [50]:
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
        
        type_ent = 'OUT'
        for ent in rec.spans:
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ent.type
                break
        words.append([token.text, type_ent])
    words_docs.extend(words)

In [51]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

In [52]:
df_words['tag'].value_counts()

OUT         219014
PER          21178
ORG          13641
LOC           4564
GEOPOLIT      4349
MEDIA         2481
Name: tag, dtype: int64

In [53]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, GRU, LSTM, Dropout, Input, Bidirectional,Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn import model_selection, preprocessing, linear_model

In [54]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_words['word'], df_words['tag'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [55]:
train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
valid_data = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))

train_data = train_data.batch(2048)
valid_data = valid_data.batch(2048)

2022-10-22 13:56:33.426834: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [56]:
AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [57]:
def custom_standardization(input_data):
    return input_data

vocab_size = 30000
seq_len = 10


vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_len
    )

text_data = train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [58]:
t=np.unique(encoder.inverse_transform(valid_y),return_counts=True)[1]
t=t/t.sum()

In [59]:
model = tf.keras.Sequential([
  vectorize_layer,
  tf.keras.layers.Embedding(len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(300, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dense(6, activation='softmax')
  ])

In [60]:
model.compile(optimizer='adam',         
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
model.fit(train_data, validation_data=valid_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x19ecc5400>

In [61]:
pred=model.predict(valid_data)



In [62]:
list(zip(encoder.classes_,pred.mean(axis=0)/t))

[('GEOPOLIT', 0.9684594529900095),
 ('LOC', 0.9577035941135312),
 ('MEDIA', 0.9768202204155387),
 ('ORG', 1.0044458789699269),
 ('OUT', 1.0168183409614737),
 ('PER', 0.8422694103030541)]

In [63]:
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(len(vectorize_layer.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6,activation='softmax')
])

In [64]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [65]:
model.fit(train_data, validation_data=valid_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a33047f0>