# Тема: Part-of-Speech разметка, NER, извлечение отношений

**Задание 1.** Написать теггер на данных с русским языком

In [1]:
import pandas as pd
import re
import string

import pyconll

import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the treebank with pyconll. The "train" file is used for fitting; the
# "dev" file is what the rest of the notebook calls the test set.
full_train = pyconll.load_from_file('data/ru_syntagrus-ud-train.conllu')
full_test = pyconll.load_from_file('data/ru_syntagrus-ud-dev.conllu')
# Results table: later cells add one row per method via stata.loc[name, 'accurasy'].
# NOTE(review): the column key is spelled 'accurasy'; every cell that writes to
# the table relies on this exact spelling, so it must be renamed everywhere at once.
stata = pd.DataFrame(columns=['accurasy'])

In [3]:
# Each sentence becomes a list of (word form, UPOS tag) pairs; this is the
# structure passed to the NLTK taggers for training and evaluation below.
fdata_train = [[(token.form, token.upos) for token in sent] for sent in full_train]
fdata_test = [[(token.form, token.upos) for token in sent] for sent in full_test]

# The same evaluation sentences with the tags dropped (word forms only).
fdata_sent_test = [[token.form for token in sent] for sent in full_test]

**1. проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации**

In [4]:
# Tagger classes to compare, ordered from the shortest to the longest context.
taggers = [UnigramTagger, BigramTagger, TrigramTagger]

# Train every tagger on its own (no backoff) and store its score on the
# evaluation sentences under the tagger's class name.
for tag in taggers:
    tagger = tag(fdata_train)
    # The class name is the row label. Reading __name__ replaces the previous
    # approach of splitting str(tag) with the pattern '\.' (an invalid escape
    # sequence in a non-raw string) and stripping punctuation from the repr.
    name_tag = tag.__name__
    # NOTE(review): recent NLTK releases deprecate evaluate() in favour of
    # accuracy(); kept as-is here - confirm against the installed version.
    stata.loc[name_tag, 'accurasy'] = tagger.evaluate(fdata_test)

In [5]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers in which each one falls back to the previous.

    Args:
        train_sents: training data handed unchanged to every tagger class.
        tagger_classes: iterable of callables accepting
            ``(train_sents, backoff=...)``; they are instantiated in order.
        backoff: tagger used as the fallback of the first class (may be None).

    Returns:
        The instance of the last class in ``tagger_classes``, or ``backoff``
        itself when ``tagger_classes`` is empty.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        # Each new tagger wraps everything built so far as its fallback.
        chain = tagger_cls(train_sents, backoff=chain)
    return chain

# Last-resort fallback: a default tagger constructed with the 'NOUN' tag.
backoff = DefaultTagger('NOUN')
# Stack the classes from `taggers` on top of the default tagger (each one
# backing off to the previous) and score the resulting combined tagger.
tag = backoff_tagger(fdata_train, taggers, backoff=backoff)
stata.loc['BackoffTagger', 'accurasy'] = tag.evaluate(fdata_test)

**2. написать свой теггер, как на занятии, попробовать разные векторайзеры, добавить знание не только букв, но и слов**

In [6]:
# Flatten the sentences into parallel token / label lists. A token whose tag
# is None is given the placeholder label 'NO_TAG'.
train_tok = [form for sent in fdata_train for form, _ in sent]
train_label = ['NO_TAG' if upos is None else upos
               for sent in fdata_train for _, upos in sent]

test_tok = [form for sent in fdata_test for form, _ in sent]
test_label = ['NO_TAG' if upos is None else upos
              for sent in fdata_test for _, upos in sent]

# Map tag strings to integer ids. The encoder is fitted on the training labels
# only and then applied to the test labels.
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label)
test_enc_labels = le.transform(test_label)
le.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [7]:
# Vectorizer classes to compare; each one feeds the same classifier setup.
vectorizers = [CountVectorizer, HashingVectorizer, TfidfVectorizer]

for vect in vectorizers:

    # Features are character n-grams (lengths 1 to 5) of each individual token.
    # NOTE(review): no word-level or neighbouring-token features are built
    # here, although the task statement above asks for them.
    vectorizer = vect(ngram_range=(1, 5), analyzer='char')
    X_train = vectorizer.fit_transform(train_tok)
    X_test = vectorizer.transform(test_tok)

    lr = LogisticRegression(random_state=0)
    lr.fit(X_train, train_enc_labels)

    pred = lr.predict(X_test)
    # The class name is the row label. Reading __name__ replaces the previous
    # approach of splitting str(vect) with the pattern '\.' (an invalid escape
    # sequence in a non-raw string) and stripping punctuation from the repr.
    name_vect = vect.__name__
    stata.loc[name_vect, 'accurasy'] = accuracy_score(test_enc_labels, pred)

**3.** сравнить все реализованные методы, сделать выводы

In [8]:
# Display the results table: one row per method added by the cells above.
stata

Unnamed: 0,accurasy
UnigramTagger,0.877254
BigramTagger,0.696306
TrigramTagger,0.248087
BackoffTagger,0.911999
CountVectorizer,0.94998
HashingVectorizer,0.930231
TfidfVectorizer,0.938227
