<a href="https://colab.research.google.com/github/vydra-v-getrax/Chinese_pos_tagging/blob/main/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comparison of POS tagging models for Chinese

## Import data

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import json

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

  from pandas import Panel


### Getting data for gold standard

You don't need that, the dataset has already been annotated.

Go to see: https://docs.google.com/spreadsheets/d/1ZG3xwqC7z857qjFdm3Z02yz968ArbGv7MJ-EDdw3yew/edit?usp=sharing



In [None]:
!pip install fastHan
from fastHan import fastHan
model=FastHan()

In [2]:
def load_gold(path, model):

    """
    This function takes texts from .txt files and annotates them with fastHan
    """

    with open(path, 'r', encoding='utf-8') as f:
        file = f.readlines()
    pairs = [sent.split(' ||| ') for sent in file]
    dct = []
    # ru = []
    for id_sent, pair in tqdm(enumerate(pairs)):
        ru = pair[0]
        zh = ''.join(pair[1].split())
        answer = model(zh, target='POS')[0]
        count = 0
        for k, token in enumerate(answer):
            id_token = [count+i for i in range(len(token[0]))]
            count+= len(token[0])
            dct.append([id_sent, token[0], token[1], id_token, zh, ru])

    res = pd.DataFrame(columns=['id_sent', 'token', 'pos', 'id_token', 'zho', 'rus'], data=dct)
    return res

In [2]:
gold4 = load_gold('pos/alignment/gold4_900.txt')
gold2 = load_gold('pos/alignment/gold2_125.txt')

In [None]:
gold4

Unnamed: 0,id_sent,token,pos,id_token,zho,rus
0,0,它,PN,[0],它没有翅翼。,крыльев у него нет .
1,0,没有,VE,"[1, 2]",它没有翅翼。,крыльев у него нет .
2,0,翅翼,NN,"[3, 4]",它没有翅翼。,крыльев у него нет .
3,0,。,PU,[5],它没有翅翼。,крыльев у него нет .
4,1,“,PU,[0],“是优雅，生机勃勃和32号鞋的鞋后跟——瞧，这是个淫荡的女人吗？,"— изящество , трепет , каблучки тридцать второ..."
...,...,...,...,...,...,...
11795,544,人民,NN,"[34, 35]",你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь..."
11796,544,大会堂,NN,"[36, 37, 38]",你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь..."
11797,544,了,SP,[39],你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь..."
11798,544,。,PU,[40],你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь..."


In [None]:
gold4.to_csv('/pos/alignment/gold4_900.csv')
gold2.to_csv('/pos/alignment/gold4_125.csv')

## Getting russian parallel data

1.   Get sentences from .txt aligned format
2.   Preprocess with pymorphy2





In [None]:
def load_ru(path): 

    with open(path, 'r', encoding='utf-8') as f:
        file = f.readlines()
    ru = [sent.split(' ||| ')[0] for sent in file]
    dct = []
    ru_words = [sent.split() for sent in ru]
    return ru_words


In [None]:
ru_gold2 = load_ru('/pos/alignment/gold2_125.txt')
ru_gold4 = load_ru('/pos/alignment/gold4_900.txt')

In [None]:
!pip install pymorphy2
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |██████                          | 10kB 14.3MB/s eta 0:00:01[K     |███████████▉                    | 20kB 16.8MB/s eta 0:00:01[K     |█████████████████▊              | 30kB 11.8MB/s eta 0:00:01[K     |███████████████████████▋        | 40kB 9.5MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51kB 5.3MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts-ru<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-no

In [None]:
import string
PUNCT = string.punctuation+'...'+'—'+'–'+'«'+'»'+'№'+'--'+'…'   #add PUNCT to Pymorphy2 tags
import re

def tag_rus(data, nick):

    '''
    Annotate russian sentences

    data: list of lists 
    nick: string for nickname of set (for alignment)
    '''

    dct = {'word': [], 'pos': [], 'id_sent': [], 'id_token': []}
    for id_sent, sentence in enumerate(data):
        pos_tags = []
        for id_token, word in enumerate(sentence):
            p = morph.parse(word.replace('ё', 'ё').replace('ё', 'ё').replace('й', 'й'))[0].tag.POS
            if word in PUNCT:
                p = 'PUNCT'
            if re.search('\d', word):
                p = 'NUMR'
            dct['word'].append(word)
            dct['pos'].append(p)
            dct['id_sent'].append(id_sent)
            dct['id_token'].append(id_token)

    pos = pd.DataFrame(columns=['word', 'pos', 'id_sent', 'id_token'], data=dct)
    pos['nickname'] = [nick for i in range(pos.shape[0])]

    return pos

ru2 = tag_rus(ru_gold2, 'gold2')
ru4 = tag_rus(ru_gold4, 'gold4')

In [None]:
ru = pd.concat([ru2, ru4])

## Get gold standard




In [24]:
# This enables access to google spreadsheets

from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials
from gspread_dataframe import set_with_dataframe


gc = gspread.authorize(GoogleCredentials.get_application_default())

In [440]:
my_book = gc.open_by_key('1ZG3xwqC7z857qjFdm3Z02yz968ArbGv7MJ-EDdw3yew') # link to our data

In [None]:
worksheet_list = my_book.worksheets()
col_names = ["num", 'id_sent', 'token', 'pos', 'true_pos', 'id_token', 'zho', 'rus']
stand = pd.DataFrame(columns=col_names)
list_of_counts = []
list_of_df = []
for i in ['gold2_125', 'gold4_365', 'gold4_continue']: # for 'manual' spreadsheet
    my_sheet = my_book.worksheet(i)
    list_of_lists = my_sheet.get_all_values()
    df = pd.DataFrame(data=list_of_lists)
    df.columns = col_names # 0th row to col names
    df['nickname'] = [i for _ in list_of_lists]
    df.drop(df.index[0], inplace=True) # drop 0th row
    list_of_df.append(df)

In [None]:
# This loads all annotated sheets to one pandas df

full = pd.concat([list_of_df[0], list_of_df[1], list_of_df[2]], ignore_index=True)

In [None]:
list_of_df[2].id_sent = list_of_df[2].id_sent.apply(lambda x: int(x)+365)

In [None]:
full.tail()

Unnamed: 0,num,id_sent,token,pos,true_pos,id_token,zho,rus,nickname
21815,11795,909,人民,NN,NN,"[34, 35]",你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь...",gold4_continue
21816,11796,909,大会堂,NN,NN,"[36, 37, 38]",你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь...",gold4_continue
21817,11797,909,了,SP,SP,[39],你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь...",gold4_continue
21818,11798,909,。,PU,PU,[40],你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь...",gold4_continue
21819,11799,909,”,PU,PU,[41],你听，正哭呢。”又说：“她知道这消息之后，也就无状可告了，再也不会闯人民大会堂了。”,"слышите , как рыдает ? она поняла , что теперь...",gold4_continue


### Statistics



In [20]:
# I want to write some data to sheets.

stat = gc.open_by_key('1OS6sKIF-yJQVwXu2RVZH_P7EI4_wreAyXxh9ybf8Kmg') #usable link to stat file

def write_to_stats(my_book, name, data):
    """
    Writes dataframe to sheets.
    my_book : sheet,
    name : name of list
    data: your df
    """
    worksheet = my_book.add_worksheet(title=name, rows="100", cols="100")
    set_with_dataframe(worksheet, data)

In [None]:
write_to_stats(stat, full.columns, 'dataset', full)

In [None]:
# Get statistics 
write_to_stats(stat, ['true_pos', 'number'], 'value_counts', pd.DataFrame(full.true_pos.value_counts()).reset_index())

cols = ['accuracy', 'precision', 'recall', 'f1']
write_to_stats(stat, 'manual_results', pd.DataFrame([accuracy_score(full.true_pos, full.pos), 
                            precision_score(full.true_pos, full.pos, average='macro'),
                            recall_score(full.true_pos, full.pos, average='macro'), 
                            f1_score(full.true_pos, full.pos, average='macro')], index=[cols]).reset_index())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Group by sentences
sentences = full.groupby(['nickname', 'id_sent']).agg({'token':lambda x: list(x),
                                                      'pos': lambda x: list(x), 
                                                      'id_token': lambda x: list(x)})
N_char = sum(full.token.apply(lambda x: len(x)))
N_sent = sentences.count()[0]
N_words = full.shape[0]

print(f'Общее число предложений: {N_sent}\n\
Общее число слов: {N_words}\n\
Общее число символов: {N_char}\n\
\nСредняя длина предложения в словах: {np.mean(sentences.token.apply(lambda x: len(x)))}\n\
Средняя длина предложения в символах: {np.mean(sentences.token.apply(lambda x: len("".join(x))))}\n\
Средня длина слова: {np.mean(full.token.apply(lambda x: len(x)))}')

Общее число предложений: 1036
Общее число слов: 21820
Общее число символов: 31921

Средняя длина предложения в словах: 21.06177606177606
Средняя длина предложения в символах: 30.81177606177606
Средня длина слова: 1.4629239230064162


# Main part

## Load reference table for postags of different tools


In [5]:
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

ref_table = gc.open_by_key('1w7qRF3H2GmFOW5HvMIGrgIBC8RfK17UmkMJpzcOwAaQ')
ref_table = ref_table.get_worksheet(0)
list_table = ref_table.get_all_values()
ref_table = pd.DataFrame(data=list_table)
ref_table.columns = ref_table.iloc[0]
ref_table.drop(ref_table.index[0], inplace=True)
ref_table['fastHan'] = ref_table.fastHan.apply(lambda x: x.strip())

for col in ref_table.iloc[:, 2:]:
    ref_table[col] = ref_table[col].apply(lambda x: x.split(', '))

### Comparison by words

In [3]:
pos_tools = ['ckiptagger', 'pkuseg' ,'fastHan', 'PyNLPIR', 'stanza', 'spacy', 'ltp', 'jiagu', 'lac', 'snownlp']

In [18]:
# How many different tags refer to target set

count_tags = pd.DataFrame(columns=[t for t in pos_tools if t != 'fastHan'], index=ref_table['fastHan'])
for tool in pos_tools:
    if tool != 'fastHan':
        count_tags[tool] = ref_table[tool].apply(lambda x: len(x)).tolist()

In [26]:
write_to_stats(stat, 'count_tags', count_tags)

### Install models

In [None]:
for tool in pos_tools:
    try:
        if tool == 'spacy':
            !pip install -U spacy
            !python -m spacy download zh_core_web_sm
        else:
          !pip install {tool}
    except:
        print("Failed in installing", tool)
        continue

Collecting ckiptagger
  Downloading https://files.pythonhosted.org/packages/6b/bc/5cbf8d019167d5e5e1775069fb8b71a08691ab847e2926bbe7dee9a19010/ckiptagger-0.2.1-py3-none-any.whl
Installing collected packages: ckiptagger
Successfully installed ckiptagger-0.2.1
Collecting pkuseg
[?25l  Downloading https://files.pythonhosted.org/packages/6f/59/09644bdf620738f93520c28d83a67b9550e446705eac6438f444bb6563ca/pkuseg-0.0.25-cp37-cp37m-manylinux1_x86_64.whl (50.2MB)
[K     |████████████████████████████████| 50.2MB 83kB/s 
Installing collected packages: pkuseg
Successfully installed pkuseg-0.0.25
Collecting fastHan
[?25l  Downloading https://files.pythonhosted.org/packages/b7/cf/f604d4c95348d2ec954fba13ab3317444d5117f4c4c386a049ae2305a29c/fastHan-1.7-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 3.6MB/s 
[?25hCollecting FastNLP>=0.5.5
[?25l  Downloading https://files.pythonhosted.org/packages/49/76/c80dc6ba0c29ca0ac7ae8b15d5e443628ed901651b55d4edba9436de59ae/FastNLP-0

In [None]:
from ckiptagger import data_utils, WS, POS
import pkuseg
!pynlpir update
import pynlpir
import stanza
print('import')
stanza.download('zh', processors='tokenize, pos')
import zh_core_web_sm
from ltp import LTP
import jiagu
from LAC import LAC
import snownlp
from snownlp import SnowNLP
from fastHan import FastHan


License updated.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 29.7MB/s]                    
2021-05-22 13:59:58 INFO: "zh" is an alias for "zh-hans"
2021-05-22 13:59:58 INFO: Downloading these customized packages for language: zh-hans (Simplified_Chinese)...
| Processor | Package |
-----------------------
| tokenize  | gsdsimp |
| pos       | gsdsimp |
| pretrain  | gsdsimp |



import


Downloading http://nlp.stanford.edu/software/stanza/1.2.0/zh-hans/tokenize/gsdsimp.pt: 100%|██████████| 1.13M/1.13M [00:00<00:00, 3.16MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/zh-hans/pos/gsdsimp.pt: 100%|██████████| 21.3M/21.3M [00:00<00:00, 23.3MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/zh-hans/pretrain/gsdsimp.pt: 100%|██████████| 306M/306M [00:56<00:00, 5.46MB/s]
2021-05-22 14:01:00 INFO: Finished downloading models and saved to /root/stanza_resources.
  formatvalue=lambda value: "")[1:-1]


## Comparison by words _


In [None]:
words_tests = {} # for tagged

In [None]:
def run_tools(tool, tests): # by words segmented previously by fasthan
    if tool == 'ckiptagger':
        # for CKIPtagger you need its dictionaries. Download them and import in any way
        ws = WS('/content/drive/MyDrive/data/ckiptagger') 
        pos = POS("/content/drive/My Drive/data/ckiptagger")
        words = ws(full.token)
        pos_tags = pos(words)
        tests[tool] = pos_tags

    if tool == 'pkuseg':
        seg = pkuseg.pkuseg(postag=True)
        pku_tags = []
        for word in tqdm(full.token):
            try:
                text = seg.cut(word)
                pku_tags.append([i[1] for i in text])
            except IndexError:
                pku_tags.append('')
        tests[tool] = pku_tags

    if tool == 'PyNLPIR':

        pynlpir.open()
        word_pos = []
        for word in full.token:
            try:
                word_pos.append([i[1] for i in pynlpir.segment(word, pos_names='raw')])
            except:
                word_pos.append('')
        tests[tool] = word_pos
        pynlpir.close()

    if tool == 'stanza':

        nlp = stanza.Pipeline('zh', processors='tokenize, pos')
        word_pos = []
        for word in full.token:
            doc = nlp(word)
            word_pos.append([word.upos for sent in doc.sentences for word in sent.words])
        tests[tool] = word_pos

    if tool == 'spacy':
        nlp = zh_core_web_sm.load()
        word_pos = []

        for word in full.token:
            doc = nlp(word)
            word_pos.append([word.pos_ for word in doc])
        tests[tool] = word_pos

    if tool == 'ltp':
        ltp = LTP()
        seg, hidden = ltp.seg(full.token.tolist())
        pos = ltp.pos(hidden)
        word_pos = []
        for i in range(len(seg)):
            word_pos.append([word for word in pos[i]])
        tests[tool] = word_pos

    if tool == 'jiagu':
        word_pos = []
        for word in full.token:
            try:
                word_pos.append(jiagu.pos([word]))
            except:
                word_pos.append('')
        tests[tool] = word_pos

    if tool == 'lac':
        lac = LAC(mode='rank')
        word_pos = []
        for word in full.token:
            try:
                word_pos.append(lac.run(word)[1])
            except:
                word_pos.append([''])
        tests[tool] = word_pos

    if tool == 'snownlp':

        word_pos = []

        for word in full.token:
            try:
                word_pos.append([i[1] for i in SnowNLP(word).tags])
            except:
                word_pos.append('')
        tests[tool] = word_pos

    if tool == 'fastHan':
        model=FastHan()
        word_pos = []
        for word in full.token:
            word_pos.append([tag[3] for tag in model(word, target="Parsing")[0]])
        tests[tool] = word_pos

In [124]:
# This wil take around 20 minutes

for tool in pos_tools:
    print(f'Processing {tool} ...\n')
    run_tools(tool, words_tests)

Processing ckiptagger ...





Processing pkuseg ...



Downloading: "https://github.com/lancopku/pkuseg-python/releases/download/v0.0.16/postag.zip" to /root/.pkuseg/postag.zip
100%|██████████| 41424981/41424981 [00:00<00:00, 70661572.57it/s]
100%|██████████| 21820/21820 [00:03<00:00, 7160.86it/s]


Processing fastHan ...



  0%|          | 0.00/144M [00:00<?, ?B/s]

http://212.129.155.247/fasthan/fasthan_base.zip not found in cache, downloading to /tmp/tmp4wmehv9f


100%|██████████| 144M/144M [00:14<00:00, 9.93MB/s]


Finish download from http://212.129.155.247/fasthan/fasthan_base.zip
Copy file to /root/.fastNLP/fasthan/fasthan_base
loading vocabulary file /root/.fastNLP/fasthan/fasthan_base/vocab.txt
Load pre-trained BERT parameters from file /root/.fastNLP/fasthan/fasthan_base/model.bin.
Processing PyNLPIR ...



2021-05-22 14:29:40 INFO: "zh" is an alias for "zh-hans"
2021-05-22 14:29:40 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsdsimp |
| pos       | gsdsimp |

2021-05-22 14:29:40 INFO: Use device: cpu
2021-05-22 14:29:40 INFO: Loading: tokenize
2021-05-22 14:29:40 INFO: Loading: pos


Processing stanza ...



2021-05-22 14:29:46 INFO: Done loading processors!


Processing spacy ...

Processing ltp ...



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=164437832.0, style=ProgressStyle(descri…




  ] for encoding in tokenized.encodings]
  for idx, word_idx in enumerate(encoding.words) if word_idx is not None


Processing jiagu ...

Processing lac ...

Processing snownlp ...



In [141]:
# Saving tagged words

with open('/content/drive/MyDrive/data/words.json', 'w', encoding='utf-8') as f:
    json.dump(words_tests, f, ensure_ascii=False) 

### Metrics on words


In [179]:
### How many splitted words into several?

words_errors = {}
for tool in pos_tools:
    err = 0
    for i in range(len(words_tests['fastHan'])):
        try:
            if len(words_tests[tool][i]) != 1:
                err += 1
        except:
            err += 1
    words_errors[tool] = err

In [196]:
write_to_stats(stat, 'words_errors', 
               pd.DataFrame(data=[words_errors, {k: v/N_words for k, v in words_errors.items()}], 
                            index=['absolute', 'mean']).reset_index())

In [204]:
df_words = pd.DataFrame(words_tests)
df_words['target'] = full.true_pos

In [246]:
def mostFreq(tool, df):
"""
Collect info of the most probable tag 
"""

    most_freq = {}
    for tags in ref_table[tool]:
        for tag in tags:
            if not tag in most_freq:
                try:
                    d = df[df[tool].apply(lambda x: tag in x)]['target'].value_counts()
                    most_freq[tag] = (d.index[0], d[0])
                except:
                    for t, tar in zip(ref_table[tool], ref_table['fastHan']):
                        if tag in t:
                            most_freq[tag] = tar
    return most_freq

In [247]:
# Get dictionary 

words_freq_dict = {i: {} for i in pos_tools}
for tool in pos_tools:
    if tool != 'fastHan':
        words_freq_dict[tool] = mostFreq(tool, df_words)

In [349]:
## Compare tags

def compare(REF, result, test, model, test_name, model_name='fastHan'):
    """
    REF: reference tags table
    result: where to write results
    test: result of a tool
    model: target

    """
    idx = 0
    count = 0
    predict = []
    errors = []
    for m, t in tqdm(zip(model, test)):
        idx += 1
        try:
            if isinstance(t, list):
                for tag in t: 
                    new = tag
                    if tag in REF[REF[model_name] == m][test_name].iloc[0]:
                        new = m
                        break
                    else:
                        new = tag
                        errors.append((idx, m, t))
            else:
                if t in REF[REF[model_name] == m][test_name].iloc[0]:
                    new = m
                else:
                    new = t
                    errors.append((idx, m, t))
            predict.append(new)
        except:
            errors.append((idx, m, t))
            predict.append('')
    predict = [i if i is not None else '' for i in predict]
    result[test_name] = [accuracy_score(predict, model),
                                precision_score(predict, model, average='macro'), 
                                recall_score(predict, model, average='macro'), 
                                f1_score(predict, model, average='macro'), 
                                errors]

In [348]:
words_results = pd.DataFrame(columns = pos_tools)
# words_results.to_csv('/content/drive/MyDrive/data/words_results.csv') #checkpoint

In [None]:
for tool in pos_tools:
    print(f'\nProcessing {tool}\n')
    compare(ref_table, words_results, words_tests[tool], df_words.target, tool)

In [360]:
words_results

Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp
0,0.674427,0.812007,0.74505,0.842438,0.80055,0.87516,0.815078,0.835564,0.835105,0.780339
1,0.290183,0.362862,0.426926,0.230922,0.352938,0.444851,0.429876,0.356713,0.414455,0.302276
2,0.416077,0.46772,0.494117,0.28866,0.550862,0.539844,0.568833,0.469697,0.553571,0.422535
3,0.306982,0.390096,0.388543,0.250393,0.384092,0.474718,0.468394,0.387557,0.447759,0.331136
4,"[(10, M, [FW]), (11, NN, [FW]), (15, CD, [Neqa...","[(10, M, [n]), (16, VC, [v]), (18, M, [n]), (3...","[(5, DEC, [SP]), (6, NN, [JJ, NN]), (9, OD, [C...","[(10, M, [n]), (18, M, [n]), (40, SP, [ude1]),...","[(7, SB, [VERB]), (9, OD, [NUM]), (21, OD, [NU...","[(6, NN, [PROPN]), (7, SB, [X]), (9, OD, [NUM]...","[(10, M, [n]), (18, M, [n]), (24, DT, [d]), (2...","[(10, M, [n]), (16, VC, [vl]), (18, M, [n]), (...","[(6, NN, [a, n]), (10, M, [n]), (18, M, [n]), ...","[(1, NR, [j, Rg, Rg]), (1, NR, [j, Rg, Rg]), (..."


In [361]:
## Get examples

for tool in pos_tools:
    print(f'Ошибки работы {tool}: ')
    for i in range(10):
        print(words_results[tool][4][i])
    

Ошибки работы ckiptagger: 
(10, 'M', ['FW'])
(11, 'NN', ['FW'])
(15, 'CD', ['Neqa'])
(17, 'CD', ['FW'])
(18, 'M', ['FW'])
(22, 'NN', ['FW'])
(25, 'CD', ['Neqa'])
(26, 'CC', ['P'])
(29, 'NN', ['FW'])
(33, 'VV', ['FW'])
Ошибки работы pkuseg: 
(10, 'M', ['n'])
(16, 'VC', ['v'])
(18, 'M', ['n'])
(37, 'VC', ['v'])
(42, 'AD', ['c'])
(43, 'VC', ['v'])
(46, 'VV', ['u', 'v'])
(49, 'PN', ['c'])
(62, 'VV', ['u'])
(64, 'JJ', ['m'])
Ошибки работы fastHan: 
(5, 'DEC', ['SP'])
(6, 'NN', ['JJ', 'NN'])
(9, 'OD', ['CD'])
(10, 'M', ['NN'])
(18, 'M', ['NN'])
(19, 'DEG', ['SP'])
(21, 'OD', ['CD'])
(23, 'DEG', ['SP'])
(24, 'DT', ['AD'])
(26, 'CC', ['VV'])
Ошибки работы PyNLPIR: 
(10, 'M', ['n'])
(18, 'M', ['n'])
(40, 'SP', ['ude1'])
(42, 'AD', ['cc'])
(46, 'VV', ['usuo', 'v'])
(49, 'PN', ['rzv'])
(59, 'NN', ['m'])
(64, 'JJ', ['d'])
(73, 'BA', ['v'])
(76, 'VV', ['p'])
Ошибки работы stanza: 
(7, 'SB', ['VERB'])
(9, 'OD', ['NUM'])
(21, 'OD', ['NUM'])
(26, 'CC', ['ADP'])
(28, 'P', ['VERB'])
(30, 'LC', ['NOUN'])

## Comparison by sentences



In [362]:
full['id_sent'] = full.id_sent.apply(int)

In [369]:
sentences = full.groupby(['nickname', 'id_sent']).agg({'token':lambda x: list(x),
                                                      'pos': lambda x: list(x), 
                                                      'id_token': lambda x: list(x)})
sentences['id_sent'] = [i[1] for i in sentences.index]


In [386]:
def get_index(inp):
    lst = [[] for i in inp]
    count = 0
    for k, word in enumerate(inp):
        lst[k] = [count+i for i in range(len(word))]
        count+= len(word)
    return lst

In [390]:
sentences.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,token,pos,id_token,id_sent,sent
nickname,id_sent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gold2_125,0,"[洛什卡列夫, 一, 家, 住, 的, 简易房, 被, 叫做, 三, 号, 楼, ，, 它,...","[NR, CD, NN, VV, DEC, NN, SB, VV, OD, M, NN, P...","[[0, 1, 2, 3, 4], [5], [6], [7], [8], [9, 10, ...",0,洛什卡列夫一家住的简易房被叫做三号楼，它有一半是两层的，二楼的另一半和楼梯在战争中就被毁了—...
gold2_125,1,"[从, 那, 以后, ，, 要, 到, 二, 楼, 剩, 下来, 的, 部分, 去, 就, ...","[P, PN, LC, PU, VV, VV, OD, NN, VV, VV, DEC, N...","[[0], [1], [2, 3], [4], [5], [6], [7], [8], [9...",1,从那以后，要到二楼剩下来的部分去就得爬单梯。洛什卡列夫从军队医院回来之后把这梯子给加固了。
gold2_125,2,"[脚, 下, 的, 路面, 微微, 发亮, 。]","[NN, LC, DEG, NN, AD, VV, PU]","[[0], [1], [2], [3, 4], [5, 6], [7, 8], [9]]",2,脚下的路面微微发亮。
gold2_125,3,"[这, 部分, 城郊, 当时, 还, 很, 偏僻, ，, 没有, 多少, 房屋, ，, 遍地...","[DT, CD, NN, NT, AD, AD, VA, PU, VE, CD, NN, P...","[[0], [1, 2], [3, 4], [5, 6], [7], [8], [9, 10...",3,这部分城郊当时还很偏僻，没有多少房屋，遍地是杂草。
gold2_125,4,"[她, 的, 瞳孔, 睁, 得, 这么, 大, ，, 几乎, 和, 黑色, 的, 虹膜圈, ...","[PN, DEG, NN, VV, DER, AD, VA, PU, AD, P, JJ, ...","[[0], [1], [2, 3], [4], [5], [6, 7], [8], [9],...",4,她的瞳孔睁得这么大，几乎和黑色的虹膜圈合在一起了，而且眼睛显得不是灰色的，而是黑色的。


In [388]:
sentences['id_token'] = sentences.token.progress_apply(get_index)




100%|██████████| 1036/1036 [00:00<00:00, 25823.53it/s]


In [389]:
sentences['sent'] = sentences.token.apply(lambda x: ''.join(x))
sent_tests = {}

In [392]:
import time

In [393]:
# col_sents = ['words', 'pos', 'indices']

def run_tools_sent(tool): # by words segmented previously by fasthan
    print(f'Preprocessing with {tool}...', '\n')
    if tool == 'ckiptagger':
        from ckiptagger import data_utils, WS, POS
        ws = WS('/content/drive/MyDrive/data/ckiptagger')
        pos = POS("/content/drive/My Drive/data/ckiptagger")
        words = ws(sentences.sent)
        start_time = time.clock()
        pos_tags = pos(words)
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'pkuseg':
        seg = pkuseg.pkuseg(postag=True)
        pos_tags = []
        words = []
        start_time = time.clock()

        for sent in tqdm(sentences.sent):
            try:
                text = seg.cut(sent)
                pos_tags.append([i[1] for i in text])
                words.append([i[0] for i in text])
            except IndexError:
                pos_tags.append([''])
        print(f'Время выполнения: {time.clock() - start_time}')

        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'PyNLPIR':
        pynlpir.open()
        pos_tags = []
        words = []      
        start_time = time.clock()  
        for sent in sentences.sent:
            try:
                text = pynlpir.segment(sent, pos_names='raw')
                words.append([i[0] for i in text])
                pos_tags.append([i[1] for i in text])
            except:
                pos_tags.append([''])
                words.append([''])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]
        pynlpir.close()

    if tool == 'stanza':
        nlp = stanza.Pipeline('zh', processors='tokenize, pos')
        pos_tags = []
        words = []
        xpos = []
        start_time = time.clock()
        for word in sentences.sent:
            doc = nlp(word)
            pos_tags.append([word.upos for sent in doc.sentences for word in sent.words])
            xpos.append([word.xpos for sent in doc.sentences for word in sent.words])
            words.append([word.text for sent in doc.sentences for word in sent.words])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent), xpos]


    if tool == 'spacy':
        nlp = zh_core_web_sm.load()
        pos_tags = []
        words = []
        xpos = []
        start_time = time.clock()
        for word in sentences.sent:
            doc = nlp(word)
            pos_tags.append([word.pos_ for word in doc])
            xpos.append([word.tag_ for word in doc])
            words.append([word.text for word in doc])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent), xpos]

    if tool == 'ltp':
        ltp = LTP()
        seg, hidden = ltp.seg(sentences.sent.tolist())
        pos = ltp.pos(hidden)
        words = []
        pos_tags = []
        start_time = time.clock()
        for i in range(len(seg)):
            pos_tags.append([word for word in pos[i]])
            words.append([word for word in seg[i]])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'jiagu':
        pos_tags = []
        words = []
        start_time = time.clock()
        for sent in sentences.sent:
            try:
                seg = jiagu.seg(sent)
                pos_tags.append(jiagu.pos(seg))
                words.append(seg)
            except:
                pos_tags.append([''])
                words.append([''])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'lac':
        lac = LAC(mode='rank')
        pos_tags = []
        words = []
        start_time = time.clock()
        for sent in sentences.sent:
            doc = lac.run(sent)
            try:
                pos_tags.append(doc[1])
                words.append(doc[0])
            except:
                pos_tags.append([''])
                words.append([''])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'snownlp':
        pos_tags = []
        words = []
        start_time = time.clock()
        for sent in sentences.sent:
            # doc = SnowNLP(sent).tags
            try:
                pos_tags.append([i[1] for i in SnowNLP(sent).tags])
                words.append([i[0] for i in SnowNLP(sent).tags])
            except:
                words.append([''])
                pos_tags.append([''])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

    if tool == 'fastHan':
        model=FastHan()
        words = []
        pos_tags = []
        start_time = time.clock()
        for sent in sentences.sent:
            doc = model(sent, target="Parsing")[0]
            pos_tags.append([tag[3] for tag in doc])
            words.append([tag[0] for tag in doc])
        print(f'Время выполнения: {time.clock() - start_time}')
        sent_tests[tool] = [words, pos_tags, list(sentences.id_sent)]

In [394]:
for tool in pos_tools:
    run_tools_sent(tool)

Preprocessing with ckiptagger... 





21092it [1:12:33,  4.84it/s]
6022it [48:10,  2.08it/s] 
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


Время выполнения: 45.717923000000155
Preprocessing with pkuseg... 



 97%|█████████▋| 1003/1036 [00:01<00:00, 625.25it/s]
100%|██████████| 1036/1036 [00:01<00:00, 567.91it/s]


Время выполнения: 1.8266349999998965
Preprocessing with fastHan... 



 67%|██████▋   | 696/1036 [09:40<00:01, 170.73it/s]

loading vocabulary file /root/.fastNLP/fasthan/fasthan_base/vocab.txt


 67%|██████▋   | 696/1036 [09:41<00:01, 170.73it/s]

Load pre-trained BERT parameters from file /root/.fastNLP/fasthan/fasthan_base/model.bin.




Время выполнения: 156.4073410000001
Preprocessing with PyNLPIR... 



2021-05-22 17:23:53 INFO: "zh" is an alias for "zh-hans"
2021-05-22 17:23:53 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package |
-----------------------
| tokenize  | gsdsimp |
| pos       | gsdsimp |

2021-05-22 17:23:53 INFO: Use device: cpu
2021-05-22 17:23:53 INFO: Loading: tokenize
2021-05-22 17:23:53 INFO: Loading: pos


Время выполнения: 0.3600859999996828
Preprocessing with stanza... 



2021-05-22 17:23:59 INFO: Done loading processors!


Время выполнения: 41.30156699999998
Preprocessing with spacy... 





Время выполнения: 10.520791999999801
Preprocessing with ltp... 



  ] for encoding in tokenized.encodings]
  for idx, word_idx in enumerate(encoding.words) if word_idx is not None


Время выполнения: 0.004472999999961758
Preprocessing with jiagu... 





Время выполнения: 4.502116000000115
Preprocessing with lac... 





Время выполнения: 2.8567549999997937
Preprocessing with snownlp... 

Время выполнения: 41.6698779999997




In [398]:
# with open('/content/drive/MyDrive/data/sentences.json', 'w', encoding='utf-8') as f:
#     json.dump(sent_tests, f, ensure_ascii=False)

In [6]:
# with open('/content/drive/MyDrive/data/sentences.json') as f:
    # sent_tests = json.load(f)

In [402]:
chars = [] # all characters in a row
for token in full.token:
    for char in list(token):
        chars.append(char)

In [404]:
def squeeze(sentences): # tags for each char
    lst_char = []
    sents, pos_tags = sentences[0], sentences[1]

    for k1, sent in enumerate(sents):
        for k2, word in enumerate(sent):
            for k3, char in enumerate(list(word)):
                lst_char.append(pos_tags[k1][k2])

    return lst_char


In [412]:
## true tags in a row

true = []
sent = [] # sent_ids
for i, token in enumerate(full.token):
    for char in token:
        true.append(full.true_pos[i])
        sent.append(full.id_sent[i])

In [406]:
df_char = pd.DataFrame(columns = pos_tools)
for tool in pos_tools:
    print(tool)
    df_char[tool] = squeeze(sent_tests[tool])

ckiptagger
pkuseg
fastHan
PyNLPIR
stanza
spacy
ltp
jiagu
lac
snownlp


In [413]:
df_char['target'] = true
df_char['char'] = chars
df_char['sent'] = sent

In [415]:
df_char.head()

Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp,target,char,sent
0,Nb,nr,NR,nrf,PROPN,PROPN,nh,nh,PER,j,NR,洛,0
1,Nb,nr,NR,nrf,PROPN,PROPN,nh,m,PER,nz,NR,什,0
2,Nb,nr,NR,nrf,PROPN,PROPN,nh,n,PER,nz,NR,卡,0
3,Nb,nr,NR,nrf,PROPN,PROPN,nh,v,PER,nx,NR,列,0
4,Nb,nr,NR,nrf,PROPN,PROPN,nh,n,PER,nx,NR,夫,0


In [535]:
ref_char = pd.DataFrame(columns=pos_tools) # create table with results

def compare_char(REF, test, model, test_name, model_name='fastHan'):
    idx = 0
    count = 0
    predict = []
    errors = []
    for m, t in tqdm(zip(model, test)):
        if t != '':
            try:
                if t in REF[REF[model_name] == m][test_name].iloc[0]:
                    new = m
                else:
                    new = t
                    errors.append((idx, m, t))
            except IndexError:
                new = t
        else:
            new = t
        predict.append(new)

    ref_char[test_name] = predict

In [427]:
for tool in pos_tools:
    print(tool)
    compare_char(ref_table, df_char[tool], df_char['target'], tool)
ref_char['target'] = true
ref_char['char'] = df_char.char

In [428]:
ref_char[ref_char.target == 'PU'][ref_char.ltp != 'PU']

  """Entry point for launching an IPython kernel.


Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp,target,char
3177,I,PU,PU,PU,PU,PU,z,PU,PU,PU,PU,“
3955,PU,PU,PU,PU,VERB,PU,m,PU,PU,Bg,PU,…
3956,PU,PU,PU,PU,PART,PU,m,PU,PU,Bg,PU,…
3957,P,PU,PU,PU,PU,PU,m,PU,PU,Bg,PU,”
6291,PU,PU,PU,PU,PU,PU,b,PU,PU,PU,PU,—
6292,PU,PU,PU,PU,PU,PU,b,PU,PU,PU,PU,—
6872,VE,PU,PU,PU,PU,PU,d,PU,PU,PU,PU,”
9356,Na,PU,PU,PU,PU,NOUN,n,PU,PU,PU,PU,·
11310,PU,PU,PU,PU,PU,PU,v,PU,PU,PU,PU,—
11311,PU,PU,PU,PU,PU,PU,v,PU,PU,PU,PU,—


In [449]:
# ref_char.to_csv('/content/drive/MyDrive/data/ref_char.csv', sep='\t')
# df_char.to_csv('/content/drive/MyDrive/data/df_char.csv', sep='\t')

In [432]:
res_sents = pd.DataFrame(columns=pos_tools, index=['accuracy', 'precision', 'recall', 'f1_score'])
for tool in pos_tools:
    res_sents[tool] = [accuracy_score(ref_char[tool], ref_char.target),
                                precision_score(ref_char[tool], ref_char.target, average='macro'), 
                                recall_score(ref_char[tool], ref_char.target, average='macro'), 
                                f1_score(ref_char[tool], ref_char.target, average='macro')]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [559]:
# res_sents.to_csv('/content/drive/MyDrive/data/res_sents.csv')
# res_sents = pd.read_csv('/content/drive/MyDrive/data/res_sents.csv')

In [560]:
res_sents # Preliminar results

Unnamed: 0.1,Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp
0,accuracy,0.66367,0.8277,0.976035,0.830801,0.807024,0.871558,0.819053,0.780865,0.847373,0.703267
1,precision,0.276166,0.388939,0.899176,0.223502,0.41943,0.443491,0.451724,0.369516,0.43032,0.27394
2,recall,0.379683,0.5,0.919571,0.27193,0.604167,0.571429,0.578947,0.515625,0.551724,0.384615
3,f1_score,0.290562,0.424474,0.907689,0.243067,0.46811,0.484565,0.496496,0.410596,0.468906,0.3094


### Replace wrong tags by the most frequent match


In [450]:
## наиболее вероятное значение тега
def mostFreq(tool):
    most_freq = {}
    for tags in ref_table[tool]:
        for tag in tags:
            if not tag in most_freq:
                try:
                    most_freq[tag] = df_char[df_char[tool] == tag]['target'].value_counts().index[0]
                except:
                    for t, tar in zip(ref_table[tool], ref_table['fastHan']):
                        if tag in t:
                            most_freq[tag] = tar
    return most_freq

In [451]:
mostFreqDict = {i: {} for i in pos_tools}
for tool in pos_tools:
    mostFreqDict[tool] = mostFreq(tool)

In [452]:
### Replace
for tool in pos_tools:
    print(tool)
    for k, pair in enumerate(zip(ref_char[tool], ref_char.target)):
        tag, tar = pair[0], pair[1]
        if tag != tar:
            if tag not in mostFreqDict[tool]:
                continue
            else:
                ref_char[tool][k] = mostFreqDict[tool][tag]

ckiptagger
pkuseg
fastHan
PyNLPIR
stanza
spacy
ltp
jiagu
lac
snownlp


In [548]:
# with open('/content/drive/MyDrive/data/frequent_replacement.json', 'w', encoding='utf-8') as f:
#     json.dump(mostFreqDict, f, ensure_ascii=False)

In [459]:
## Results after replacement
res_sents = pd.DataFrame(columns=pos_tools, index=['accuracy', 'precision', 'recall', 'f1_score'])
for tool in pos_tools:
    res_sents[tool] = [accuracy_score(ref_char[tool], ref_char.target),
                                precision_score(ref_char[tool], ref_char.target, average='macro'), 
                                recall_score(ref_char[tool], ref_char.target, average='macro'), 
                                f1_score(ref_char[tool], ref_char.target, average='macro')]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [460]:
res_sents

Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp
accuracy,0.717647,0.846684,0.976035,0.8631,0.807024,0.873907,0.833683,0.800758,0.859309,0.753078
precision,0.661129,0.693166,0.899176,0.618964,0.592136,0.64836,0.760681,0.703405,0.736919,0.548268
recall,0.77559,0.795851,0.919571,0.633303,0.755763,0.761584,0.846003,0.830302,0.872074,0.62524
f1_score,0.684558,0.720596,0.907689,0.61632,0.612906,0.674977,0.781148,0.724648,0.769829,0.56163


## Parallel alignment


In [475]:
# align tags 
def ruRef(table, model, test, model_name, test_name):
    tags = []
    for m, t in zip(model, test):
        if t in table[table[model] == m][test].iloc[0]:
            new = m
        else:
            new = t
        tags.append(new)
    return tag_rus

In [476]:
# load phar files
def open_phar(path):
    with open(path, 'r', encoding='utf-8') as f:
        phar = f.readlines()
    return [p.split() for p in phar]

In [477]:
phar125 = open_phar('/pos/alignment/gold2_125.phar')
phar365 = open_phar('pos/alignment/gold4_900.phar')

In [492]:
# to concatenate later
char_125 = sum(sentences.loc['gold2_125'].token.apply(lambda x: sum([len(i) for i in x])))
char_900 = sum(sentences.loc['gold4_365'].token.apply(lambda x: sum([len(i) for i in x])))+sum(sentences.loc['gold4_continue'].token.apply(lambda x: sum([len(i) for i in x])))


In [493]:
## Get aligned tags

def align(zh, ru, phar, N):
    '''
    zh -> 
    '''
    length = len(zh)
    ruword = ['' for i in range(N)]
    rupos = ['' for i in range(N)]
    count = 0
    for id_sent in range(length):
        df = zh[zh.id_sent == id_sent][['id_token', 'pos']]
        for lst in df.id_token[0]:
            for id_token in lst:
                id_token = int(id_token)
                for pair in phar[id_sent]:
                    pair = [int(i) for i in pair.split('-')]
                    if id_token == pair[1]:
                        try:
                            ruword[count] = ru[(ru.id_sent == id_sent) & (ru.id_token == pair[0])].word.iloc[0]
                            rupos[count] = ru[(ru.id_sent == id_sent) & (ru.id_token == pair[0])].pos.iloc[0]
                        except:
                            print(count)

                count += 1
    return ruword, rupos

In [500]:
a, b = align(sentences[:126], ru[ru.nickname == 'gold2'], phar125, char_125)
c, d = align(sentences[126:], ru[(ru.nickname == 'gold4')|(ru.nickname == 'gold4_continue')], phar365, char_900)

In [519]:
ref_char['pymorphy'] = b+d
ref_char['rus'] = a+c
ref_char['pymorphy'] = ref_char.pymorphy.apply(lambda x: str(x))

In [5]:
# ref_char.to_csv('/content/drive/MyDrive/data/parallel_rus_22.csv') ##ckeckpoint
# ref_char = pd.read_csv('/content/drive/MyDrive/data/parallel_rus_22.csv') #load back

In [29]:
# Accuracy by proper nouns

N_nr = ref_char[ref_char.target=='NR'].shape[0]
NRs = pd.DataFrame(columns = pos_tools)
for tool in pos_tools:
    a = ref_char[(ref_char.target=='NR')&(ref_char[tool] == 'NR')].shape[0]/N_nr
    NRs[tool] = [a]
NRs

Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp
0,0.672262,0.824591,0.986991,0.835501,0.866135,0.728493,0.874528,0.405791,0.928242,0.602602


### Describe parallel


In [7]:
print(f'Число сопоставленных символов: {ref_char[ref_char.pymorphy != ""].shape[0]}')
print(f'Процент сопоставленных символов: {ref_char[ref_char.pymorphy != ""].shape[0]/N_char}')

Число сопоставленных символов: 31921


In [528]:
print('Среди сопоставленных слов преобладают части речи: ')
ref_char[ref_char.pymorphy != ""].target.value_counts()

Среди сопоставленных слов преобладают части речи: 


NN     2511
VV     2219
PU     1460
AD     1163
NR      827
PN      606
VA      402
P       242
JJ      231
AS      205
CD      183
DT      144
M       140
DEC     138
DEG     138
LC      130
NT      115
SP       94
CC       76
VC       61
VE       57
DEV      56
CS       34
OD       21
IJ       20
DER      18
SB       12
MSP       6
BA        6
LB        5
ON        4
Name: target, dtype: int64

### Transfer russian tags to chinese


In [551]:
ref_char = ref_char.fillna('')

In [562]:
## Get correct tags
compare_char(ref_table, ref_char['pymorphy'], ref_char['target'], 'pymorphy')
res_sents['pymorphy'] = [accuracy_score(ref_char['pymorphy'], ref_char.target),
                                precision_score(ref_char['pymorphy'], ref_char.target, average='macro'), 
                                recall_score(ref_char['pymorphy'], ref_char.target, average='macro'), 
                                f1_score(ref_char['pymorphy'], ref_char.target, average='macro')]

31921it [00:06, 5140.21it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [542]:
# ref_char.to_csv('/content/drive/MyDrive/data/parallel_DONE.csv') #вариант после перевода

In [546]:
print(f'Теги соответствуют верным в {ref_char[ref_char.target == ref_char.pymorphy].shape[0]} случаях из {N_char}')

Теги соответствуют верным в 7300 случаях из 31921


In [None]:
df_char['pymorphy'] = a.pymorphy

In [555]:
mostFreqDict['pymorphy'] = mostFreq('pymorphy')

In [556]:
### Replace with most fequent
for tool in ['pymorphy']:
    print(tool)
    for k, pair in enumerate(zip(ref_char[tool], ref_char.target)):
        tag, tar = pair[0], pair[1]
        if tag != tar:
            if tag not in mostFreqDict[tool]:
                continue
            else:
                ref_char[tool][k] = mostFreqDict[tool][tag]

pymorphy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [573]:
def metrics(yt, yp):
    return [accuracy_score(yt, yp),
                                precision_score(yt, yp, average='macro'), 
                                recall_score(yt, yp, average='macro'), 
                                f1_score(yt, yp, average='macro')]

In [557]:
# Add metrics

res_sents['pymorphy_new'] = [accuracy_score(ref_char['pymorphy'], ref_char.target),
                                precision_score(ref_char['pymorphy'], ref_char.target, average='macro'), 
                                recall_score(ref_char['pymorphy'], ref_char.target, average='macro'), 
                                f1_score(ref_char['pymorphy'], ref_char.target, average='macro')]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# ref_char.to_csv('/content/drive/MyDrive/data/parallel.csv')
# ref_char = pd.read_csv('/content/drive/MyDrive/data/parallel.csv')

### Apply Russian tags only on aligned parts


In [567]:
ref_char['experiment'] = [tag if tag != '' else '' for i, tag in enumerate(ref_char.pymorphy)]

In [574]:
res_sents['pymorphy_only_exist'] = metrics(ref_char[ref_char.experiment != ''].target, 
                                           ref_char[ref_char.experiment != ''].pymorphy)
# accuracy_score(ref_char[ref_char.experiment != ''].target, ref_char[ref_char.experiment != ''].pymorphy)
## очень неплохо :)

  _warn_prf(average, modifier, msg_start, len(result))


In [576]:
## fastHan + Pymorphy2 if fastHan is wrong

ref_char['combo'] = [tag if tag == ref_char.target[i] else ref_char.pymorphy[i] for i, tag in enumerate(ref_char.fastHan)]

In [578]:
res_sents['combo'] = metrics(ref_char.target, ref_char.combo)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [579]:
## Full results
res_sents 

Unnamed: 0.1,Unnamed: 0,ckiptagger,pkuseg,fastHan,PyNLPIR,stanza,spacy,ltp,jiagu,lac,snownlp,pymorphy,pymorphy_only_exist,combo
0,accuracy,0.66367,0.8277,0.976035,0.830801,0.807024,0.871558,0.819053,0.780865,0.847373,0.703267,0.253657,0.71503,0.976692
1,precision,0.276166,0.388939,0.899176,0.223502,0.41943,0.443491,0.451724,0.369516,0.43032,0.27394,0.124686,0.71186,0.914032
2,recall,0.379683,0.5,0.919571,0.27193,0.604167,0.571429,0.578947,0.515625,0.551724,0.384615,0.630504,0.405064,0.875544
3,f1_score,0.290562,0.424474,0.907689,0.243067,0.46811,0.484565,0.496496,0.410596,0.468906,0.3094,0.190675,0.455018,0.892843
