# I этап: подготовка текстов для НКРЯ и Balapan corpora (сайта)

## 0. Предочистка файла

In [None]:
import re
import xml.etree.ElementTree as ET

def preprocess_xml(input_file):
    """Strip alignment artefacts from a parallel-corpus XML file.

    The cleaned tree is written next to the input as
    ``<name>_precleaned<ext>`` (UTF-8, with an XML declaration).

    Clean-up steps:
      * remove ``~~~`` markers and bracketed numbers such as ``[12]`` from the
        text of every ``<se>`` that is a direct child of a ``<para>``;
      * drop every ``<weight>`` child of a ``<para>``;
      * drop the ``variant_id`` attribute from every ``<se>``;
      * relabel ``<se lang="fr">`` as ``lang="kk"``.

    Args:
        input_file: path to the source XML file.

    Returns:
        None. The result is written to disk.
    """
    import os

    # Only the real extension is touched; a plain str.replace('.xml', ...)
    # would also rewrite '.xml' inside directory names and would leave the
    # name unchanged (overwriting the input) for e.g. '.XML'.
    stem, ext = os.path.splitext(input_file)
    output_file = f'{stem}_precleaned{ext}'

    tree = ET.parse(input_file)
    root = tree.getroot()

    for para in root.findall('.//para'):
        for se in para.findall('se'):
            if se.text is not None:
                # Both kinds of artefact are removed in one pass; a segment
                # may contain '~~~' and '[N]' at the same time.
                se.text = re.sub(r'~~~|\[\d+\]', '', se.text)
        # findall() returns a list, so removing children here is safe.
        for weight in para.findall('weight'):
            para.remove(weight)

    for se in root.findall('.//se'):
        se.attrib.pop('variant_id', None)
        # NOTE(review): presumably the aligner mislabels Kazakh as 'fr' -- confirm.
        if se.get('lang') == 'fr':
            se.set('lang', 'kk')

    tree.write(output_file, encoding='utf-8', xml_declaration=True)

### Сразу конвертируем очищенный XML в JSON в формате сайта

In [None]:
import xml.etree.ElementTree as ET
import json

def convert_xml_to_json(input_file, output_file):
    """Convert an aligned XML file into JSON Lines for the site.

    Every ``<para>`` becomes one JSON object on its own line with the keys
    ``id`` (the paragraph's ``id`` attribute), ``kk`` and ``ru``.

    The first ``<se>`` child of a paragraph is taken as the Kazakh text and
    the second as the Russian text; the ``lang`` attribute is not consulted.
    A paragraph with fewer than two ``<se>`` children raises ``IndexError``.

    Args:
        input_file: path to the UTF-8 XML file to read.
        output_file: path of the JSON Lines file to write (UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        root = ET.fromstring(source.read())

    records = []
    for para in root.findall('.//para'):
        identifier = para.get('id')
        segments = para.findall('se')
        records.append({
            'id': identifier,
            'kk': segments[0].text,
            'ru': segments[1].text,
        })

    with open(output_file, 'w', encoding='utf-8') as sink:
        # One JSON document per line, non-ASCII characters kept readable.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

In [None]:
# Convert the pre-cleaned XML into the site's JSON Lines format.
# The input is the name preprocess_xml() would produce for 'aidar.xml'.
input_file = 'aidar_precleaned.xml'
output_file = 'aidar_precleaned.json'

convert_xml_to_json(input_file, output_file)

In [None]:
{"id": "0", "kk": "Ертеде бір ханнің үш қызы болыпты.", "kk_words": [], "ru": "В прежние времена у одного хана было три дочери.", "ru_words": []}
{"id": "1", "kk": "Екі үлкен қызын ұзатқаннан кейін, хан қолында қалған кенже қызын күйеуге беріп, ойын-тойын жасайды.", "kk_words": [], "ru": "Выдав двух старших дочерей, хан нашел мужа и младшей дочери, которая была у него на руках, устроил той (пир) и игрища.", "ru_words": []}

### Функция для преобразования json в формат, подходящий для подключения Elasticsearch ###

In [None]:
pip install stanza

In [None]:
import stanza
import json

# Fetch the default stanza models for Kazakh and Russian.
stanza.download('kk')
stanza.download('ru')

# One pipeline per language, limited to tokenisation, POS tagging and
# lemmatisation; both are used below to annotate the JSON Lines file.
nlp_kk = stanza.Pipeline('kk', processors='tokenize,pos,lemma')
nlp_ru = stanza.Pipeline('ru', processors='tokenize,pos,lemma')

In [None]:
# Adds a morphological analysis to a sentence in the given language
def add_morphology(text, nlp, lang):
    """Run ``nlp`` over ``text`` and describe every word as a dict.

    Args:
        text: the sentence (or paragraph) to analyse.
        nlp: a callable returning an object with ``.sentences``, each holding
            ``.words`` that expose ``text``, ``upos``, ``lemma`` and ``feats``.
        lang: language code; accepted for the caller's convenience but not
            used by the function.

    Returns:
        A flat list with one dict per word, in document order. Punctuation
        (``upos == "PUNCT"``) gets only ``wf`` and ``wtype``; every other word
        also carries ``lex``, ``gr.pos`` and ``gr.feats``.
    """
    def describe(word):
        # Punctuation carries no lemma or grammatical information.
        if word.upos == "PUNCT":
            return {"wf": word.text, "wtype": "punct"}
        return {
            "wf": word.text,
            "wtype": "word",
            "lex": word.lemma,
            "gr.pos": word.upos,
            "gr.feats": word.feats,
        }

    analysed = nlp(text)
    return [describe(word) for sent in analysed.sentences for word in sent.words]

# Enrich the site's JSON Lines file with per-word morphology for both languages.
input_file = 'aidar_precleaned.json'

output_file = 'aidar_output.json'

# The encoding is given explicitly: the file was written as UTF-8, and the
# platform default (a legacy code page on Windows) would fail to decode or
# encode the Kazakh and Russian text.
with open(input_file, 'r', encoding='utf-8') as f_in, \
        open(output_file, 'w', encoding='utf-8') as f_out:
    for line in f_in:
        data = json.loads(line)

        kk_text = data['kk']
        ru_text = data['ru']

        kk_words = add_morphology(kk_text, nlp_kk, 'kk')
        ru_words = add_morphology(ru_text, nlp_ru, 'ru')

        data['kk_words'] = kk_words
        data['ru_words'] = ru_words

        # ensure_ascii=False keeps Cyrillic readable in the output file.
        output_string = json.dumps(data, ensure_ascii=False)

        f_out.write(output_string + '\n')

print("Обработка файла завершена.")


Обработка файла завершена.


In [None]:
# Pre-clean every source text with the same routine. A single loop replaces
# seven copy-pasted cells; `input_file` still ends up holding the last name.
for input_file in [
    'Abai-zholy-1-2.xml',
    'Abai-zholy-3-4.xml',
    'adam_balyq.xml',
    'koshpendiler-1.xml',
    'koshpendiler-2.xml',
    'koshpendiler-3.xml',
    'lisa_i_volk.xml',
]:
    preprocess_xml(input_file)

## 1. Морфологическая разметка

### Смотрю на библиотеку kaznlp

In [None]:
from __future__ import print_function
import os

from kaznlp.tokenization.tokrex import TokenizeRex
from kaznlp.tokenization.tokhmm import TokenizerHMM

from kaznlp.morphology.analyzers import AnalyzerDD
from kaznlp.morphology.taggers import TaggerHMM

In [None]:
# ==============
# TOKENIZATION =
# ==============

# Sample sentence. It is assigned here because `txt` was otherwise first
# defined only in the tagging cell further down, so this cell raised
# NameError on a fresh kernel.
txt = u'Еңбек етсең ерінбей, тояды қарның тіленбей.'

# Path to the HMM tokenizer model, relative to the working directory.
mdl = os.path.join('kaznlp', 'tokenization', 'tokhmm.mdl')
tokhmm = TokenizerHMM(model=mdl)
sents_toks = tokhmm.tokenize(txt)
print(sents_toks)

[['Еңбек', 'етсең', 'ерінбей', ',', 'тояды', 'қарның', 'тіленбей', '.']]


In [None]:
# ============
# MORPHOLOGY =
# ============

# create a morphological analyzer instance
analyzer = AnalyzerDD()
analyzer.load_model(os.path.join('kaznlp', 'morphology', 'mdl'))

# try analysis
print()
wrd = 'алмасын'
[iscovered, alist] = analyzer.analyze(wrd)
# Report coverage from the flag returned by the analyzer instead of always
# claiming the word is covered.
# NOTE(review): `iscovered` is assumed to be truthy when the word is covered.
print('"{}" is{} covered by the analyzer.'.format(wrd, '' if iscovered else ' not'))
print('Analyses are:')
for i, a in enumerate(alist):
    print(f'{str(i+1).rjust(2)}) {a}')


"алмасын" is covered by the analyzer.
Analyses are:
 1) алма_R_ZE сы_S3 н_C4
 2) ал_R_ET ма_ET_ETB с_ETB_ESM ы_S3 н_C4
 3) ал_R_ET ма_ET_ETB сын_M2


**Три варианта разбора одного слова с непонятными обозначениями грамматических значений — непонятно и неудобно.**

In [None]:
# Build an HMM tagger on top of the analyzer created in the morphology cell
# and load its model from the same directory as the analyzer's.
tagger = TaggerHMM(lyzer=analyzer)
tagger.load_model(os.path.join('kaznlp', 'morphology', 'mdl'))

txt = u'Еңбек етсең ерінбей, тояды қарның тіленбей.'
# `mdl` is the tokenizer model path defined in the tokenization cell.
tokenizer = TokenizerHMM(model=mdl)
for sentence in tokenizer.tokenize(txt):
    print(f'input sentence:\n{sentence}\n')
    print('tagged sentence:')
    # The tagger is fed lower-cased tokens; `map` yields them lazily, once.
    lower_sentence = map(lambda x: x.lower(), sentence)
    for i, a in enumerate(tagger.tag_sentence(lower_sentence)):
        # Print the original-cased token next to the analysis chosen for it.
        print(f'{str(i+1).rjust(2)}) {sentence[i].ljust(15)}{a}')


input sentence:
['Еңбек', 'етсең', 'ерінбей', ',', 'тояды', 'қарның', 'тіленбей', '.']

tagged sentence:
 1) Еңбек          еңбек_R_ZE
 2) етсең          ет_R_ET се_M4 ң_P2
 3) ерінбей        ерінбей_R_X
 4) ,              ,_R_UTR
 5) тояды          то_R_ET я_T1 ды_P3
 6) қарның         қар_R_ZE ның_C2
 7) тіленбей       тіленбей_R_X
 8) .              ._R_NKT


## Функция для разметки

### Используем проверенную библиотеку *stanza*

In [None]:
!pip3 install stanza








In [None]:
!pip3 install lxml








In [None]:
import stanza
# Download the default Kazakh models and build a pipeline with the default
# processors (the log below lists the ones that were loaded).
stanza.download('kk')
pipeline = stanza.Pipeline(lang='kk')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-17 18:25:15 INFO: Downloading default packages for language: kk (Kazakh) ...
2023-05-17 18:25:17 INFO: File exists: C:\Users\varva\stanza_resources\kk\default.zip
2023-05-17 18:25:22 INFO: Finished downloading models and saved to C:\Users\varva\stanza_resources.
2023-05-17 18:25:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-17 18:25:24 INFO: Loading these models for language: kk (Kazakh):
| Processor | Package |
-----------------------
| tokenize  | ktb     |
| mwt       | ktb     |
| pos       | ktb     |
| lemma     | ktb     |
| depparse  | ktb     |
| ner       | kazNERD |

2023-05-17 18:25:24 INFO: Using device: cpu
2023-05-17 18:25:24 INFO: Loading: tokenize
2023-05-17 18:25:24 INFO: Loading: mwt
2023-05-17 18:25:24 INFO: Loading: pos
2023-05-17 18:25:25 INFO: Loading: lemma
2023-05-17 18:25:25 INFO: Loading: depparse
2023-05-17 18:25:25 INFO: Loading: ner
2023-05-17 18:25:26 INFO: Done loading processors!


In [None]:
# Universal Dependencies UPOS tag -> part-of-speech tag written to `gr`.
# A UPOS value without an entry here (e.g. PUNCT) contributes no POS tag.
pos_from_ud_to_gr_table = dict(
    ADJ="A",
    ADP="PR",
    ADV="ADV",
    AUX="V, aux",      # auxiliaries are tagged as verbs with an extra mark
    CCONJ="CONJ",
    DET="DET",
    INTJ="INTJ",
    NOUN="S",
    NUM="NUM",
    PART="PART",
    PRON="PRO",
    PROPN="S, propn",  # proper nouns are tagged as nouns with an extra mark
    SCONJ="CONJ",      # both conjunction types share one tag
    VERB="V",
)

# Universal Dependencies feature -> {UD value -> tag written to `gr`}.
# A feature or value without an entry here is silently left out of `gr`.
feats_from_ud_to_gr_table = {
    "Gender": {
        "Fem": "f",
        "Masc": "m",
    },
    "AdpType": {
        # NOTE(review): UD spells this value "Post"; as written, the
        # upper-case key will probably never match -- confirm and fix.
        "POST": "POSL",
    },
    "Number": {
        "Plur": "pl",
        "Sing": "sg",
    },
    "Case": {
        # Ablative and locative are regular Kazakh cases; without these
        # entries they were dropped from the output.
        # NOTE(review): confirm "abl"/"loc" are the tags the target corpus expects.
        "Abl": "abl",
        "Acc": "acc",
        "Dat": "dat",
        "Erg": "erg",
        "Gen": "gen",
        # NOTE(review): "Ine" is the UD inessive, yet it is mapped to "voc"
        # (vocative); left unchanged because the intended tag is unclear.
        "Ine": "voc",
        "Ins": "ins",
        "Loc": "loc",
        "Nom": "nom",
    },
    "VerbForm": {
        "Conv": "ger",
        "Fin": "fin",
        "Inf": "inf",
        "Part": "partcp",
    },
    "Mood": {
        "Imp": "imper",
        "Ind": "indic",
        "Sub": "sub",
    },
    "Tense": {
        "Fut": "fut",
        "Past": "praet",
        "Pres": "praes",
    },
    "Voice": {
        "Act": "act",
        "Pass": "pass",
    },
    "Person": {
        "1": "1p",
        "2": "2p",
        "3": "3p",
    },
}

In [None]:
import stanza
import xml.etree.ElementTree as ET

class XmlProcessor:
    """Annotate the sentences of one language in a parallel XML file.

    Each ``<se>`` whose ``lang`` attribute equals the processor's language has
    its text replaced by a sequence of ``<w>`` elements, one per stanza token:

        <w><ana lex="LEMMA" gr="TAGS" />...<text>SURFACE</text></w>

    A token carries one ``<ana>`` per word it expands to and a single
    ``<text>`` holding the token's surface form.
    """

    def __init__(self, lang):
        """Download the stanza models for ``lang`` and build a pipeline for it.

        Args:
            lang: stanza language code, e.g. ``'kk'``. It selects both the
                pipeline and the ``<se lang="...">`` elements to annotate.
        """
        stanza.download(lang)
        self.lang = lang
        # The pipeline follows `lang`; it used to be hard-coded to Kazakh.
        self.pipeline = stanza.Pipeline(lang=lang)

    def parse_ud_feats(self, token):
        """Split a word's UD annotation into ``[upos, {feature: value}]``.

        Args:
            token: object exposing ``upos`` and ``feats``; ``feats`` is either
                ``None`` or a ``'Name=Value|Name=Value'`` string.

        Returns:
            A two-item list: the UPOS tag and a dict of features (empty when
            the word has none). Entries without ``=`` are ignored.
        """
        feats = {}
        if token.feats is not None:
            for raw_feat in token.feats.split('|'):
                name, sep, value = raw_feat.partition('=')
                if sep:
                    feats[name] = value
        return [token.upos, feats]

    def from_ud_to_gr(self, ud_feats):
        """Translate ``[upos, feats]`` into a comma-joined ``gr`` tag string.

        The POS tag and feature values are looked up in the module-level
        ``pos_from_ud_to_gr_table`` and ``feats_from_ud_to_gr_table``;
        anything missing from those tables is skipped.

        Returns:
            The joined tags, or ``""`` when nothing could be mapped.
        """
        if len(ud_feats) == 0:
            return ""
        gr_feats = []
        if ud_feats[0] in pos_from_ud_to_gr_table:
            gr_feats.append(pos_from_ud_to_gr_table[ud_feats[0]])

        for key, value in ud_feats[1].items():
            mapped = feats_from_ud_to_gr_table.get(key, {})
            if value in mapped:
                gr_feats.append(mapped[value])
        return ','.join(gr_feats)

    def process_xml(self, input_file, output_file):
        """Annotate ``input_file`` and write the result to ``output_file``.

        Args:
            input_file: path to the XML file with ``<para>``/``<se>`` elements.
            output_file: path of the annotated XML file (UTF-8).
        """
        tree = ET.parse(input_file)
        root = tree.getroot()

        for para in root.iter('para'):
            for se in para.iter('se'):
                # Segments in other languages and empty segments are left alone.
                if se.get('lang') != self.lang or se.text is None:
                    continue
                doc = self.pipeline(se.text.strip())
                se.text = ''
                for sent in doc.sentences:
                    for token in sent.tokens:
                        word_element = ET.Element('w')
                        for word in token.words:
                            ana_element = ET.Element('ana')
                            # A missing lemma would make serialisation fail,
                            # so the word form stands in for it.
                            lemma = word.lemma if word.lemma is not None else word.text
                            ana_element.set('lex', lemma)
                            ana_element.set('gr', self.from_ud_to_gr(self.parse_ud_feats(word)))
                            word_element.append(ana_element)
                        # The surface form is added once per token; adding it
                        # per word repeated it for multi-word tokens.
                        word_text_element = ET.Element('text')
                        word_text_element.text = token.text
                        word_element.append(word_text_element)
                        se.append(word_element)

        # Without an explicit encoding ElementTree writes ASCII and turns
        # every Cyrillic letter and quotation mark into a character reference.
        tree.write(output_file, encoding='utf-8', xml_declaration=True)

In [None]:
# Build the processor for Kazakh; this downloads the models and loads the pipeline.
xml_processor = XmlProcessor('kk')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-17 18:26:22 INFO: Downloading default packages for language: kk (Kazakh) ...
2023-05-17 18:26:23 INFO: File exists: C:\Users\varva\stanza_resources\kk\default.zip
2023-05-17 18:26:27 INFO: Finished downloading models and saved to C:\Users\varva\stanza_resources.
2023-05-17 18:26:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-05-17 18:26:29 INFO: Loading these models for language: kk (Kazakh):
| Processor | Package |
-----------------------
| tokenize  | ktb     |
| mwt       | ktb     |
| pos       | ktb     |
| lemma     | ktb     |
| depparse  | ktb     |
| ner       | kazNERD |

2023-05-17 18:26:29 INFO: Using device: cpu
2023-05-17 18:26:29 INFO: Loading: tokenize
2023-05-17 18:26:29 INFO: Loading: mwt
2023-05-17 18:26:29 INFO: Loading: pos
2023-05-17 18:26:30 INFO: Loading: lemma
2023-05-17 18:26:30 INFO: Loading: depparse
2023-05-17 18:26:30 INFO: Loading: ner
2023-05-17 18:26:31 INFO: Done loading processors!


In [None]:
# Use the instance created above; the name `processor` was never defined.
xml_processor.process_xml('drakon.xml', 'drakon_processed.xml')

## 2. Посточистка файла

остается:
- убрать разметку со знаков препинания
- вставить пробелы между тэгами </w\><w
- убрать тег <text\>

In [None]:
# Literal (old, new) rewrites applied to every line, in this order:
# separate adjacent words with a space, drop the <text> wrapper, then replace
# the markup around punctuation with the bare punctuation mark.
# They are applied with str.replace, not re.sub: as regular expressions the
# '.' pattern matched any character (turning ',', '!', '?', ':' and ';' into
# '.') and the '?' pattern could never match.
postclean_replacements = [
    ('</w><w', '</w> <w'),
    ('<text>', ''),
    ('</text>', ''),
    ('<w><ana lex="«" gr="" />«</w> ', '«'),
    (' <w><ana lex="»" gr="" />»</w>', '»'),
    (' <w><ana lex="..." gr="" />...</w>', '...'),
    (' <w><ana lex="." gr="" />.</w>', '.'),
    (' <w><ana lex="," gr="" />,</w>', ','),
    (' <w><ana lex="!" gr="" />!</w>', '!'),
    (' <w><ana lex="?" gr="" />?</w>', '?'),
    (' <w><ana lex=":" gr="" />:</w>', ':'),
    (' <w><ana lex=";" gr="" />;</w>', ';'),
    ('<w><ana lex="—" gr="" />—</w>', '—'),
    (' <w><ana lex=".." gr="" />..</w>', '..'),
]

with open("aidar_processed.xml", 'r', encoding='utf-8') as f:
    lines = f.readlines()
    with open("aidar_nkrya.xml", 'w', encoding='utf-8') as f_perf:
        for line in lines:
            for old, new in postclean_replacements:
                line = line.replace(old, new)
            f_perf.write(line)

# II этап: подготовка текстов для Tsacorpus

### Как выглядит json-файл для Цакорпуса:

**База:**

In [None]:
{
    "meta": {
        "title": "...",
        "author": "...",
        "year": "...",
        "translator": "...",
    },
    "sentences": [
        {
            "text": "...",
            "words": [
            ],
            "lang": 0,
            "para_alignment": {
            },
        },
        {
            "text": "...",
            "words": [
            ],
            "lang": 1,
            "para_alignment": {
            },
        }
    ]
}

**"words"**

In [None]:
"words": [
    {
        "wf": "...",
        "wtype": "word/punct",   # word - если слово, punct - если пунктуация 
        "ana": [
            "lex": "...",        # лемма
            "gr.pos": "...",     # часть речи
            "gr.number": "...",  # число, если есть
            "gr.case": "..."     # падеж, если есть
        ],
        "sentence_index": 0,     # индекс слова в предложении
        "off_start": 0,          # индекс первой буквы словоформы в полном предложении
        "off_end": 0,            # индекс последней буквы словоформы в полном предложении
        "next_word": 1,          # индекс следующего слова в предложении
        "sentence_index_neg": 5, # количество слов до конца предложения
    },
    {
        "wf": "...",
        "wtype": "word/punct",
        "ana": [
            "lex": "...",
            "gr.pos": "...",
            "gr.number": "...",
            "gr.case": "..."
        ],
        "sentence_index": 0,    
        "off_start": 0,        
        "off_end": 0,
        "next_word": 1,
        "sentence_index_neg": 5,
    },
    ...
]

### Функция для преобразования XML в JSON без разметки

In [None]:
import xml.etree.ElementTree as ET
import json

def xml_to_json(file_path, output_file_path):
    """Turn an aligned XML file into a JSON skeleton without morphology.

    Every non-empty ``<se>`` becomes one entry of ``sentences`` with its
    stripped text, an empty ``words`` list, its ``lang`` attribute and a
    ``para_alignment`` block (``off_start``, ``off_end``, ``para_id``).

    Offsets are running character counters: one for ``lang == 'ru'`` and one
    shared by every other language. Each counter advances by the text length
    plus 2 after a sentence.

    Args:
        file_path: path to the XML file to read.
        output_file_path: path of the JSON file to write (UTF-8, indented).

    Raises:
        KeyError: if a ``<para>`` lacks ``id`` or an ``<se>`` lacks ``lang``.
    """
    root = ET.parse(file_path).getroot()

    # Next free offset per counter: Russian vs. everything else.
    next_offset = {'ru': 0, 'other': 0}
    sentences = []

    for para in root.iter('para'):
        para_id = para.attrib['id']
        for se in para.iter('se'):
            lang = se.attrib['lang']
            if se.text is None:
                continue

            text = se.text.strip()
            counter = 'ru' if lang == 'ru' else 'other'
            start = next_offset[counter]
            next_offset[counter] = start + len(text) + 2

            sentences.append({
                'text': text,
                'words': [],
                'lang': lang,
                'para_alignment': {
                    'off_start': start,
                    'off_end': start + len(text),
                    'para_id': para_id,
                },
            })

    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        json.dump({'sentences': sentences}, output_file, ensure_ascii=False, indent=2)

In [None]:
# Build the JSON skeleton (sentences without morphology) for one text.
xml_file_path = 'drakon_postcleaned.xml'
output_file_path = 'drakon_tsakorpus.json'
xml_to_json(xml_file_path, output_file_path)
print("Преобразование XML в JSON завершено. Разультат сохранен в файл", output_file_path)

Преобразование XML в JSON завершено. Разультат сохранен в файл drakon_tsakorpus.json


**Далее конвертируем файл в utf-8 в конвертере**

### Добавляем морфологическую разметку

In [None]:
import json
import stanza

# Pipelines used by add_morphology() below. The Kazakh one also runs the
# multi-word-token (mwt) processor; the Russian one does not.
nlp_kk = stanza.Pipeline('kk', processors='tokenize,mwt,pos,lemma')
nlp_ru = stanza.Pipeline('ru', processors='tokenize,pos,lemma')

In [None]:
def add_morphology(data):
    """Fill the ``words`` list of every sentence with stanza analyses.

    Note: this redefines the earlier ``add_morphology(text, nlp, lang)`` from
    the first part of the notebook; the two have different signatures.

    Sentences whose ``lang`` is ``'kk'`` or ``'ru'`` are run through the
    module-level ``nlp_kk`` / ``nlp_ru`` pipelines; any other sentence is left
    untouched. Each word becomes a dict with ``wf``, ``wtype``, a one-item
    ``ana`` list (``lex``, ``gr.pos``, ``gr.number``, ``gr.case``) and
    positional fields.

    Positions are counted over the whole text of the entry, even when stanza
    splits it into several sentences: ``sentence_index`` runs from 0 and
    ``sentence_index_neg`` is the number of words left after the current one.

    Args:
        data: dict with a ``'sentences'`` list; each entry needs ``'lang'``
            and ``'text'``.

    Returns:
        The same ``data`` object, modified in place.
    """
    for sentence in data['sentences']:
        lang = sentence['lang']
        text = sentence['text']

        if lang == 'kk':
            nlp = nlp_kk
        elif lang == 'ru':
            nlp = nlp_ru
        else:
            continue

        doc = nlp(text)

        # Flatten first so the total word count is known up front; counting
        # against the current stanza sentence only made `sentence_index_neg`
        # negative from the second stanza sentence on.
        doc_words = [word for sent in doc.sentences for word in sent.words]
        total_words = len(doc_words)

        words = []
        for word_index, word in enumerate(doc_words):
            # Parse all features into a dict. The earlier loop reset the case
            # whenever another feature followed it (e.g. Gender), so most
            # Russian words lost their case.
            feats = {}
            if word.feats is not None:
                for feat in word.feats.split('|'):
                    name, sep, value = feat.partition('=')
                    if sep:
                        feats[name] = value

            token = {
                'wf': word.text,
                'wtype': 'word' if word.upos != 'PUNCT' else 'punct',
                'ana': [
                    {
                        'lex': word.lemma,
                        'gr.pos': word.upos,
                        'gr.number': feats.get('Number'),
                        'gr.case': feats.get('Case'),
                    }
                ],
                'sentence_index': word_index,
                'off_start': word.start_char,
                # Index of the last character (inclusive), when known.
                'off_end': word.end_char - 1 if word.end_char is not None else None,
                'next_word': word_index + 1,
                'sentence_index_neg': total_words - word_index - 1
            }
            words.append(token)

        sentence['words'] = words

    return data

In [None]:
# Add morphology to the JSON skeleton and save the result.
# NOTE(review): 'drakon_json.json' is not written by any cell shown here;
# presumably it is the UTF-8 re-encoded skeleton mentioned above -- confirm.
json_file_path = 'drakon_json.json'
output_file_path = 'drakon_tsakorpus.json'

with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# add_morphology() fills `data` in place and returns the same object.
data_with_morphology = add_morphology(data)

with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(data_with_morphology, file, ensure_ascii=False, indent=2)

print("Морфологическая разметка добавлена и сохранена в файл:", output_file_path)

Морфологическая разметка добавлена и сохранена в файл: aidar_tsacorpus.json
