In [1]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive

/content/gdrive/MyDrive


### Установки и импорты

In [3]:
!pip install pyyaml
!pip install spacy-conll
!python -m spacy download ru_core_news_sm
!pip install pymorphy2

2023-09-28 08:52:32.793438: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting ru-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.6.0/ru_core_news_sm-3.6.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')


In [4]:
from pathlib import Path
import importlib
import sys

import yaml
import spacy
from spacy_conll import init_parser
from spacy.language import Language

In [5]:
scripts_dir = Path('./project_scripts').resolve()
scripts_dir

PosixPath('/content/gdrive/MyDrive/project_scripts')

In [6]:
sys.path.append(str(scripts_dir))
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/gdrive/MyDrive/project_scripts']

In [7]:
from project_scripts.mathematicon_morph_parser import MorphologyCorrectionHandler, remove_double_spaces

In [None]:
importlib.reload(sys.modules['project_scripts.mathematicon_morph_parser'])

<module 'project_scripts.mathematicon_morph_parser' from '/content/gdrive/MyDrive/project_scripts/mathematicon_morph_parser.py'>

### Работа с txt файлами

In [8]:
texts_folder = Path('./2_clean_texts').resolve()
conllu_folder = Path('./3_grammar_annotated').resolve()

In [14]:
def find_new_files(texts_folder, conllu_folder):
    conllu_files = [f.stem for f in conllu_folder.iterdir()]
    return [f for f in texts_folder.iterdir() if f.stem not in conllu_files]

In [19]:
new_files = find_new_files(texts_folder, conllu_folder)
new_files

[PosixPath('/content/gdrive/MyDrive/2_clean_texts/romanovvladimir_deleniiedrobei_5klass6klassmatematika_vsetemy.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/romanovvladimir_desiatichnayadrob_slozheniiedesyatychnyhdrobei_vychitaniyedesyatichnyhdrobei.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/videourokimatematiky_robnyeracionalnyevyrazheniya_dejstviyasracionalnymidrobyami_algebra8klass_Urok13.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/videourokimatematiky_kvadratniiyekorni_svoistvakvadratnyhkornei_algebra8klass_urok16.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/unknown_kvadratnyieurovneniya_razlozheniyekvadratnogotrehchlenanamnozhyteli_algebra8klass_urok19.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/romanovvladimir_kvadratniykoren_8klass_resheniyeprimerov.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/romanovvladimir_kvadratniykorenizdrobi_8klass_algebra.txt'),
 PosixPath('/content/gdrive/MyDrive/2_clean_texts/r

In [16]:
from yaml.parser import ParserError

In [20]:
files_info = {}
for f in new_files:
    with open(f, encoding='utf-8') as fh:
        try:
            read_data = yaml.load(fh, Loader=yaml.FullLoader)
            read_data['text'] = remove_double_spaces(read_data['text'])
            files_info[f] = read_data
        except ParserError as e:
            print(e)
            print()
            print(f'Some problems with file {f}')
        finally:
            continue

### Парсинг

In [21]:
@Language.factory(
    "morphology_corrector",
    assigns=["token.lemma", "token.tag"],
    requires=["token.pos"],
    default_config={"mode": 'allpos+ptcp+conv'},
)
def morphology_corrector(nlp, name, mode):
    return MorphologyCorrectionHandler(mode)

In [22]:
nlp = init_parser("ru_core_news_sm", 'spacy', include_headers=True, exclude_spacy_components=['ner'])
nlp.add_pipe('morphology_corrector', before="conll_formatter")

<project_scripts.mathematicon_morph_parser.MorphologyCorrectionHandler at 0x79ad3dd3c5e0>

In [23]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x79ad46465780>),
 ('morphologizer',
  <spacy.pipeline.morphologizer.Morphologizer at 0x79ad46465ba0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x79ad466e2110>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x79ad4671d700>),
 ('lemmatizer',
  <spacy.lang.ru.lemmatizer.RussianLemmatizer at 0x79ad464e1ac0>),
 ('morphology_corrector',
  <project_scripts.mathematicon_morph_parser.MorphologyCorrectionHandler at 0x79ad3dd3c5e0>),
 ('conll_formatter',
  ConllFormatter(conversion_maps=None, ext_names={'conll_str': 'conll_str', 'conll': 'conll', 'conll_pd': 'conll_pd'}, field_names={'ID': 'ID', 'FORM': 'FORM', 'LEMMA': 'LEMMA', 'UPOS': 'UPOS', 'XPOS': 'XPOS', 'FEATS': 'FEATS', 'HEAD': 'HEAD', 'DEPREL': 'DEPREL', 'DEPS': 'DEPS', 'MISC': 'MISC'}, include_headers=True, disable_pandas=False))]

In [24]:
def write_conllu(doc, source_filename, dest_dir):
    result_path = Path(dest_dir, source_filename.with_suffix('.conllu').name)
    with open(result_path, 'w', encoding='utf-8') as f:
        f.write(doc._.conll_str)
    return result_path

In [25]:
written_files = []
for filename, info in files_info.items():
    parsed_text = nlp(info['text'])
    written_files.append(write_conllu(parsed_text, filename, conllu_folder))
written_files

Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" was raised on token (бэ, VERB, бэ) during execution of "ptcp_corrector" function
Exception "'NoneType' object has no attribute 'word'" w

[PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/romanovvladimir_deleniiedrobei_5klass6klassmatematika_vsetemy.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/romanovvladimir_desiatichnayadrob_slozheniiedesyatychnyhdrobei_vychitaniyedesyatichnyhdrobei.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/videourokimatematiky_robnyeracionalnyevyrazheniya_dejstviyasracionalnymidrobyami_algebra8klass_Urok13.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/videourokimatematiky_kvadratniiyekorni_svoistvakvadratnyhkornei_algebra8klass_urok16.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/unknown_kvadratnyieurovneniya_razlozheniyekvadratnogotrehchlenanamnozhyteli_algebra8klass_urok19.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/romanovvladimir_kvadratniykoren_8klass_resheniyeprimerov.conllu'),
 PosixPath('/content/gdrive/MyDrive/3_grammar_annotated/romanovvladimir_kvadratniykorenizdrobi_8klass_algebr