In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [None]:
import conllu
import zipfile
import requests
import os
import gdown
import json
from conllu import parse
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from itertools import chain

In [None]:
# Where the archive comes from and where it ends up.
file_id = "1PtIEuhhpY_NUbTd2dp9ZrL86fZi2OXI-"
output = "fakenews_arsamas.zip"
extract_path = './fakenews_arzamas/'

# Fetch the zip archive from Google Drive by its file id.
download_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(download_url, output, quiet=False)

# Unpack everything into the extraction directory (created if missing).
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

Downloading...
From: https://drive.google.com/uc?id=1PtIEuhhpY_NUbTd2dp9ZrL86fZi2OXI-
To: /content/fakenews_arsamas.zip
100%|██████████| 8.11M/8.11M [00:00<00:00, 32.2MB/s]


In [None]:
# Parse every CoNLL-U formatted .txt file in the tagged-texts folder into one
# flat list of sentences.
folder_path = 'fakenews_arzamas/Arzamas/texts_tagged/'
all_sentences = []

# os.listdir() returns entries in arbitrary order; sorting makes the sentence
# order (and everything derived from it further down) reproducible across runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    # conllu.parse returns the sentences found in the file contents.
    all_sentences.extend(parse(data))

print(f"Всего предложений: {len(all_sentences)}")

Всего предложений: 22486


In [None]:
# Convert each parsed sentence into a plain dict: the sentence text and id taken
# from its metadata (empty string when absent) plus one record per token.
json_data = [
    {
        "text": sentence.metadata.get("text", ""),
        "sent_id": sentence.metadata.get("sent_id", ""),
        "tokens": [
            {
                "id": token["id"],
                "Word": token["form"],
                "Lemma": token["lemma"],
                "POS": token["upos"],
                "Morphology": token["feats"],
                # Dependency relation and head index folded into one display string.
                "Syntax": f"{token['deprel']} → head={token['head']}",
            }
            for token in sentence
        ],
    }
    for sentence in all_sentences
]

In [None]:
# POS tags to leave out of the index (currently none).
pos_exc = []

# Index of observed readings: lower-cased word form -> POS -> lower-cased lemma
# -> {'count': occurrences, 'morph': morphology of the FIRST occurrence}.
ambig_words = {}

for sent in json_data:
    for token in sent['tokens']:
        word = token['Word'].lower()
        lemma = token['Lemma'].lower()
        pos = token['POS']
        morph = token.get('Morphology', {})

        if pos in pos_exc:
            continue

        lemma_stats = ambig_words.setdefault(word, {}).setdefault(pos, {})
        if lemma in lemma_stats:
            # Seen before: only the counter changes, the stored morphology stays.
            lemma_stats[lemma]['count'] += 1
        else:
            lemma_stats[lemma] = {'count': 1, 'morph': morph}

In [None]:
# Spot-check one word form: display every POS/lemma reading recorded for it.
ambig_words['ели']

{'NOUN': {'ель': {'count': 5,
   'morph': {'Animacy': 'Inan',
    'Case': 'Gen',
    'Gender': 'Fem',
    'Number': 'Sing'}}},
 'VERB': {'есть': {'count': 8,
   'morph': {'Aspect': 'Imp',
    'Mood': 'Ind',
    'Number': 'Plur',
    'Tense': 'Past',
    'VerbForm': 'Fin',
    'Voice': 'Act'}},
  'евать': {'count': 1,
   'morph': {'Aspect': 'Perf',
    'Mood': 'Ind',
    'Number': 'Plur',
    'Tense': 'Past',
    'VerbForm': 'Fin',
    'Voice': 'Act'}}}}

In [None]:
def add_variants_to_json(json_data, ambig_words):
    """Attach the list of alternative readings to every ambiguous token.

    For each token whose lower-cased word form is present in ``ambig_words``,
    one variant is built per (POS, lemma) pair recorded for that form. The
    variant matching the token's own lemma and POS gets ``'IsCorrect': True``.
    A ``'Vars'`` key is added to the token only when there is more than one
    variant.

    Args:
        json_data: list of sentence dicts, each with a ``'tokens'`` list of
            token dicts (``'Word'``, ``'Lemma'``, ``'POS'`` keys are read).
        ambig_words: mapping ``word -> POS -> lemma -> {'count', 'morph'}``
            with lower-cased words and lemmas.

    Returns:
        The same ``json_data`` object; tokens are modified in place.
    """
    for sent in json_data:
        for token in sent['tokens']:
            readings = ambig_words.get(token['Word'].lower())
            if not readings:
                continue

            # Lemmas are stored lower-cased in ambig_words, so the token's own
            # lemma must be lower-cased too; otherwise capitalised lemmas
            # never match and no variant is marked correct.
            original_lemma = (token.get('Lemma') or '').lower()
            original_pos = token.get('POS', '')

            variants = []
            for pos, lemmas in readings.items():
                for lemma, data in lemmas.items():
                    # NOTE(review): 'morph' is the morphology stored in
                    # ambig_words for this (word, POS, lemma), not necessarily
                    # this token's own morphology.
                    variant = {'Lemma': lemma,
                               'POS': pos,
                               'Morphology': data['morph']}

                    if lemma == original_lemma and pos == original_pos:
                        variant['IsCorrect'] = True
                    variants.append(variant)

            # A single reading means the form is unambiguous: leave the token as is.
            if len(variants) > 1:
                token['Vars'] = variants

    return json_data

# add_variants_to_json modifies the token dicts in place and returns the same
# list, so modified_data and json_data refer to the same object afterwards.
modified_data = add_variants_to_json(json_data, ambig_words)
# Display the first sentence to inspect the attached 'Vars'.
modified_data[0]

{'text': 'Запись организовал в 1890 году Юлий Иванович Блок -- меломан и энтузиаст ранней звукозаписи, который в 1889 году первым привез в Россию фонограф Эдисона.',
 'sent_id': '1325',
 'tokens': [{'id': 1,
   'Word': 'Запись',
   'Lemma': 'запись',
   'POS': 'NOUN',
   'Morphology': {'Animacy': 'Inan',
    'Case': 'Nom',
    'Gender': 'Fem',
    'Number': 'Sing'},
   'Syntax': 'nsubj → head=2'},
  {'id': 2,
   'Word': 'организовал',
   'Lemma': 'рганизовать',
   'POS': 'VERB',
   'Morphology': {'Aspect': 'Perf',
    'Gender': 'Masc',
    'Mood': 'Ind',
    'Number': 'Sing',
    'Tense': 'Past',
    'VerbForm': 'Fin',
    'Voice': 'Act'},
   'Syntax': 'root → head=0',
   'Vars': [{'Lemma': 'рганизовать',
     'POS': 'VERB',
     'Morphology': {'Aspect': 'Perf',
      'Gender': 'Masc',
      'Mood': 'Ind',
      'Number': 'Sing',
      'Tense': 'Past',
      'VerbForm': 'Fin',
      'Voice': 'Act'},
     'IsCorrect': True},
    {'Lemma': 'организовывать',
     'POS': 'VERB',
     'Morp

In [None]:
# Feature names used to build the dataset columns. 'POS' must stay first:
# extract_morph_features reads it from the token itself and takes the
# remaining names (MORPH_FEATURES[1:]) from the token's 'Morphology' dict.
MORPH_FEATURES = ['POS', 'Animacy', 'Aspect', 'Case', 'Degree', 'Foreign',
                  'Gender', 'Mood', 'Number', 'Person', 'Polarity', 'Tense',
                  'Variant', 'VerbForm', 'Voice']

# Integer code for every category value. The table is flat (keyed by the value
# string alone, shared by all features), so codes are only meaningful within
# the column of the feature they belong to, and a value string that occurs
# under two features can have only one code.
MORPH_VALUE_MAPPING = {
    # Animacy
    "Anim": 1,
    "Inan": 2,

    # POS
    "ADJ": 1,
    "ADP": 2,
    "ADV": 3,
    "AUX": 4,
    "CCONJ": 5,
    "DET": 6,
    "INTJ": 7,
    "NOUN": 8,
    "NUM": 9,
    "PART": 10,
    "PRON": 11,
    "PROPN": 12,
    "PUNCT": 13,
    "SCONJ": 14,
    "SYM": 15,
    "VERB": 16,
    "X": 17,

    # Aspect
    "Imp": 1,  # also serves Mood=Imp (same string, same code) — see Mood below
    "Perf": 2,

    # Case
    "Nom": 1,
    "Acc": 2,
    "Dat": 3,
    "Gen": 4,
    "Ins": 5,
    "Loc": 6,
    "Par": 7,
    "Voc": 8,

    # Degree
    "Pos": 1,
    "Cmp": 2,
    "Sup": 3,

    # Foreign
    "Yes": 1,

    # Gender
    "Fem": 1,
    "Masc": 2,
    "Neut": 3,

    # Mood
    # "Imp" -> 1 is provided by the Aspect entry above; repeating the key here
    # would be a duplicate dict key (the later entry silently wins).
    "Cnd": 2,
    "Ind": 3,

    # Number
    "Sing": 1,
    "Plur": 2,

    # Person
    "1": 1,
    "2": 2,
    "3": 3,

    # Polarity
    "Neg": 1,

    # Tense
    "Fut": 1,
    "Past": 2,
    "Pres": 3,

    # Variant
    "Short": 1,

    # VerbForm
    "Conv": 1,
    "Fin": 2,
    "Inf": 3,
    "Part": 4,

    # Voice
    "Act": 1,
    "Mid": 2,
    "Pass": 3
}

In [None]:
def map_morph_value(value):
    """Translate a category value into its integer code.

    Values that are already integers (including the -1 "missing" marker) are
    returned unchanged; anything else is looked up in MORPH_VALUE_MAPPING,
    with -1 returned for values that have no entry.
    """
    already_encoded = value == -1 or isinstance(value, int)
    return value if already_encoded else MORPH_VALUE_MAPPING.get(value, -1)


def extract_morph_features(token):
    """Encode a token (or variant) as a dict of integer feature codes.

    'POS' is read from the token itself; every other name in MORPH_FEATURES
    is read from the token's 'Morphology' dict. A missing or empty
    'Morphology' is treated as an empty dict, and absent features are
    encoded as -1.

    Returns:
        dict mapping each name in MORPH_FEATURES to its integer code.
    """
    encoded = {'POS': map_morph_value(token.get('POS', -1))}
    morphology = token.get('Morphology', {}) or {}
    encoded.update(
        (name, map_morph_value(morphology.get(name, -1)))
        for name in MORPH_FEATURES[1:]
    )
    return encoded


def _context_features(tokens, index, offsets=(-3, -2, -1, 1, 2, 3)):
    """Encode the neighbours of ``tokens[index]`` as ``ctx_<offset>_<feature>`` entries.

    Neighbours are taken from the full token list (punctuation included).
    Positions that fall outside the sentence get -1 for every feature.
    """
    context = {}
    for offset in offsets:
        pos = index + offset
        if 0 <= pos < len(tokens):
            neighbour = extract_morph_features(tokens[pos])
        else:
            # Window runs past the sentence edge: pad every feature with -1.
            neighbour = dict.fromkeys(MORPH_FEATURES, -1)
        for feat, val in neighbour.items():
            context[f'ctx_{offset}_{feat.lower()}'] = val
    return context


def process_sentence(sent):
    """Turn one sentence into dataset rows, one row per token variant.

    Punctuation tokens produce no rows. A token without 'Vars' contributes a
    single row built from its own lemma/POS/morphology and marked correct;
    a token with 'Vars' contributes one row per variant, with ``is_correct``
    taken from the variant's 'IsCorrect' flag (False when absent).

    Each row holds ``word_id``, ``word``, ``lemma``, ``is_correct``, the
    variant's own features as ``curr_<feature>`` and the features of the
    tokens at offsets -3..-1 and 1..3 as ``ctx_<offset>_<feature>``.

    Args:
        sent: sentence dict with a 'tokens' list.

    Returns:
        list of row dicts.
    """
    tokens = sent['tokens']
    sentence_data = []

    for i, token in enumerate(tokens):
        if token['POS'] == 'PUNCT':
            continue

        variants = token.get('Vars', [])
        if not variants:
            # Unambiguous token: its own analysis is the only (correct) variant.
            variants = [{
                'Lemma': token['Lemma'],
                'POS': token['POS'],
                'Morphology': token.get('Morphology', {}),
                'IsCorrect': True
            }]

        # The context depends only on the token position, not on the variant,
        # so it is computed once per token instead of once per variant.
        context = _context_features(tokens, i)

        for variant in variants:
            row = {
                'word_id': token['id'],
                'word': token['Word'],
                'lemma': variant['Lemma'],
                'is_correct': variant.get('IsCorrect', False)
            }

            for feat, val in extract_morph_features(variant).items():
                row[f'curr_{feat.lower()}'] = val

            row.update(context)
            sentence_data.append(row)

    return sentence_data


def create_dataframe(json_data):
    """Build the disambiguation table from all sentences.

    Rows come from process_sentence. Columns are ordered as: ``word_id``,
    ``word``, ``lemma``, ``is_correct``, then ``curr_<feature>`` for every
    name in MORPH_FEATURES, then ``ctx_<offset>_<feature>`` for offsets
    -3, -2, -1, 1, 2, 3.

    Args:
        json_data: list of sentence dicts.

    Returns:
        pandas.DataFrame with the columns above; an empty frame with the same
        columns when no rows are produced.
    """
    offsets = [-3, -2, -1, 1, 2, 3]

    columns_order = ['word_id', 'word', 'lemma', 'is_correct']
    columns_order += [f'curr_{feat.lower()}' for feat in MORPH_FEATURES]
    columns_order += [f'ctx_{offset}_{feat.lower()}'
                      for offset in offsets
                      for feat in MORPH_FEATURES]

    all_data = [row for sent in json_data for row in process_sentence(sent)]

    # Passing columns= fixes the column order and, unlike df[columns_order],
    # does not raise KeyError when there are no rows at all.
    return pd.DataFrame(all_data, columns=columns_order)

In [None]:
# Flatten the sentences into one table with a row per token variant.
# process_sentence reads the 'Vars' lists from the tokens, so json_data must
# already have been passed through add_variants_to_json.
disambiguation_ds = create_dataframe(json_data)

In [None]:
# Flag rows that belong to an ambiguity group: runs of consecutive rows that
# share the same word and identical context features (i.e. the variants of
# one token).
ctx_columns = [col for col in disambiguation_ds.columns if col.startswith('ctx')]

# Compare every row with the previous one in a single vectorised pass. This
# replaces a per-row .loc loop, does not depend on the index being 0..n-1, and
# makes the separate last-row check unnecessary.
_words = disambiguation_ds['word']
_ctx = disambiguation_ds[ctx_columns]
_same_as_prev = _words.eq(_words.shift()) & _ctx.eq(_ctx.shift()).all(axis=1)

# A row is in a group if it matches its predecessor or its successor matches it.
disambiguation_ds['is_in_group'] = _same_as_prev | _same_as_prev.shift(-1, fill_value=False)

In [None]:
# Keep only the rows flagged as part of an ambiguity group; the helper flag
# column is dropped once it has been used.
in_group_mask = disambiguation_ds['is_in_group']
disambiguation_ds = disambiguation_ds[in_group_mask].drop(columns=['is_in_group'])

In [None]:
# Write the first 65000 rows and then the full table to CSV, without the index.
df_filtered = disambiguation_ds.head(65000)
for frame, csv_name in ((df_filtered, 'disambiguation_ds_65k.csv'),
                        (disambiguation_ds, 'disambiguation_ds.csv')):
    frame.to_csv(csv_name, index=False)