In [2]:
!pip install conllu



In [3]:
import conllu
import zipfile
import requests
import os
import gdown
import json
from conllu import parse
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [4]:
# Download the zipped corpus from Google Drive and unpack it into
# ./fakenews_arzamas/ (relative to the current working directory).
file_id = "1PtIEuhhpY_NUbTd2dp9ZrL86fZi2OXI-"
output = "fakenews_arsamas.zip"
download_url = "https://drive.google.com/uc?id=" + file_id
gdown.download(download_url, output, quiet=False)

# exist_ok=True keeps this cell re-runnable when the directory already exists.
extract_path = './fakenews_arzamas/'
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(output, mode='r') as archive:
    archive.extractall(path=extract_path)

Downloading...
From: https://drive.google.com/uc?id=1PtIEuhhpY_NUbTd2dp9ZrL86fZi2OXI-
To: /content/fakenews_arsamas.zip
100%|██████████| 8.11M/8.11M [00:00<00:00, 40.3MB/s]


In [5]:
# Parse every tagged .txt file in the extracted corpus with conllu.parse and
# collect the resulting sentences in one flat list.
folder_path = 'fakenews_arzamas/Arzamas/texts_tagged/'
all_sentences = []

# os.listdir() returns entries in arbitrary order. Sorting makes the sentence
# order (and therefore every index-based preview and the exported row order)
# identical on every run and on every machine.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    sentences = parse(data)
    all_sentences.extend(sentences)

print(f"Всего предложений: {len(all_sentences)}")

Всего предложений: 22486


In [6]:
# Convert the parsed sentences into plain dicts: one dict per sentence holding
# its text, its sent_id and a list of per-token annotation dicts.
json_data = [
    {
        "text": sentence.metadata.get("text", ""),
        "sent_id": sentence.metadata.get("sent_id", ""),
        "tokens": [
            {
                "id": token["id"],
                "Word": token["form"],
                "Lemma": token["lemma"],
                "POS": token["upos"],
                "Morphology": token["feats"],
                # Dependency relation and head index folded into one string.
                "Syntax": f"{token['deprel']} → head={token['head']}",
            }
            for token in sentence
        ],
    }
    for sentence in all_sentences
]

In [7]:
# Build an index of every analysis seen for each lower-cased word form:
#   ambig_words[word][pos][lemma] = {'count': <occurrences>, 'morph': <feats>}
pos_exc = []  # POS tags to leave out of the index (currently none)
ambig_words = {}

for sent in json_data:
    for token in sent['tokens']:
        word = token['Word'].lower()
        lemma = token['Lemma'].lower()
        pos = token['POS']
        morph = token.get('Morphology', {})

        if pos in pos_exc:
            continue

        # setdefault creates the word -> POS -> lemma path the first time it is
        # seen. The stored morphology is the one from that first occurrence;
        # later occurrences only increase the counter.
        entry = (ambig_words
                 .setdefault(word, {})
                 .setdefault(pos, {})
                 .setdefault(lemma, {'count': 0, 'morph': morph}))
        entry['count'] += 1

In [8]:
# Spot check: show the index entry (POS -> lemma -> count/morph) for 'ели'.
ambig_words['ели']

{'VERB': {'есть': {'count': 8,
   'morph': {'Aspect': 'Imp',
    'Mood': 'Ind',
    'Number': 'Plur',
    'Tense': 'Past',
    'VerbForm': 'Fin',
    'Voice': 'Act'}},
  'евать': {'count': 1,
   'morph': {'Aspect': 'Perf',
    'Mood': 'Ind',
    'Number': 'Plur',
    'Tense': 'Past',
    'VerbForm': 'Fin',
    'Voice': 'Act'}}},
 'NOUN': {'ель': {'count': 5,
   'morph': {'Animacy': 'Inan',
    'Case': 'Loc',
    'Gender': 'Fem',
    'Number': 'Sing'}}}}

In [9]:
def add_variants_to_json(json_data, ambig_words):
    """Attach all recorded analyses of a word form to its tokens.

    Each token's lower-cased ``Word`` is looked up in ``ambig_words``
    (``word -> POS -> lemma -> {'count', 'morph'}``). When more than one
    (POS, lemma) analysis is recorded for that form, the list of analyses is
    stored on the token under ``'Vars'``; otherwise the token is left as is.

    The token dicts are modified in place and the same ``json_data`` object is
    returned.

    NOTE: a later cell defines ``add_variants_to_json`` again (adding an
    ``IsCorrect`` flag); once that cell runs, it replaces this definition.
    """
    for sentence in json_data:
        for token in sentence['tokens']:
            key = token['Word'].lower()
            if key not in ambig_words:
                continue

            # One candidate per (POS, lemma) pair recorded for this form.
            candidates = [
                {'Lemma': lemma, 'POS': pos, 'Morphology': info['morph']}
                for pos, by_lemma in ambig_words[key].items()
                for lemma, info in by_lemma.items()
            ]

            # Only genuinely ambiguous forms get the 'Vars' field.
            if len(candidates) > 1:
                token['Vars'] = candidates

    return json_data

# add_variants_to_json updates the token dicts in place and returns its input,
# so modified_data refers to the same list object as json_data.
modified_data = add_variants_to_json(json_data, ambig_words)

In [10]:
def add_variants_to_json(json_data, ambig_words):
    """Attach all recorded analyses of a word form to its tokens and flag the gold one.

    Each token's lower-cased ``Word`` is looked up in ``ambig_words``
    (``word -> POS -> lemma -> {'count', 'morph'}``, with word forms and lemmas
    stored in lower case). When more than one (POS, lemma) analysis is recorded
    for that form, the list of analyses is stored on the token under ``'Vars'``.
    The analysis whose lemma and POS match the token's own annotation carries
    ``'IsCorrect': True``; the other analyses have no ``'IsCorrect'`` key.

    Args:
        json_data: list of sentence dicts, each with a ``'tokens'`` list of
            token dicts containing at least ``'Word'`` (and normally
            ``'Lemma'`` and ``'POS'``).
        ambig_words: the word -> POS -> lemma index described above.

    Returns:
        The same ``json_data`` object; token dicts are modified in place.
    """
    for sent in json_data:
        for token in sent['tokens']:
            word_lower = token['Word'].lower()

            # Skip word forms that are not present in the index at all.
            if word_lower not in ambig_words:
                continue

            # The index stores lemmas lower-cased, so the token's lemma has to
            # be lower-cased as well. Comparing the raw lemma would never match
            # for capitalised lemmas (e.g. proper nouns), leaving such tokens
            # without any variant marked as correct.
            original_lemma = (token.get('Lemma') or '').lower()
            original_pos = token.get('POS', '')

            # Collect every analysis recorded for this word form.
            variants = []
            for pos, lemmas in ambig_words[word_lower].items():
                for lemma, data in lemmas.items():
                    variant = {
                        'Lemma': lemma,
                        'POS': pos,
                        'Morphology': data['morph']
                    }
                    # Flag the analysis that agrees with the token's annotation.
                    if lemma == original_lemma and pos == original_pos:
                        variant['IsCorrect'] = True
                    variants.append(variant)

            # Only genuinely ambiguous forms get the 'Vars' field.
            if len(variants) > 1:
                token['Vars'] = variants

    return json_data

# Re-annotate with the version that flags the gold analysis ('IsCorrect').
# The token dicts inside json_data are updated in place.
modified_data = add_variants_to_json(json_data, ambig_words)
# Preview the first annotated sentence.
modified_data[0]

{'text': 'В отрывке из короткометражного фильма, снятого в 1946 году в студии художника в Ницце, Анри Матисс рисует углем портрет своего внука Жерара и рассуждает о рисунке и искусстве живописи.',
 'sent_id': '6414',
 'tokens': [{'id': 1,
   'Word': 'В',
   'Lemma': 'в',
   'POS': 'ADP',
   'Morphology': None,
   'Syntax': 'case → head=2',
   'Vars': [{'Lemma': 'в',
     'POS': 'ADP',
     'Morphology': None,
     'IsCorrect': True},
    {'Lemma': 'в1', 'POS': 'ADP', 'Morphology': None},
    {'Lemma': 'в', 'POS': 'PROPN', 'Morphology': {'Foreign': 'Yes'}},
    {'Lemma': 'век',
     'POS': 'NOUN',
     'Morphology': {'Animacy': 'Inan',
      'Case': 'Gen',
      'Gender': 'Masc',
      'Number': 'Sing'}}]},
  {'id': 2,
   'Word': 'отрывке',
   'Lemma': 'отрывка',
   'POS': 'NOUN',
   'Morphology': {'Animacy': 'Inan',
    'Case': 'Loc',
    'Gender': 'Fem',
    'Number': 'Sing'},
   'Syntax': 'obl → head=19',
   'Vars': [{'Lemma': 'отрывка',
     'POS': 'NOUN',
     'Morphology': {'Anima

In [15]:
import pandas as pd
from itertools import chain

# Features encoded for every token. 'POS' must stay first: it is read from the
# token itself, while the remaining names are looked up in the token's
# morphology dict. The same order is used for the feature columns.
MORPH_FEATURES = ['POS', 'Animacy', 'Aspect', 'Case', 'Degree', 'Foreign',
                  'Gender', 'Mood', 'Number', 'Person', 'Polarity', 'Tense',
                  'Variant', 'VerbForm', 'Voice']

# Integer codes grouped by the feature they belong to. Codes are only
# meaningful within one feature (every feature starts again at 1).
_MORPH_CODES_BY_FEATURE = {
    "Animacy": {"Anim": 1, "Inan": 2},
    "POS": {
        "ADJ": 1, "ADP": 2, "ADV": 3, "AUX": 4, "CCONJ": 5, "DET": 6,
        "INTJ": 7, "NOUN": 8, "NUM": 9, "PART": 10, "PRON": 11, "PROPN": 12,
        "PUNCT": 13, "SCONJ": 14, "SYM": 15, "VERB": 16, "X": 17,
    },
    "Aspect": {"Imp": 1, "Perf": 2},
    "Case": {
        "Nom": 1, "Acc": 2, "Dat": 3, "Gen": 4,
        "Ins": 5, "Loc": 6, "Par": 7, "Voc": 8,
    },
    "Degree": {"Pos": 1, "Cmp": 2, "Sup": 3},
    "Foreign": {"Yes": 1},
    "Gender": {"Fem": 1, "Masc": 2, "Neut": 3},
    "Mood": {"Imp": 1, "Cnd": 2, "Ind": 3},
    "Number": {"Sing": 1, "Plur": 2},
    "Person": {"1": 1, "2": 2, "3": 3},
    "Polarity": {"Neg": 1},
    "Tense": {"Fut": 1, "Past": 2, "Pres": 3},
    "Variant": {"Short": 1},
    "VerbForm": {"Conv": 1, "Fin": 2, "Inf": 3, "Part": 4},
    "Voice": {"Act": 1, "Mid": 2, "Pass": 3},
}


def _flatten_morph_codes(codes_by_feature):
    """Merge the per-feature code tables into one flat value -> code dict.

    The lookup is flat (keyed by value only), so a value name shared by two
    features (e.g. "Imp" in Aspect and Mood) must carry the same code in both.
    A silent overwrite would mis-encode one of the features, so a conflicting
    code raises ValueError instead.
    """
    flat = {}
    for feature, codes in codes_by_feature.items():
        for value, code in codes.items():
            if flat.setdefault(value, code) != code:
                raise ValueError(
                    f"Conflicting codes for value {value!r} "
                    f"(feature {feature!r}): {flat[value]} vs {code}"
                )
    return flat


# Flat lookup used to encode feature values as integers.
MORPH_VALUE_MAPPING = _flatten_morph_codes(_MORPH_CODES_BY_FEATURE)

In [16]:
def map_morph_value(value):
    """Translate one raw feature value into its integer code.

    Integers (and anything equal to -1, the "missing" marker) are returned
    unchanged. Any other value is looked up in ``MORPH_VALUE_MAPPING``; values
    without a code yield -1.
    """
    already_encoded = value == -1 or isinstance(value, int)
    return value if already_encoded else MORPH_VALUE_MAPPING.get(value, -1)


def extract_morph_features(token):
    """Encode a token's (or variant's) POS and morphology as integer codes.

    Returns a dict with one entry per name in ``MORPH_FEATURES``: ``'POS'`` is
    read from the token itself, every other feature from its ``'Morphology'``
    dict. A missing feature, or a missing/``None`` morphology, is encoded as -1.
    """
    # 'Morphology' may be present but None; treat that like an empty dict.
    morphology = token.get('Morphology', {}) or {}

    encoded = {'POS': map_morph_value(token.get('POS', -1))}
    encoded.update(
        (name, map_morph_value(morphology.get(name, -1)))
        for name in MORPH_FEATURES[1:]
    )
    return encoded


def process_sentence(sent):
    """Turn one sentence into feature rows, one per (token, candidate analysis).

    Punctuation tokens produce no rows of their own, but they still count as
    neighbours when the context window of other tokens is built. Each row holds:

    * ``word_id``, ``word``, ``lemma``, ``is_correct`` for the candidate;
    * ``curr_<feature>``: encoded features of the candidate analysis;
    * ``ctx_<offset>_<feature>``: encoded features of the tokens at offsets
      -3..-1 and 1..3, filled with -1 where the window leaves the sentence.

    Tokens without a ``'Vars'`` list contribute a single row built from their
    own annotation, marked as correct.
    """
    tokens = sent['tokens']
    window = [-3, -2, -1, 1, 2, 3]
    rows = []

    for index, token in enumerate(tokens):
        if token['POS'] == 'PUNCT':
            continue

        # The context depends only on the token's position, so it is built once
        # and shared by all candidate rows of this token.
        context = {}
        for offset in window:
            neighbour = index + offset
            if 0 <= neighbour < len(tokens):
                encoded = extract_morph_features(tokens[neighbour])
            else:
                # Outside the sentence: every feature is "missing".
                encoded = dict.fromkeys(MORPH_FEATURES, -1)
            for feat, val in encoded.items():
                context[f'ctx_{offset}_{feat.lower()}'] = val

        # Unambiguous tokens fall back to their own annotation as sole candidate.
        candidates = token.get('Vars', []) or [{
            'Lemma': token['Lemma'],
            'POS': token['POS'],
            'Morphology': token.get('Morphology', {}),
            'IsCorrect': True
        }]

        for candidate in candidates:
            row = {
                'word_id': token['id'],
                'word': token['Word'],
                'lemma': candidate['Lemma'],
                'is_correct': candidate.get('IsCorrect', False)
            }
            row.update(
                (f'curr_{feat.lower()}', val)
                for feat, val in extract_morph_features(candidate).items()
            )
            row.update(context)
            rows.append(row)

    return rows


def create_dataframe(json_data):
    """Build the candidate-level feature table for all sentences.

    Args:
        json_data: list of sentence dicts accepted by ``process_sentence``.

    Returns:
        A DataFrame with one row per (token, candidate analysis) and columns in
        a fixed order: ``word_id``, ``word``, ``lemma``, ``is_correct``, then
        ``curr_<feature>`` for every name in ``MORPH_FEATURES``, then
        ``ctx_<offset>_<feature>`` for offsets -3..-1 and 1..3. When no rows
        are produced (empty input, or sentences containing only punctuation)
        an empty DataFrame with these columns is returned.
    """
    offsets = [-3, -2, -1, 1, 2, 3]

    columns_order = ['word_id', 'word', 'lemma', 'is_correct']
    columns_order += [f'curr_{feat.lower()}' for feat in MORPH_FEATURES]
    columns_order += [f'ctx_{offset}_{feat.lower()}'
                      for offset in offsets
                      for feat in MORPH_FEATURES]

    all_data = [row for sent in json_data for row in process_sentence(sent)]

    # Passing columns= to the constructor selects and orders the columns and,
    # unlike indexing an empty frame with df[columns_order], does not raise
    # KeyError when there are no rows at all.
    return pd.DataFrame(all_data, columns=columns_order)

In [17]:
# Build the feature table: one row per (token, candidate analysis) pair.
disambiguation_ds = create_dataframe(json_data)

In [19]:
# Preview the first rows of the feature table.
disambiguation_ds.head()

Unnamed: 0,word_id,word,lemma,is_correct,curr_pos,curr_animacy,curr_aspect,curr_case,curr_degree,curr_foreign,...,ctx_3_foreign,ctx_3_gender,ctx_3_mood,ctx_3_number,ctx_3_person,ctx_3_polarity,ctx_3_tense,ctx_3_variant,ctx_3_verbform,ctx_3_voice
0,1,В,в,True,2,-1,-1,-1,-1,-1,...,-1,2,-1,1,-1,-1,-1,-1,-1,-1
1,1,В,в1,False,2,-1,-1,-1,-1,-1,...,-1,2,-1,1,-1,-1,-1,-1,-1,-1
2,1,В,в,False,12,-1,-1,-1,-1,1,...,-1,2,-1,1,-1,-1,-1,-1,-1,-1
3,1,В,век,False,8,2,-1,4,-1,-1,...,-1,2,-1,1,-1,-1,-1,-1,-1,-1
4,2,отрывке,отрывка,True,8,2,-1,6,-1,-1,...,-1,2,-1,1,-1,-1,-1,-1,-1,-1


In [18]:
# Save the feature table as CSV in the current working directory.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default; pass index=False if consumers do not expect it - confirm.
disambiguation_ds.to_csv('prepared_dataset.csv')