In [1]:
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# Install spaCy and download the ru_core_news_lg model package into the runtime.
!pip install spacy
!python -m spacy download ru_core_news_lg
import spacy
import ru_core_news_lg
# Shared pipeline object: later cells call it as nlp(text).
nlp = ru_core_news_lg.load()

Collecting ru-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.8.0/ru_core_news_lg-3.8.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.8.0)
  Downloading pymorphy3-2.0.6-py3-none-any.whl.metadata (2.4 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-lg==3.8.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.8.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.6-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.9/53.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy

In [4]:
import re

In [95]:
from sklearn.model_selection import train_test_split
import numpy as np

# Сбор данных

## Данные из UniversalCEFR

In [5]:
# UniversalCEFR (readme_ru): keep only the two columns every other source
# in this notebook also provides.
df_universalCEFR = pd.read_json(
    "hf://datasets/UniversalCEFR/readme_ru/readme_ru.json"
).loc[:, ['text', 'cefr_level']]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Данные из RuAdapt

In [7]:
# Fetch the RuAdapt repository; the cells below read CSV files from /content/RuAdapt/.
!git clone https://github.com/Digital-Pushkin-Lab/RuAdapt.git

Cloning into 'RuAdapt'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 83 (delta 47), reused 75 (delta 42), pack-reused 0 (from 0)[K
Receiving objects: 100% (83/83), 4.15 MiB | 9.91 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [8]:
def prepare_cefr_training_samples(df, min_similarity=0.6):
    """
    Build CEFR-classifier training rows from an aligned-sentence table.

    Rows whose ``cos_sim`` is strictly greater than ``min_similarity`` are
    kept, and their ``target`` / ``level`` columns are returned under the
    names ``text`` / ``cefr_level``.

    Args:
        df: source DataFrame with the columns ``cos_sim``, ``target`` and ``level``
        min_similarity: cosine-similarity threshold (exclusive lower bound)

    Returns:
        New DataFrame with columns ['text', 'cefr_level'] and a fresh 0..n-1 index
    """
    similar_enough = df[df['cos_sim'] > min_similarity]

    return (
        similar_enough[['target', 'level']]
        .rename(columns={'target': 'text', 'level': 'cefr_level'})
        .reset_index(drop=True)
    )

In [9]:
# Обработка подкорпуса сказок
df_fairytales = pd.read_csv('/content/RuAdapt/Fairytales/df_fairytales_sent.csv')
df_fairytales = prepare_cefr_training_samples(df_fairytales)
# Однозначное определение уровня (берем более низкий уровень)
df_fairytales['cefr_level'] = df_fairytales['cefr_level'].str.split(',').str[0].str.strip()

In [11]:
# Обработка подкорпуса адаптированной литературы
df_adapted_literature = pd.read_csv('/content/RuAdapt/Adapted_literature/zlatoust_sentence_aligned_with_CATS.csv')
df_adapted_literature = prepare_cefr_training_samples(df_adapted_literature)
# Однозначное определение уровня (берем более низкий уровень)
df_adapted_literature['cefr_level'] = df_adapted_literature['cefr_level'].str.split('_').str[0].str.upper().str.strip()

## Данные из FluencyDrop

In [13]:
def split_into_sents(texts):
    """
    Split each text into sentences with the notebook-level pipeline ``nlp``.

    Args:
        texts: iterable of strings; each one is passed to ``nlp`` separately

    Returns:
        Flat list with the whitespace-stripped text of every sentence,
        in the order of the input texts
    """
    return [
        sent.text.strip()
        for text in tqdm(texts, desc="Разбиение текстов на предложения")
        for sent in nlp(text).sents
    ]

In [15]:
# C1 texts: collapse doubled newlines, then treat every line as one text.
with open('/content/texts_c1.txt', encoding='utf-8') as file:
    texts_c1 = file.read().replace('\n\n', '\n').split('\n')

# Sentence splitting works on the in-memory list, so the file is closed first.
sents_c1 = split_into_sents(texts_c1)

Разбиение текстов на предложения:   0%|          | 0/226 [00:00<?, ?it/s]

In [16]:
# One row per sentence; the whole batch carries the label 'C1'.
df_c1 = pd.DataFrame({'text': sents_c1}).assign(cefr_level='C1')

In [17]:
# C2 texts: collapse doubled newlines, then treat every line as one text.
with open('/content/texts_c2.txt', encoding='utf-8') as file:
    texts_c2 = file.read().replace('\n\n', '\n').split('\n')

# Sentence splitting works on the in-memory list, so the file is closed first.
sents_c2 = split_into_sents(texts_c2)

Разбиение текстов на предложения:   0%|          | 0/222 [00:00<?, ?it/s]

In [18]:
# One row per sentence; the whole batch carries the label 'C2'.
df_c2 = pd.DataFrame({'text': sents_c2}).assign(cefr_level='C2')

In [19]:
# A1 texts: collapse doubled newlines, then treat every line as one text.
with open('/content/texts_a1.txt', encoding='utf-8') as file:
    texts_a1 = file.read().replace('\n\n', '\n').split('\n')

# Sentence splitting works on the in-memory list, so the file is closed first.
sents_a1 = split_into_sents(texts_a1)

Разбиение текстов на предложения:   0%|          | 0/243 [00:00<?, ?it/s]

In [20]:
# One row per sentence; the whole batch carries the label 'A1'.
df_a1 = pd.DataFrame({'text': sents_a1}).assign(cefr_level='A1')

## Данные книг и пособий

In [21]:
def remove_chapter_headers(text):
    """
    Drop blank lines and header-like lines from a newline-separated text.

    A line (after stripping surrounding whitespace) is treated as a header
    when, from its first character, it matches one of:
      * a run of Roman-numeral letters and/or digits, optionally followed by
        a dot, whitespace and one Cyrillic letter;
      * one of the words ГЛАВА / ЧАСТЬ / ДЕЙСТВИЕ / КАРТИНА / ЗАПИСЬ as a whole word;
      * three or more Cyrillic letters / whitespace characters making up the
        entire line.
    The pattern is compiled with re.IGNORECASE, so all three alternatives
    are case-insensitive.

    Args:
        text: input string; lines are separated by '\\n'

    Returns:
        The kept lines (unstripped, in original order) joined with '\\n'
    """
    header_pattern = re.compile(
        r'^(?:'
        r'[IVXLCDM\d]+\.?\s*[А-ЯЁ]?'
        r'|(?:ГЛАВА|ЧАСТЬ|ДЕЙСТВИЕ|КАРТИНА|ЗАПИСЬ)\b'
        r'|[А-ЯЁ\s]{3,}$'
        r')',
        re.IGNORECASE
    )

    def is_body_line(line):
        # The test runs on the stripped line, but the original line is what is kept.
        stripped = line.strip()
        return bool(stripped) and header_pattern.match(stripped) is None

    return '\n'.join(filter(is_body_line, text.split('\n')))

In [22]:
# the_12_chairs.txt: collapse doubled newlines while the file is open.
with open('/content/the_12_chairs.txt', encoding='utf-8') as file:
    twelve_chairs_text = file.read().replace('\n\n', '\n')

# The remaining steps work on the in-memory string only.
twelve_chairs_text = remove_chapter_headers(twelve_chairs_text)  # drop blank / header-like lines
twelve_chairs_par = twelve_chairs_text.split('\n')               # one paragraph per kept line
twelve_chairs_sents = split_into_sents(twelve_chairs_par)

Разбиение текстов на предложения:   0%|          | 0/3737 [00:00<?, ?it/s]

In [23]:
# One row per sentence from the_12_chairs.txt; all rows are labelled 'C2'.
df_the_12_chairs = pd.DataFrame({'text': twelve_chairs_sents}).assign(cefr_level='C2')

In [24]:
# we.txt: collapse doubled newlines and turn non-breaking spaces into ordinary
# ones. The non-breaking space is spelled as the escape '\xa0' because the
# literal character cannot be told apart from a plain space in the source.
with open('/content/we.txt', 'r', encoding='utf-8') as file:
    we_text = file.read().replace('\n\n', '\n').replace('\xa0', ' ')

# The remaining steps work on the in-memory string only.
we_text = remove_chapter_headers(we_text)  # drop blank / header-like lines
we_par = we_text.split('\n')               # one paragraph per kept line
we_sents = split_into_sents(we_par)

Разбиение текстов на предложения:   0%|          | 0/3806 [00:00<?, ?it/s]

In [25]:
# One row per sentence from we.txt; all rows are labelled 'C1'.
df_we = pd.DataFrame({'text': we_sents}).assign(cefr_level='C1')

In [26]:
# idiot.txt: collapse doubled newlines and turn non-breaking spaces into
# ordinary ones. The non-breaking space is spelled as the escape '\xa0' because
# the literal character cannot be told apart from a plain space in the source.
with open('/content/idiot.txt', 'r', encoding='utf-8') as file:
    idiot_text = file.read().replace('\n\n', '\n').replace('\xa0', ' ')

# The remaining steps work on the in-memory string only.
idiot_text = remove_chapter_headers(idiot_text)  # drop blank / header-like lines
idiot_par = idiot_text.split('\n')               # one paragraph per kept line
idiot_sents = split_into_sents(idiot_par)

Разбиение текстов на предложения:   0%|          | 0/4948 [00:00<?, ?it/s]

In [27]:
# One row per sentence from idiot.txt; all rows are labelled 'B2'.
df_idiot = pd.DataFrame({'text': idiot_sents}).assign(cefr_level='B2')

In [31]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/stories_in_easy_russian.txt', 'r', encoding='utf-8') as file:
    stories_in_easy_russian_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
stories_in_easy_russian_text = remove_chapter_headers(stories_in_easy_russian_text)
stories_in_easy_russian_par = stories_in_easy_russian_text.split('\n')
stories_in_easy_russian_sents = split_into_sents(stories_in_easy_russian_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
# One row per sentence from stories_in_easy_russian.txt; all rows are labelled 'A2'.
df_stories_in_easy_russian = (
    pd.DataFrame({'text': stories_in_easy_russian_sents})
    .assign(cefr_level='A2')
)

In [34]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/texts_rki_a2.txt', 'r', encoding='utf-8') as file:
    rki_a2_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
rki_a2_text = remove_chapter_headers(rki_a2_text)
rki_a2_par = rki_a2_text.split('\n')
rki_a2_sents = split_into_sents(rki_a2_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
# One row per sentence from texts_rki_a2.txt; all rows are labelled 'A2'.
df_rki_a2 = pd.DataFrame({'text': rki_a2_sents}).assign(cefr_level='A2')

In [40]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/texts_rki_a1.txt', 'r', encoding='utf-8') as file:
    rki_a1_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
rki_a1_text = remove_chapter_headers(rki_a1_text)
rki_a1_par = rki_a1_text.split('\n')
rki_a1_sents = split_into_sents(rki_a1_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [41]:
# One row per sentence from texts_rki_a1.txt; all rows are labelled 'A1'.
df_rki_a1 = pd.DataFrame({'text': rki_a1_sents}).assign(cefr_level='A1')

In [48]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/nachinaem_chitat_po_russki.txt', 'r', encoding='utf-8') as file:
    nachinaem_chitat_po_russki_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
nachinaem_chitat_po_russki_text = remove_chapter_headers(nachinaem_chitat_po_russki_text)
nachinaem_chitat_po_russki_par = nachinaem_chitat_po_russki_text.split('\n')
nachinaem_chitat_po_russki_sents = split_into_sents(nachinaem_chitat_po_russki_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [49]:
# One row per sentence from nachinaem_chitat_po_russki.txt; all rows are labelled 'A1'.
df_nachinaem_chitat_po_russki = (
    pd.DataFrame({'text': nachinaem_chitat_po_russki_sents})
    .assign(cefr_level='A1')
)

In [57]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/texts_for_reading_basic_level.txt', 'r', encoding='utf-8') as file:
    basic_level_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
basic_level_text = remove_chapter_headers(basic_level_text)
basic_level_par = basic_level_text.split('\n')
basic_level_sents = split_into_sents(basic_level_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
# One row per sentence from texts_for_reading_basic_level.txt; all rows are labelled 'A2'.
df_basic_level = pd.DataFrame({'text': basic_level_sents}).assign(cefr_level='A2')

In [62]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/pa-russki_a1.txt', 'r', encoding='utf-8') as file:
    pa_russki_a1_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
pa_russki_a1_text = remove_chapter_headers(pa_russki_a1_text)
pa_russki_a1_par = pa_russki_a1_text.split('\n')
pa_russki_a1_sents = split_into_sents(pa_russki_a1_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [67]:
# One row per sentence from pa-russki_a1.txt; all rows are labelled 'A1'.
df_pa_russki_a1 = pd.DataFrame({'text': pa_russki_a1_sents}).assign(cefr_level='A1')

In [71]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/pa-russki_a2.txt', 'r', encoding='utf-8') as file:
    pa_russki_a2_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
pa_russki_a2_text = remove_chapter_headers(pa_russki_a2_text)
pa_russki_a2_par = pa_russki_a2_text.split('\n')
pa_russki_a2_sents = split_into_sents(pa_russki_a2_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [77]:
# One row per sentence from pa-russki_a2.txt; all rows are labelled 'A2'.
df_pa_russki_a2 = pd.DataFrame({'text': pa_russki_a2_sents}).assign(cefr_level='A2')

In [75]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/anylang_a1.txt', 'r', encoding='utf-8') as file:
    anylang_a1_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
anylang_a1_text = remove_chapter_headers(anylang_a1_text)
anylang_a1_par = anylang_a1_text.split('\n')
anylang_a1_sents = split_into_sents(anylang_a1_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [78]:
# One row per sentence from anylang_a1.txt; all rows are labelled 'A1'.
df_anylang_a1 = pd.DataFrame({'text': anylang_a1_sents}).assign(cefr_level='A1')

In [81]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/anylang_a2.txt', 'r', encoding='utf-8') as file:
    anylang_a2_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
anylang_a2_text = remove_chapter_headers(anylang_a2_text)
anylang_a2_par = anylang_a2_text.split('\n')
anylang_a2_sents = split_into_sents(anylang_a2_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [82]:
# One row per sentence from anylang_a2.txt; all rows are labelled 'A2'.
df_anylang_a2 = pd.DataFrame({'text': anylang_a2_sents}).assign(cefr_level='A2')

In [86]:
# Read the file as a single line: every newline becomes a space, and
# non-breaking spaces become ordinary ones. The non-breaking space is spelled
# as the escape '\xa0' because the literal character cannot be told apart from
# a plain space in the source.
with open('/content/udivitelnye_istorii_a1.txt', 'r', encoding='utf-8') as file:
    udivitelnye_istorii_a1_text = file.read().replace('\n', ' ').replace('\xa0', ' ')

# NOTE(review): no '\n' is left at this point, so remove_chapter_headers and
# split('\n') both see exactly one line - the header filter keeps or drops the
# text as a whole. Confirm that this is intended.
udivitelnye_istorii_a1_text = remove_chapter_headers(udivitelnye_istorii_a1_text)
udivitelnye_istorii_a1_par = udivitelnye_istorii_a1_text.split('\n')
udivitelnye_istorii_a1_sents = split_into_sents(udivitelnye_istorii_a1_par)

Разбиение текстов на предложения:   0%|          | 0/1 [00:00<?, ?it/s]

In [87]:
# One row per sentence from udivitelnye_istorii_a1.txt; all rows are labelled 'A1'.
df_udivitelnye_istorii_a1 = (
    pd.DataFrame({'text': udivitelnye_istorii_a1_sents})
    .assign(cefr_level='A1')
)

# Финальный датасет

In [88]:
# Stack every source frame into one table with a fresh 0..n-1 index.
data = pd.concat(
    [
        # UniversalCEFR and RuAdapt
        df_universalCEFR,
        df_adapted_literature,
        df_fairytales,
        # FluencyDrop
        df_c1,
        df_c2,
        df_a1,
        # books and textbooks
        df_the_12_chairs,
        df_we,
        df_idiot,
        df_stories_in_easy_russian,
        df_rki_a2,
        df_rki_a1,
        df_nachinaem_chitat_po_russki,
        df_basic_level,
        df_pa_russki_a1,
        df_pa_russki_a2,
        df_anylang_a1,
        df_anylang_a2,
        df_udivitelnye_istorii_a1,
    ],
    ignore_index=True,
)

In [84]:
def filter_by_word_tokens(texts, min_tokens=3):
    """
    Keep only the texts that contain more than ``min_tokens`` word tokens.

    Each text is run through the notebook-level pipeline ``nlp``; tokens
    flagged as punctuation or whitespace are not counted.

    Args:
        texts: list of texts to filter
        min_tokens: a text is kept only if its word-token count is strictly
            greater than this value

    Returns:
        List of the texts that passed, in their original order
    """

    def count_words(doc):
        # A token counts as a word when it is neither punctuation nor whitespace.
        return sum(1 for token in doc if not (token.is_punct or token.is_space))

    return [
        text
        for text in tqdm(texts, desc="Фильтрация текстов")
        if count_words(nlp(text)) > min_tokens
    ]

In [90]:
# Texts with more than 3 word tokens.
filtered_texts = filter_by_word_tokens(data['text'].tolist(), min_tokens=3)

# Keep the rows whose text passed the filter and renumber them from 0.
passed_filter = data['text'].isin(filtered_texts)
filtered_data = data.loc[passed_filter].reset_index(drop=True)

Фильтрация текстов:   0%|          | 0/63210 [00:00<?, ?it/s]

In [91]:
# Очистка от дубликатов
filtered_data = filtered_data.drop_duplicates(subset=['text', 'cefr_level'])
print(f'Всего строк в датасете после удаления дубликатов: {len(filtered_data)}')

Всего строк в датасете после удаления дубликатов: 50118


In [120]:
# Size of the rarest CEFR class: every class is cut down to this many rows.
min_size = filtered_data['cefr_level'].value_counts().min()

# Balancing: undersample each level to min_size rows.
# The groups are iterated explicitly instead of going through
# DataFrameGroupBy.apply: apply emits a deprecation warning when the function
# operates on the grouping column, and newer pandas releases exclude that
# column, which would drop 'cefr_level' from the result. Each group is still
# sampled with its own random_state=42 and the groups are concatenated in
# sorted key order, so the selected rows are the same as before.
balanced_df = pd.concat(
    [
        group.sample(n=min_size, random_state=42) if len(group) > min_size else group
        for _, group in filtered_data.groupby('cefr_level')
    ],
    ignore_index=True,
)

# Shuffle the rows so the levels are interleaved.
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("РАСПРЕДЕЛЕНИЕ ПО УРОВНЯМ CEFR")
print("="*30)

# Rows per level, ordered by level name.
levels = balanced_df['cefr_level'].value_counts().sort_index()

for level, count in levels.items():
    print(f"  {level} {count:>6,}")

print("="*30)
print(f" Всего {len(balanced_df):>6,} примеров")

РАСПРЕДЕЛЕНИЕ ПО УРОВНЯМ CEFR
  A1  4,409
  A2  4,409
  B1  4,409
  B2  4,409
  C1  4,409
  C2  4,409
 Всего 26,454 примеров


  balanced_df = filtered_data.groupby('cefr_level').apply(


## Разделение на train-val-test

In [101]:
# One seed drives the shuffle and both splits.
SPLIT_SEED = 42

df = balanced_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# 80/10/10 split, stratified by CEFR level.
# Step 1: 80% goes to train, the remaining 20% is held out.
train_df, test_val_df = train_test_split(
    df, train_size=0.8, stratify=df['cefr_level'], random_state=SPLIT_SEED
)

# Step 2: the held-out part is halved into test and validation.
test_df, val_df = train_test_split(
    test_val_df, test_size=0.5, stratify=test_val_df['cefr_level'], random_state=SPLIT_SEED
)

In [105]:
# Renumber each split from 0; the old shuffled index is discarded.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

In [112]:
# Overall split sizes: absolute row counts and share of the full balanced set.
summary_lines = [
    "="*60,
    "ОБЩАЯ СТАТИСТИКА",
    "="*60,
    f"{'Выборка':<15} {'Примеров':>10} {'% от общего':>15}",
    "-"*60,
    f"{'Всего':<15} {len(df):>10,} {'100.0%':>12}",
    f"{'Train':<15} {len(train_df):>10,} {len(train_df)/len(df)*100:>11.1f}%",
    f"{'Validation':<15} {len(val_df):>10,} {len(val_df)/len(df)*100:>11.1f}%",
    f"{'Test':<15} {len(test_df):>10,} {len(test_df)/len(df)*100:>11.1f}%",
]
print("\n".join(summary_lines))

ОБЩАЯ СТАТИСТИКА
Выборка           Примеров     % от общего
------------------------------------------------------------
Всего               26,454       100.0%
Train               21,163        80.0%
Validation           2,646        10.0%
Test                 2,645        10.0%


In [111]:
# Детальная статистика по классам (CEFR уровням)
print("="*60)
print("ДЕТАЛЬНАЯ СТАТИСТИКА ПО КЛАССАМ (CEFR УРОВНИ)")
print("="*60)

all_levels = sorted(df['cefr_level'].unique())
stats_data = []

for level in all_levels:
    total_count = len(df[df['cefr_level'] == level])
    train_count = len(train_df[train_df['cefr_level'] == level])
    val_count = len(val_df[val_df['cefr_level'] == level])
    test_count = len(test_df[test_df['cefr_level'] == level])

    total_percent = (total_count / len(df)) * 100
    train_percent = (train_count / total_count * 100) if total_count > 0 else 0
    val_percent = (val_count / total_count * 100) if total_count > 0 else 0
    test_percent = (test_count / total_count * 100) if total_count > 0 else 0

    stats_data.append({
        'CEFR': level,
        'Всего': total_count,
        'Train': train_count,
        'Validation': val_count,
        'Test': test_count,
    })

stats_df = pd.DataFrame(stats_data)

pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
print(stats_df.to_string(index=False))

ДЕТАЛЬНАЯ СТАТИСТИКА ПО КЛАССАМ (CEFR УРОВНИ)
CEFR  Всего  Train  Validation  Test
  A1   4409   3528         441   440
  A2   4409   3527         441   441
  B1   4409   3527         441   441
  B2   4409   3527         441   441
  C1   4409   3527         441   441
  C2   4409   3527         441   441


In [121]:
# Write each split to its own CSV in the working directory, without the index column.
for csv_name, split_frame in (
    ('train_dataset.csv', train_df),
    ('val_dataset.csv', val_df),
    ('test_dataset.csv', test_df),
):
    split_frame.to_csv(csv_name, index=False)