# Подготовка корпуса Taiga

Формируем и проверяем корпус, затем сохраняем его в `taiga_style_dataset.csv` для `style_boosted_pipeline.ipynb`.


In [1]:
# Базовые импорты (без try/except, дисциплина превыше всего)
from __future__ import annotations
import inspect
import json
import math
import os
import re
import string
import tarfile
import unicodedata
from collections import Counter
from dataclasses import dataclass
from functools import lru_cache
import warnings
from io import StringIO
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple
# Python 3.11 dropped inspect.getargspec, which pymorphy2 still calls;
# re-create it on top of inspect.getfullargspec when it is missing.
if not hasattr(inspect, "getargspec"):
    from collections import namedtuple

    ArgSpec = namedtuple("ArgSpec", ("args", "varargs", "keywords", "defaults"))

    def _getargspec(func):
        """Return the legacy four-field ArgSpec tuple describing ``func``."""
        full = inspect.getfullargspec(func)
        return ArgSpec(
            args=full.args,
            varargs=full.varargs,
            keywords=full.varkw,
            defaults=full.defaults,
        )

    setattr(inspect, "getargspec", _getargspec)
import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
from lightgbm import LGBMClassifier
from pymorphy2 import MorphAnalyzer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.utils import check_random_state
from sklearn.exceptions import ConvergenceWarning
import gensim
from gensim.models import Word2Vec
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from IPython.display import display


In [2]:
# Global warning settings: each entry below is registered as an "ignore" filter.
for _ignore_kwargs in (
    {"message": "pkg_resources is deprecated"},
    {"category": ConvergenceWarning},
    {"message": "No further splits with positive gain"},
):
    warnings.filterwarnings("ignore", **_ignore_kwargs)


## Конфигурация и контрольные параметры
Глобальные параметры, пути к данным и отладочные флаги, используемые при сборке корпуса.


In [3]:
def parse_debug_flag(raw: Optional[str]) -> bool:
    """Interpret an environment value as an on/off switch.

    Unset, empty or whitespace-only values and "0" / "false" / "no"
    (case-insensitive, surrounding whitespace ignored) mean "off"; any other
    value means "on".
    """
    if raw is None:
        return False
    return raw.strip().lower() not in {"", "0", "false", "no"}


# STYLE_PIPELINE_DEBUG toggles the reduced debug configuration. An empty value
# counts as "off", matching how STYLE_PIPELINE_DEBUG_SIZE treats an empty value.
debug_flag = parse_debug_flag(os.environ.get("STYLE_PIPELINE_DEBUG"))
# STYLE_PIPELINE_DEBUG_SIZE optionally overrides the debug sample size;
# a missing or non-integer value falls back to 600.
debug_sample_env = os.environ.get("STYLE_PIPELINE_DEBUG_SIZE", "")
try:
    debug_sample_size = int(debug_sample_env) if debug_sample_env else 600
except ValueError:
    debug_sample_size = 600
# Main configuration dictionary: controls paths, data volumes and hyperparameters.
# Several entries are lowered further down when DEBUG_MODE is enabled.
config: Dict[str, object] = {
    "DATA_DIR": Path("./data"),
    "DATASET_FILENAME": "taiga_style_dataset.csv",
    "TARGET_COLUMN": "label",
    "TEXT_COLUMN": "text",
    "HINT_COLUMN": "style_hint_label",
    "HINT_NAME_COLUMN": "style_hint",
    "HINT_CONFIDENCE_COLUMN": "style_hint_confidence",
    "TEST_SIZE": 0.2,
    "VAL_SIZE": 0.2,
    "MIN_TEXT_LENGTH": 10,
    "RANDOM_STATE": 2024,
    "CACHE_DIR": Path("./cache_boosted"),
    "N_JOBS": os.cpu_count() or 4,
    "REBUILD_DATASET": True,  # True → rebuild the corpus from the archives, False → reuse the existing CSV
    "SEGMENT_MAX_CHARS": 380,
    "SEGMENT_MIN_CHARS": 10,
    "SEGMENT_MAX_PER_FILE": 6,
    "DATASET_BALANCE_PER_CLASS": 8000, #8000
    # Numeric class id → human-readable style name.
    "STYLE_LABELS": {
        0: "разговорный стиль",
        1: "официально-деловой стиль",
    },
    "RAW_SOURCES": [  # per-archive description: path, filters, heuristic hint
        # Subtitles: conversational style (subtitle corpus)
        # {
            # "name": "Subtitles",
            # "kind": "subtitles",
            # "archive": "Subtitles.tar.gz",
            # "metadata_member": "home/tsha/Subtitles/metatable.csv",
            # "text_base": "home/tsha/Subtitles/texts",
            # "style_hint": 0,
            # "hint_confidence": 0.65,
            # "hint_source": "archive",
            # "max_files": 500, #4500
            # "languages": ["ru"],
        # },
        # Interfax: official news
        {
            "name": "Interfax",
            "kind": "interfax",
            "archive": "Interfax.tar.gz",
            "metadata_member": "home/tsha/Interfax/newmetadata.csv",
            "text_base": "home/tsha/Interfax/texts",
            "style_hint": 1,
            "hint_confidence": 0.75,
            "hint_source": "archive",
            "max_files": 10000, #5000
        },
        # Social: social-network posts
        {
            "name": "Social",
            "kind": "social",
            "archive": "social.tar.gz",
            "text_members": [
                "home/tsha/social/texts/vktexts.txt",
                "home/tsha/social/texts/fbtexts.txt",
                "home/tsha/social/texts/LiveJournalPostsandcommentsGICR.txt",
                "home/tsha/social/texts/twtexts.txt",
            ],
            "style_hint": 0,
            "hint_confidence": 0.6,
            "hint_source": "archive",
            "max_records": 6000, #15000
        },
        # Arzamas: popular-science texts
        # {
            # "name": "Arzamas",
            # "kind": "arzamas",
            # "archive": "Arzamas.tar.gz",
            # "metadata_member": "home/tsha/Arzamas/metatable.csv",
            # "text_base": "home/tsha/Arzamas/texts",
            # "style_hint": 1,
            # "hint_confidence": 0.7,
            # "hint_source": "archive",
            # "max_files": 600, # 4000
        # },
    ],
    "MAX_FEATURES_WORD": 120000,
    "MAX_FEATURES_CHAR": 60000,
    "WORD_NGRAM_RANGE": (1, 3),
    "CHAR_NGRAM_RANGE": (3, 5),
    "MIN_DF_WORD": 3,
    "MIN_DF_CHAR": 2,
    "TF_SUBLINEAR": True,
    "LOGREG_SEARCH_ITER": 10, #30
    "SGD_SEARCH_ITER": 10, #25
    "CV_FOLDS": 5,
    "MAX_ITER_LINEAR": 200,
    "WORD2VEC_DIM": 200,
    "WORD2VEC_WINDOW": 5,
    "WORD2VEC_MIN_COUNT": 2,
    "SEQUENCE_MAX_LEN": 256,
    "BATCH_SIZE": 64, #128
    "SEQUENCE_EPOCHS": 10, #10-12
    "LEARNING_RATE": 1e-3,
    "EMBEDDING_DROPOUT": 0.3,
    "GRAD_CLIP": 1.5,
    "ENSEMBLE_WEIGHT_GRID": 21,
    "TEXTCNN_KERNEL_SIZES": [2, 3, 4],
    "TEXTCNN_FILTERS": 256,
    "BILSTM_HIDDEN_SIZE": 192,
    "BILSTM_NUM_LAYERS": 2,
    "LGBM_N_ESTIMATORS": 800,
    "LGBM_LEARNING_RATE": 0.05,
    "LGBM_SUBSAMPLE": 0.8,
    "LGBM_COLSAMPLE": 0.8,
    "LGBM_REG_ALPHA": 0.1,
    "LGBM_REG_LAMBDA": 0.3,
    "EXTRATREES_N_ESTIMATORS": 600,
    "EXTRATREES_MIN_SAMPLES_SPLIT": 5,
    "EXTRATREES_MIN_SAMPLES_LEAF": 2,
    # Artifact locations; all but ENSEMBLE_ERRORS_CSV live under ./cache_boosted.
    "FEATURE_UNION_PKL": Path("./cache_boosted/feature_union.joblib"),
    "LOGREG_PKL": Path("./cache_boosted/logreg_elasticnet.joblib"),
    "SGD_PKL": Path("./cache_boosted/sgd_elasticnet.joblib"),
    "WORD2VEC_MODEL": Path("./cache_boosted/word2vec.model"),
    "RESULTS_CSV": Path("./cache_boosted/model_results.csv"),
    "TEXTCNN_STATE_PATH": Path("./cache_boosted/textcnn_state.pt"),
    "BILSTM_STATE_PATH": Path("./cache_boosted/bilstm_state.pt"),
    "LABEL_INDEX_JSON": Path("./cache_boosted/label_index.json"),
    "SEQUENCE_VOCAB_JSON": Path("./cache_boosted/sequence_vocab.json"),
    "ENSEMBLE_ERRORS_CSV": Path("./data/misclassified_ensemble.csv"),
    # Debug switches derived from the STYLE_PIPELINE_DEBUG* environment variables.
    "DEBUG_MODE": debug_flag,
    "DEBUG_SAMPLE_SIZE": debug_sample_size,
}
# A non-positive debug sample size is replaced with the default of 600.
if config["DEBUG_SAMPLE_SIZE"] <= 0:
    config["DEBUG_SAMPLE_SIZE"] = 600

if config["DEBUG_MODE"]:
    # Upper bounds applied to the expensive settings while debugging;
    # each setting keeps its configured value when that is already smaller.
    _debug_caps = {
        "LOGREG_SEARCH_ITER": 5,
        "SGD_SEARCH_ITER": 5,
        "CV_FOLDS": 3,
        "MAX_ITER_LINEAR": 100,
        "MAX_FEATURES_WORD": 30000,
        "MAX_FEATURES_CHAR": 15000,
        "WORD2VEC_WINDOW": 3,
        "SEQUENCE_MAX_LEN": 128,
        "BATCH_SIZE": 64,
        "SEQUENCE_EPOCHS": 3,
        "ENSEMBLE_WEIGHT_GRID": 11,
        "LGBM_N_ESTIMATORS": 200,
        "EXTRATREES_N_ESTIMATORS": 200,
        "TEXTCNN_FILTERS": 128,
        "BILSTM_HIDDEN_SIZE": 128,
        "DATASET_BALANCE_PER_CLASS": 600,
    }
    for _setting, _cap in _debug_caps.items():
        config[_setting] = min(_cap, config[_setting])

    # These two are forced to fixed values rather than capped.
    config["WORD2VEC_MIN_COUNT"] = 1
    config["N_JOBS"] = 1

    # Shrink the per-source limits: files are capped at the debug sample size,
    # records at twice that size.
    for source in config["RAW_SOURCES"]:
        if "max_files" in source:
            source["max_files"] = min(source["max_files"], config["DEBUG_SAMPLE_SIZE"])
        if "max_records" in source:
            source["max_records"] = min(source["max_records"], config["DEBUG_SAMPLE_SIZE"] * 2)

# Make sure the working directories exist before anything is written to them.
for _dir_key in ("DATA_DIR", "CACHE_DIR"):
    config[_dir_key].mkdir(parents=True, exist_ok=True)

# Path to the CSV holding the prepared corpus.
config["DATASET_CSV"] = config["DATA_DIR"] / config["DATASET_FILENAME"]

# Resolve every archive name to a path; relative names are taken from DATA_DIR.
for source in config["RAW_SOURCES"]:
    archive_path = Path(source["archive"])
    archive_path = archive_path if archive_path.is_absolute() else config["DATA_DIR"] / archive_path
    source["archive_path"] = archive_path

# Random number generator for reproducibility.
random_state = check_random_state(config["RANDOM_STATE"])


In [4]:
# Функции подготовки корпуса вынесены в src.data_prep
from src.data_prep import (
    build_dataset_from_sources,
    compute_corpus_summary,
    write_corpus_report,
)


In [5]:
# Reuse the stored CSV only when no rebuild is requested and the file exists;
# otherwise rebuild the corpus from the raw sources and overwrite the CSV.
if not config.get("REBUILD_DATASET") and config["DATASET_CSV"].exists():
    print(f"Используем существующий датасет: {config['DATASET_CSV']}")
else:
    dataset_built = build_dataset_from_sources(config)
    dataset_built.to_csv(config["DATASET_CSV"], index=False)
    print(f"Датасет собран заново: {len(dataset_built)} строк -> {config['DATASET_CSV']}")
    # NOTE(review): SOURCE_STATS is not set in this notebook; presumably
    # build_dataset_from_sources stores (name, size) pairs there — confirm.
    for name, size in config.get("SOURCE_STATS", []):
        print(f"  {name}: {size}")


Output()

Датасет собран заново: 16000 строк -> data/taiga_style_dataset.csv
  Interfax: 27255
  Social: 306206


## Загрузка и первичная проверка корпуса

Проверяем обязательные столбцы, очищаем пропуски и фиксируем исходную статистику корпуса.


In [6]:

# Stop early with a clear message when the corpus CSV is missing.
if not config["DATASET_CSV"].exists():
    raise FileNotFoundError(
        f"Не найден файл {config['DATASET_CSV']}. Скопируйте датасет и запустите ячейку повторно."
    )

df = pd.read_csv(config["DATASET_CSV"], encoding="utf-8")

# A corpus without labels is accepted: the target column is created and
# filled with the placeholder value -1.
if config["TARGET_COLUMN"] not in df.columns:
    df[config["TARGET_COLUMN"]] = -1
    print(
        f"Колонка {config['TARGET_COLUMN']} отсутствует в корпусе. Заполнено значением -1 (будет проставлено после авторазметки)."
    )

# The text column, by contrast, is mandatory.
required_columns = {config["TEXT_COLUMN"]}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"В датасете отсутствуют необходимые колонки: {sorted(missing_columns)}")

# Drop rows with no text and make sure every remaining text is a string.
df = df.dropna(subset=[config["TEXT_COLUMN"]]).astype({config["TEXT_COLUMN"]: str})

# Running statistics about the corpus; starts with the row count after loading.
corpus_stats: Dict[str, object] = {"initial_rows": int(len(df))}


Колонка label отсутствует в корпусе. Заполнено значением -1 (будет проставлено после авторазметки).


## Очистка корпуса и оценка шума

Удаляем дубликаты и короткие тексты, при необходимости формируем отладочную выборку и обновляем показатели `corpus_stats`.


In [7]:

# Drop exact duplicate texts, keeping the first occurrence of each.
deduplicated = df.drop_duplicates(subset=[config["TEXT_COLUMN"]], keep="first").copy()
removed_duplicates = corpus_stats["initial_rows"] - len(deduplicated)

# Drop texts shorter than MIN_TEXT_LENGTH characters.
length_mask = deduplicated[config["TEXT_COLUMN"]].str.len() >= config["MIN_TEXT_LENGTH"]
clean_df = deduplicated.loc[length_mask].copy()
removed_invalid = len(deduplicated) - len(clean_df)

clean_df.reset_index(drop=True, inplace=True)

corpus_stats["removed_duplicates"] = int(removed_duplicates)
corpus_stats["removed_invalid"] = int(removed_invalid)
corpus_stats["final_rows"] = int(len(clean_df))
# Share of the loaded rows that were removed as duplicates or as too short.
# An empty corpus has nothing to remove, so report 0.0 instead of dividing by zero.
initial_rows = corpus_stats["initial_rows"]
corpus_stats["noise_share"] = (
    round((removed_duplicates + removed_invalid) / initial_rows, 4) if initial_rows else 0.0
)

# In debug mode, work on a reproducible random subset of the cleaned corpus.
# "final_rows" above still reports the size before this sampling.
if config["DEBUG_MODE"] and len(clean_df) > config["DEBUG_SAMPLE_SIZE"]:
    sample_size = max(1, min(config["DEBUG_SAMPLE_SIZE"], len(clean_df)))
    clean_df = clean_df.sample(n=sample_size, random_state=config["RANDOM_STATE"], replace=False).reset_index(drop=True)
    corpus_stats["debug_sample_size"] = int(sample_size)


## Итоговая проверка корпуса

Считаем метрики по корпусу, выводим ключевые срезы и сохраняем отчёты в каталоге `reports`.


In [8]:

def _keyed_frame(mapping, axis_name):
    """Build a frame whose rows are the keys of ``mapping``, with the index named ``axis_name``."""
    return pd.DataFrame.from_dict(mapping, orient="index").rename_axis(axis_name)


# Compute the corpus summary on the cleaned frame.
summary = compute_corpus_summary(
    clean_df,
    text_column=config["TEXT_COLUMN"],
    label_column=config["TARGET_COLUMN"],
    hint_column=config.get("HINT_COLUMN"),
    source_stats=config.get("SOURCE_STATS"),
)

# One-row overview of the headline counters.
headline = pd.Series(
    {field: summary[field] for field in ("rows", "unique_texts", "duplicates_removed")},
    name="corpus",
)
display(headline.to_frame().T)

# One-row table with the text-length statistics.
length_stats = pd.Series(summary["length"], name="length").to_frame().T
display(length_stats)

# The hint distribution is shown only when the summary contains one.
if summary.get("hints"):
    hint_frame = _keyed_frame(summary["hints"], "hint_label")
    display(hint_frame)

label_frame = _keyed_frame(summary["labels"], "label")
display(label_frame)

# Write the report files and remember where they were written.
reports_dir = config["CACHE_DIR"] / "reports"
report_paths = write_corpus_report(summary, reports_dir, report_name=config["DATASET_CSV"].stem)
corpus_stats["report_paths"] = {kind: str(path) for kind, path in report_paths.items()}

print("Отчёты сохранены в:")
for kind, path in report_paths.items():
    print(f"  {kind}: {path}")


Unnamed: 0,rows,unique_texts,duplicates_removed
corpus,16000,16000,0


Unnamed: 0,min,max,mean,median,p95
length,12.0,380.0,222.575938,234.0,368.0


Unnamed: 0_level_0,count,share
hint_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8000,0.5
1,8000,0.5


Unnamed: 0_level_0,count,share
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,16000,1.0


Отчёты сохранены в:
  json: cache_boosted/reports/taiga_style_dataset_report.json
  markdown: cache_boosted/reports/taiga_style_dataset_report.md
  csv: cache_boosted/reports/taiga_style_dataset_metrics.csv
