In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from transformers import AutoTokenizer
from common import (
    FRED_T5_MODEL_NAME,
    INPUT_TOKEN_LIMIT_FRED_T5,
    TASK_PROMPT,
    MIN_TEXT_TOKENS_FOR_SUMMARIZATION,
    GEMINI_API_KEY,
    GEMINI_MODEL_NAME_PRIMARY
)
import common
from datasets import load_dataset
import google.generativeai as genai
import os

HF_DATASET_NAME = "cointegrated/taiga_stripped_rest"
HF_DATASET_SPLIT = "NPlus1"
OUTPUT_JSONL_FILE = "nplus1_generated_summaries.jsonl"
PROCESSED_INDICES_FILE = "nplus1_processed_indices.txt"
FINAL_HF_DATASET_PATH = "nplus1_gemini"

tokenizer = AutoTokenizer.from_pretrained(FRED_T5_MODEL_NAME)

In [2]:
tokens_in_task_prompt_fred_t5 = len(tokenizer(TASK_PROMPT, add_special_tokens=False)['input_ids'])
NUM_SPECIAL_TOKENS_FOR_FRED_T5_INPUT = 1
MAX_TEXT_TOKENS_FOR_FRED_T5_COMPATIBLE_INPUT = INPUT_TOKEN_LIMIT_FRED_T5 - \
                                               tokens_in_task_prompt_fred_t5 - \
                                               NUM_SPECIAL_TOKENS_FOR_FRED_T5_INPUT

Полный лимит входа FRED-T5: 1024
Токены в TASK_PROMPT_FRED_T5_FOR_TRAINING ('<LM> Сократи текст: '): 6
Специальные токены FRED-T5 для входа: 1
Макс. токены для чистого текста (чтобы уместиться в FRED-T5 с его промптом): 1017


In [5]:
raw_dataset = load_dataset(HF_DATASET_NAME, split=HF_DATASET_SPLIT)
print(f"Loaded '{HF_DATASET_SPLIT}' split with {len(raw_dataset)} examples.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded 'NPlus1' split with 7000 examples.


In [6]:
prepared_dataset_for_gemini = common.preprocess_and_filter_dataset_with_exact_deduplication(
    raw_dataset,
    tokenizer,
    MAX_TEXT_TOKENS_FOR_FRED_T5_COMPATIBLE_INPUT,
    MIN_TEXT_TOKENS_FOR_SUMMARIZATION
)

Starting preprocessing. Initial size: 7000
Targeting max text tokens for processing: 1017


Cleaning, Truncating, and Exact Deduplicating:   0%|          | 0/7000 [00:00<?, ?it/s]

Finished preprocessing. Exact duplicates found and skipped: 1
Filtered dataset size after exact deduplication: 6855


In [7]:
prepared_dataset_for_gemini

Dataset({
    features: ['file', 'processed_text', 'processed_text_tokens'],
    num_rows: 6855
})

In [12]:
genai.configure(api_key=GEMINI_API_KEY)
model_gemini = genai.GenerativeModel(GEMINI_MODEL_NAME_PRIMARY)

Successfully configured Gemini with model: gemini-2.0-flash


In [13]:
common.generate_summaries_resumable_full_stats(prepared_dataset_for_gemini, model_gemini, tokenizer, PROCESSED_INDICES_FILE, OUTPUT_JSONL_FILE)

Loaded 6490 already processed indices.
Attempting to process 365 items in this run.


Generating Summaries & Full Stats:   0%|          | 0/365 [00:00<?, ?it/s]


--- Generation Finished for this Run ---
Processed: 365, Total Time: 829.36s, Avg/Item: 2.27s

Source Stats:
  Chars: Avg=2546.44, Min=575, Max=5350
  Words: Avg=376.08, Min=86, Max=818
  Tokens: Avg=516.77, Min=111, Max=998

All Summary Candidates (1095 non-empty):
  Chars:
    Avg=815.61, Min=339, Max=1274
    Comp.: Avg=0.354, Min=0.137, Max=0.861
    25% Quantile: Abs=714.00, Comp.=0.280
    50% Quantile: Abs=824.00, Comp.=0.335
    75% Quantile: Abs=919.00, Comp.=0.406
  Words:
    Avg=120.20, Min=51, Max=184
    Comp.: Avg=0.356, Min=0.132, Max=0.891
    25% Quantile: Abs=104.00, Comp.=0.280
    50% Quantile: Abs=120.00, Comp.=0.338
    75% Quantile: Abs=137.00, Comp.=0.411
  Tokens:
    Avg=172.87, Min=71, Max=274
    Comp.: Avg=0.370, Min=0.135, Max=0.842
    25% Quantile: Abs=148.00, Comp.=0.293
    50% Quantile: Abs=173.00, Comp.=0.352
    75% Quantile: Abs=198.00, Comp.=0.429

Items with no summaries: []
Success distribution:
  3 summaries: 365 items


In [14]:
processed_indices = []
if os.path.exists(PROCESSED_INDICES_FILE): 
    with open(PROCESSED_INDICES_FILE) as f:
        processed_indices = [int(line.strip()) for line in f if line.strip()]
len(processed_indices) == len(set(processed_indices)), len(processed_indices)

(True, 6855)

In [15]:
final_dataset = common.create_final_huggingface_dataset_from_jsonl(OUTPUT_JSONL_FILE, FINAL_HF_DATASET_PATH)
final_dataset

Created Hugging Face dataset with 6855 examples from JSONL.
Saving Hugging Face dataset to nplus1_gemini...


Saving the dataset (0/1 shards):   0%|          | 0/6855 [00:00<?, ? examples/s]

Hugging Face dataset saved.


Dataset({
    features: ['file', 'text', 'summaries'],
    num_rows: 6855
})