In [1]:
import os, math, json, random, pathlib
from typing import Iterable, Dict, Any, Tuple

# Input/output locations. Both input files must exist before the notebook runs.
DATA_DIR = "data"
TRN_PATH = os.path.join(DATA_DIR, "trn.json")
TST_PATH = os.path.join(DATA_DIR, "tst.json")  # source file for the test split
OUT_DIR = DATA_DIR

# Tunable parameters
SAMPLE_TRAIN = 50_000   # number of records sampled from trn for fast training (tune between 10k–100k)
VAL_RATIO    = 0.03     # 3% held out for validation
TEST_SAMPLE  = 5_000    # number of records sampled from tst (0 = convert everything)
SEED         = 42

# Prompt wrapped around each product title; `{title}` is filled in with
# str.format when the JSONL records are written.
PROMPT_TEMPLATE = (
    "Given a product title, generate its product description.\n"
    "Title: {title}\n"
    "Description:"
)

# Fail fast if either input file is missing, then make sure the output
# directory exists (no error if it is already there).
assert os.path.exists(TRN_PATH), f"Arquivo não encontrado: {TRN_PATH}"
assert os.path.exists(TST_PATH), f"Arquivo não encontrado: {TST_PATH}"
pathlib.Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
print("OK: paths prontos")


OK: paths prontos


In [3]:
import ijson

def detect_format(path: str) -> str:
    """Guess whether a file holds one JSON array or JSON Lines.

    Only the first 2048 characters are inspected: if the first
    non-whitespace character among them is ``[`` the file is treated as a
    JSON array, otherwise as JSONL.

    Args:
        path: Path of the UTF-8 text file to inspect.

    Returns:
        ``"array"`` or ``"jsonl"``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        prefix = handle.read(2048)
    if prefix.lstrip().startswith("["):
        return "array"
    return "jsonl"

def stream_array_items(json_path: str) -> Iterable[Dict[str, Any]]:
    """Lazily yield the elements of a top-level JSON array.

    The file is parsed incrementally with ``ijson`` so the whole array is
    never held in memory.

    Args:
        json_path: Path of a file whose top-level value is a JSON array.

    Yields:
        Each element of the array, in file order.
    """
    # ijson expects a binary file object; a text-mode handle only works via a
    # deprecated fallback that re-encodes every chunk back to UTF-8 bytes.
    with open(json_path, "rb") as f:
        # The "item" prefix selects each element of the top-level array.
        for obj in ijson.items(f, "item"):
            yield obj

def stream_jsonl_lines(jsonl_path: str) -> Iterable[Dict[str, Any]]:
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Parsing is best-effort: blank lines are ignored and lines that are not
    valid JSON are skipped. Skipped lines are counted, and once the file has
    been fully consumed a single ``RuntimeWarning`` reports how many were
    dropped, so data loss is never completely silent.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSON Lines file.

    Yields:
        The decoded value of each valid line, in file order.
    """
    import warnings  # local import keeps this block self-contained

    skipped = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Yield outside the try block so only decoding errors are caught.
            yield obj
    if skipped:
        warnings.warn(
            f"{skipped} linha(s) JSONL inválida(s) ignorada(s) em {jsonl_path}",
            RuntimeWarning,
            stacklevel=2,
        )

def iter_examples(path: str) -> Iterable[Dict[str, Any]]:
    """Return a lazy iterator over the records stored in ``path``.

    The file layout is detected first (at call time, not on first
    iteration); JSON-array files are streamed element by element and
    anything else is read as JSON Lines.

    Args:
        path: Path of the input file.

    Returns:
        An iterable of the decoded records.
    """
    if detect_format(path) == "array":
        return stream_array_items(path)
    return stream_jsonl_lines(path)

def reservoir_sample(iterator: Iterable[Dict[str, Any]], k: int, seed: int = 42) -> list:
    """Draw a uniform random sample of up to ``k`` valid records in one pass.

    A record is valid when it is a dict whose ``"title"`` and ``"content"``
    values are both truthy; everything else is skipped. Sampling uses
    reservoir sampling (Algorithm R): the first ``k`` valid records fill the
    reservoir, and the n-th valid record afterwards replaces a random slot
    with probability ``k / n``.

    Args:
        iterator: Stream of candidate records (consumed once).
        k: Maximum number of records to keep. ``k <= 0`` yields ``[]``.
        seed: Seed of the private random generator, for reproducibility.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts. It holds every
        valid record (in stream order) when fewer than ``k`` exist.
    """
    if k <= 0:
        return []

    rnd = random.Random(seed)
    sample = []
    # Count only VALID records. Using the raw stream position here would make
    # the replacement probability k / (valid + invalid) instead of k / valid,
    # biasing the sample toward early records whenever items are skipped.
    seen = 0
    for item in iterator:
        if not isinstance(item, dict):
            continue
        title = item.get("title")
        content = item.get("content")
        if not title or not content:
            continue
        seen += 1
        record = {"title": title, "content": content}
        if len(sample) < k:
            sample.append(record)
        else:
            # j is uniform on 1..seen, so the record is kept with
            # probability k / seen and lands in a uniformly chosen slot.
            j = rnd.randint(1, seen)
            if j <= k:
                sample[j - 1] = record
    return sample

def split_train_val(rows: list, val_ratio: float = 0.03, seed: int = 42):
    """Shuffle ``rows`` and split them into train and validation lists.

    The input list is not modified. The validation part holds
    ``floor(len(rows) * val_ratio)`` rows, but never fewer than one (when
    there is at least one row) and never more than ``len(rows)``.

    Args:
        rows: Records to split.
        val_ratio: Fraction of rows reserved for validation.
        seed: Seed of the private random generator used for shuffling.

    Returns:
        A ``(train_rows, val_rows)`` tuple that together contain every input
        row exactly once.
    """
    rnd = random.Random(seed)
    shuffled = rows.copy()
    rnd.shuffle(shuffled)
    n = len(shuffled)
    if n == 0:
        return [], []
    # Clamp to n: with val_ratio > 1 an unclamped n_val makes n_train
    # negative, and the negative slice bounds below would silently drop rows.
    n_val = min(n, max(1, math.floor(n * val_ratio)))
    n_train = n - n_val
    return shuffled[:n_train], shuffled[n_train:]

def write_jsonl_template(rows: Iterable[Dict[str, str]], out_path: str) -> int:
    """Write prompt/target pairs to ``out_path`` as JSON Lines.

    Each row becomes one line with two keys: ``"input_text"`` (the row's
    title substituted into ``PROMPT_TEMPLATE``) and ``"target_text"`` (the
    row's content converted to ``str``). Rows are consumed lazily, so a
    generator may be passed. Any existing file is overwritten.

    Args:
        rows: Records providing ``"title"`` and ``"content"`` keys.
        out_path: Destination file path (written as UTF-8).

    Returns:
        The number of lines written.
    """
    written = 0
    with open(out_path, "w", encoding="utf-8") as sink:
        for record in rows:
            title = record["title"]
            body = record["content"]
            payload = json.dumps(
                {
                    "input_text": PROMPT_TEMPLATE.format(title=title),
                    "target_text": str(body),
                },
                ensure_ascii=False,  # keep non-ASCII text readable in the file
            )
            sink.write(payload + "\n")
            written += 1
    return written


In [4]:
# Draw the training sample from the TRN file in a single streaming pass.
print("Amostrando do TRN...", TRN_PATH)
sample_rows = reservoir_sample(iter_examples(TRN_PATH), k=SAMPLE_TRAIN, seed=SEED)
print(f"Amostra obtida: {len(sample_rows)}")

# Shuffle the sample and split it into train / validation subsets.
train_rows, val_rows = split_train_val(sample_rows, val_ratio=VAL_RATIO, seed=SEED)
print(f"Split → train: {len(train_rows)} | val: {len(val_rows)}")

# Destination files for the two subsets.
train_path = os.path.join(OUT_DIR, "train.jsonl")
val_path   = os.path.join(OUT_DIR, "val.jsonl")

# Write both subsets as prompt/target JSONL; the return values are the
# number of records written to each file.
n_train = write_jsonl_template(train_rows, train_path)
n_val   = write_jsonl_template(val_rows,   val_path)
print(f"Salvo: {train_path} ({n_train}) | {val_path} ({n_val})")


Amostrando do TRN... data/trn.json
Amostra obtida: 50000
Split → train: 48500 | val: 1500
Salvo: data/train.jsonl (48500) | data/val.jsonl (1500)


In [5]:
test_path = os.path.join(OUT_DIR, "test.jsonl")
print("Gerando TEST a partir do TST...", TST_PATH)

if TEST_SAMPLE and TEST_SAMPLE > 0:
    # Bounded test set: keep a random sample of TEST_SAMPLE valid records.
    test_rows = reservoir_sample(iter_examples(TST_PATH), k=TEST_SAMPLE, seed=SEED)
    n_test = write_jsonl_template(test_rows, test_path)
else:
    # TEST_SAMPLE is 0/None: convert every valid record, streaming so the
    # whole file is never held in memory.
    def gen():
        """Lazily yield every valid title/content record from the TST file."""
        for obj in iter_examples(TST_PATH):
            # Same validity filter as reservoir_sample: non-dict items would
            # otherwise raise AttributeError on .get().
            if not isinstance(obj, dict):
                continue
            title = obj.get("title")
            content = obj.get("content")
            if title and content:
                yield {"title": title, "content": content}
    n_test = write_jsonl_template(gen(), test_path)

print(f"Salvo: {test_path} ({n_test})")


Gerando TEST a partir do TST... data/tst.json
Salvo: data/test.jsonl (5000)


In [6]:
# Sanity check: show the first two records of each generated file.
for name in ["train", "val", "test"]:
    p = os.path.join(OUT_DIR, f"{name}.jsonl")
    print("----", p)
    with open(p, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            preview = line.strip()
            # Append the ellipsis only when the line really was cut, so short
            # records are not shown as if they had been truncated.
            print(preview[:160] + "..." if len(preview) > 160 else preview)
            if i >= 1:
                break


---- data/train.jsonl
{"input_text": "Given a product title, generate its product description.\nTitle: GoFit Weightlifting Glove\nDescription:", "target_text": "GoFit Diamond-Tac Wei...
{"input_text": "Given a product title, generate its product description.\nTitle: The Bedford Reader with 2009 MLA Update\nDescription:", "target_text": "X. J. K...
---- data/val.jsonl
{"input_text": "Given a product title, generate its product description.\nTitle: Palmolive 04910 Dishwashing Liquid, 1 gallon Bottle\nDescription:", "target_tex...
{"input_text": "Given a product title, generate its product description.\nTitle: Exceptional Leadership: 16 Critical Competencies for Healthcare Executives\nDes...
---- data/test.jsonl
{"input_text": "Given a product title, generate its product description.\nTitle: Forgive Me, Father: A True Story of a Priest, a Nun, and Brutal Murder\nDescrip...
{"input_text": "Given a product title, generate its product description.\nTitle: Champion Sports 4 Wheel Umpire Indic