In [None]:
# Install pinned versions of the Hugging Face libraries (IPython shell escape).
# NOTE(review): the recorded cell output shows load_dataset rejecting
# `trust_remote_code` / dataset scripts — verify that the versions printed
# below actually match these pins (a kernel restart may be needed after
# installing) — TODO confirm.
!pip install "datasets==3.6.0" "huggingface_hub<0.26"
import datasets, huggingface_hub
# Print the versions that were actually imported into this kernel.
print("datasets version:", datasets.__version__)
print("hf_hub version:", huggingface_hub.__version__)


import os
from datasets import load_dataset, Dataset
from tqdm import tqdm
from huggingface_hub import login
import math

# ================================================================
# 1. CONFIGURATION
# ================================================================
# SECURITY: never hard-code an access token in a notebook or source file.
# The token is read from the HF_TOKEN environment variable instead. Any token
# that was previously committed here must be treated as leaked and revoked
# in the Hugging Face account settings.
MY_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository the filtered dataset is pushed to.
MY_HF_REPO = "tiendung6b/ocr2-filtered-200k"
# Maximum number of samples to collect; None disables the limit.
NUM_SAMPLES = 200000

# When HF_TOKEN is unset, MY_TOKEN is None and login() falls back to its
# interactive prompt instead of using a credential embedded in the file.
login(token=MY_TOKEN)

# ================================================================
# 2. PREPARE REFERENCE DATASETS
# ================================================================
print(">>> Đang load các dataset tham chiếu vào RAM (cần 1 chút thời gian)...")
# Each entry: (lookup key matched against the stream's "dataset" field,
# Hub repository id, extra keyword arguments for load_dataset).
# The datasets are loaded in the order listed.
hf_datasets = {
    key: load_dataset(repo_id, **extra_kwargs)
    for key, repo_id, extra_kwargs in (
        ("taco", "BAAI/TACO", {"trust_remote_code": True}),
        ("apps", "codeparrot/apps", {"trust_remote_code": True}),
        ("code_contests", "deepmind/code_contests", {}),
        ("open-r1/codeforces", "open-r1/codeforces", {}),
    )
}
print(">>> Đã load xong datasets tham chiếu.")


def get_question(ds_name, split, index):
    """Look up the problem statement for a record of a reference dataset.

    Args:
        ds_name: Key into the module-level ``hf_datasets`` mapping.
        split: Split name inside that dataset.
        index: Row index inside the split; converted with ``int()``.

    Returns:
        The problem statement, or ``None`` when the dataset is unknown, the
        description is empty, or any error occurs during the lookup. For
        "open-r1/codeforces" the statement is assembled from the description,
        input/output format, all examples and the note.
    """
    try:
        if ds_name not in hf_datasets:
            return None

        record = hf_datasets[ds_name][split][int(index)]

        if ds_name == "code_contests":
            return record["description"] or None

        if ds_name in ["taco", "apps"]:
            return record["question"]

        if ds_name != "open-r1/codeforces":
            return None

        # Codeforces: description + input format + output format + examples + note.
        text = record["description"]
        if not text:
            return None

        for field, heading in (
            ("input_format", "\n\nInput\n\n"),
            ("output_format", "\n\nOutput\n\n"),
        ):
            if record[field]:
                text += heading + record[field]

        # Append every sample input/output pair, skipping missing halves.
        samples = record.get("examples") or []
        if samples:
            text += "\n\nExamples"
            for sample in samples:
                sample_in = sample.get("input")
                sample_out = sample.get("output")
                if sample_in is not None:
                    text += "\n\nInput\n\n" + sample_in
                if sample_out is not None:
                    text += "\n\nOutput\n\n" + sample_out

        if record["note"]:
            text += "\n\nNote\n\n" + record["note"]

        return text

    except Exception:
        # Best-effort lookup: any failure is reported as "no question found".
        return None


def safe_float(x):
    """Convert ``x`` to ``float``, returning NaN when the conversion fails."""
    try:
        converted = float(x)
    except Exception:
        # Any conversion failure is mapped to NaN.
        converted = float("nan")
    return converted


# ================================================================
# 3. PROCESSING (WITH A SAFETY MECHANISM)
# ================================================================
print(">>> Đang stream OpenCodeReasoning-2. Bấm Stop (Ctrl+C) bất cứ lúc nào để lưu kết quả hiện tại.")
ds_stream = load_dataset("nvidia/OpenCodeReasoning-2", "train", split="cpp", streaming=True)

processed_samples = []
count = 0

try:
    for ex in tqdm(ds_stream):
        pass_rate = safe_float(ex.get("pass_rate", 0))
        # Skip unparsable pass rates and negative ones (e.g. -1).
        if math.isnan(pass_rate) or pass_rate < 0:
            continue

        judgement = str(ex.get("judgement", "")).lower()

        # Group 1: pass rate above 0.6.
        is_high_pass = pass_rate > 0.6
        # Group 2: pass rate below 0.3 and judged "wrong".
        is_hard_wrong = (pass_rate < 0.3) and (judgement == "wrong")

        if not is_high_pass and not is_hard_wrong:
            continue

        # --- Map the sample to its problem statement ---
        if ex["dataset"] not in hf_datasets:
            continue

        question_text = get_question(ex["dataset"], ex["split"], ex["index"])
        if question_text:
            tag = "high_pass_rate" if is_high_pass else "hard_wrong"
            enriched = ex.copy()
            enriched["question"] = question_text
            enriched["filter_tag"] = tag

            processed_samples.append(enriched)
            count += 1

        limit_reached = NUM_SAMPLES is not None and count >= NUM_SAMPLES
        if limit_reached:
            print(f">>> Đã đạt giới hạn {NUM_SAMPLES} mẫu.")
            break

except KeyboardInterrupt:
    # Manual stop: fall through to the save step with what was collected.
    print("\n>>> Bạn đã bấm dừng chương trình (hoặc ngắt kết nối)!")
    print(">>> Đang tiến hành lưu những dữ liệu đã thu thập được...")

except Exception as stream_error:
    # Any other failure: report it and still try to save the partial result.
    print(f"\n>>> Có lỗi xảy ra: {stream_error}")
    print(">>> Đang cố gắng cứu dữ liệu đã xử lý...")

# ================================================================
# 4. SAVE RESULTS
# ================================================================
finally:
    if not processed_samples:
        print(">>> Chưa thu thập được mẫu nào.")
    else:
        print(f">>> Tổng cộng thu thập được: {len(processed_samples)} mẫu.")
        print(f">>> Đang đẩy lên Hub: {MY_HF_REPO}...")

        try:
            new_dataset = Dataset.from_list(processed_samples)
            new_dataset.push_to_hub(MY_HF_REPO, private=False)
            print(f">>> THÀNH CÔNG! Dataset tại: https://huggingface.co/datasets/{MY_HF_REPO}")
        except Exception as upload_error:
            print(f">>> Lỗi khi upload: {upload_error}")


  from .autonotebook import tqdm as notebook_tqdm
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'BAAI/TACO' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


RuntimeError: Dataset scripts are no longer supported, but found TACO.py