In [1]:
!pip install "datasets==3.6.0" "huggingface_hub<0.26"
# Import both libraries right after the install step and print the versions
# that were actually picked up, so the cell output records what ran.
import datasets, huggingface_hub
print("datasets version:", datasets.__version__)
print("hf_hub version:", huggingface_hub.__version__)


import os
from datasets import load_dataset, Dataset
from tqdm import tqdm
from huggingface_hub import login
import math

# ================================================================
# 1. CONFIGURATION
# ================================================================
# SECURITY: an access token must never be written into the notebook source.
# It is read from the HF_TOKEN environment variable instead. The token that
# used to be hard-coded here has been exposed and should be revoked.
MY_TOKEN = os.environ.get("HF_TOKEN")
# Hub repository the filtered dataset is pushed to.
MY_HF_REPO = "tiendung6b/ocr2-filtered-200k"
# Maximum number of samples to collect; None disables the limit.
NUM_SAMPLES = 200000

# When HF_TOKEN is unset MY_TOKEN is None and login() asks for the token
# interactively instead of failing.
login(token=MY_TOKEN)

# ================================================================
# 2. PREPARE THE REFERENCE DATASETS
# ================================================================
print(">>> Đang load các dataset tham chiếu vào RAM (cần 1 chút thời gian)...")
# (key used by get_question, Hub repository, extra load_dataset keyword args).
# The first two entries pass trust_remote_code=True, as in the original calls.
_REFERENCE_SOURCES = (
    ("taco", "BAAI/TACO", {"trust_remote_code": True}),
    ("apps", "codeparrot/apps", {"trust_remote_code": True}),
    ("code_contests", "deepmind/code_contests", {}),
    ("open-r1/codeforces", "open-r1/codeforces", {}),
)
# Loaded in the listed order; every split of each dataset is kept.
hf_datasets = {
    key: load_dataset(repo_id, **load_kwargs)
    for key, repo_id, load_kwargs in _REFERENCE_SOURCES
}
print(">>> Đã load xong datasets tham chiếu.")


def _format_codeforces_question(benchmark):
    """Assemble the full statement text from one open-r1/codeforces row.

    Returns None when the row has no description. Otherwise the sections
    (description, input format, output format, every example, note) are
    joined with blank lines; empty sections are left out.
    """
    if not benchmark["description"]:
        return None

    parts = [benchmark["description"]]

    if benchmark["input_format"]:
        parts += ["Input", benchmark["input_format"]]
    if benchmark["output_format"]:
        parts += ["Output", benchmark["output_format"]]

    # "examples" may be missing or None; treat both as "no examples".
    examples = benchmark.get("examples") or []
    if examples:
        parts.append("Examples")
        for example in examples:
            for label, key in (("Input", "input"), ("Output", "output")):
                value = example.get(key)
                if value is not None:
                    parts += [label, value]

    if benchmark["note"]:
        parts += ["Note", benchmark["note"]]

    # Joining once avoids rebuilding the string for every section.
    return "\n\n".join(parts)


def get_question(ds_name, split, index):
    """Look up the problem statement of one sample in a reference dataset.

    Args:
        ds_name: Key into the module-level ``hf_datasets`` mapping.
        split: Name of the split inside that dataset.
        index: Row position inside the split; anything ``int()`` accepts.

    Returns:
        The statement text, or ``None`` when the dataset name is unknown,
        the split/row cannot be found, the index is not a valid integer,
        or the row is malformed or has no description.
    """
    # Only lookup failures and malformed rows mean "no question available".
    # Any other exception is a genuine bug and is allowed to propagate.
    try:
        if ds_name not in hf_datasets:
            return None

        benchmark = hf_datasets[ds_name][split][int(index)]

        if ds_name == "code_contests":
            return benchmark["description"] or None

        if ds_name in ("taco", "apps"):
            return benchmark["question"]

        if ds_name == "open-r1/codeforces":
            return _format_codeforces_question(benchmark)

    except (KeyError, IndexError, TypeError, ValueError, AttributeError):
        return None

    # Dataset is loaded but has no formatting rule here.
    return None


def safe_float(x):
    """Convert ``x`` to ``float``; return NaN instead of raising on failure."""
    try:
        converted = float(x)
    except Exception:
        # Any conversion failure maps to NaN so callers can filter it out.
        converted = float("nan")
    return converted


# ================================================================
# 3. STREAM AND FILTER (WITH A SAFETY NET)
# ================================================================
print(">>> Đang stream OpenCodeReasoning-2. Bấm Stop (Ctrl+C) bất cứ lúc nào để lưu kết quả hiện tại.")
ds_stream = load_dataset("nvidia/OpenCodeReasoning-2", "train", split="cpp", streaming=True)

processed_samples = []
count = 0

try:
    for ex in tqdm(ds_stream):
        pass_rate = safe_float(ex.get("pass_rate", 0))
        # Drop samples whose pass rate is unparseable (NaN) or negative (e.g. -1).
        if math.isnan(pass_rate) or pass_rate < 0:
            continue

        judgement = str(ex.get("judgement", "")).lower()

        # Keep two groups only:
        #   - pass rate above 0.6                    -> "high_pass_rate"
        #   - pass rate below 0.3 and judged "wrong" -> "hard_wrong"
        if pass_rate > 0.6:
            filter_tag = "high_pass_rate"
        elif pass_rate < 0.3 and judgement == "wrong":
            filter_tag = "hard_wrong"
        else:
            continue

        # --- Map the sample back to its problem statement ---
        source_name = ex["dataset"]
        if source_name not in hf_datasets:
            continue

        question_text = get_question(source_name, ex["split"], ex["index"])
        if question_text:
            new_ex = ex.copy()
            new_ex.update(question=question_text, filter_tag=filter_tag)
            processed_samples.append(new_ex)
            count += 1

        # Checked for every sample that reaches this point, kept or not.
        if NUM_SAMPLES is not None and count >= NUM_SAMPLES:
            print(f">>> Đã đạt giới hạn {NUM_SAMPLES} mẫu.")
            break

except KeyboardInterrupt:
    # A manual stop is not an error: fall through and save what was collected.
    print("\n>>> Bạn đã bấm dừng chương trình (hoặc ngắt kết nối)!")
    print(">>> Đang tiến hành lưu những dữ liệu đã thu thập được...")

except Exception as stream_err:
    # Best effort: report the failure, then still try to save partial results.
    print(f"\n>>> Có lỗi xảy ra: {stream_err}")
    print(">>> Đang cố gắng cứu dữ liệu đã xử lý...")

# ================================================================
# 4. SAVE THE RESULT
# ================================================================
finally:
    if not processed_samples:
        print(">>> Chưa thu thập được mẫu nào.")
    else:
        print(f">>> Tổng cộng thu thập được: {len(processed_samples)} mẫu.")
        print(f">>> Đang đẩy lên Hub: {MY_HF_REPO}...")

        try:
            new_dataset = Dataset.from_list(processed_samples)
            new_dataset.push_to_hub(MY_HF_REPO, private=False)
            print(f">>> THÀNH CÔNG! Dataset tại: https://huggingface.co/datasets/{MY_HF_REPO}")
        except Exception as upload_err:
            print(f">>> Lỗi khi upload: {upload_err}")


Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub<0.26
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.6.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess<0.70.17 (from datasets==3.6.0)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.6/436.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8

README.md: 0.00B [00:00, ?B/s]

TACO.py: 0.00B [00:00, ?B/s]

data-00000-of-00009.arrow:   0%|          | 0.00/530M [00:00<?, ?B/s]

data-00001-of-00009.arrow:   0%|          | 0.00/613M [00:00<?, ?B/s]

data-00002-of-00009.arrow:   0%|          | 0.00/335M [00:00<?, ?B/s]

data-00003-of-00009.arrow:   0%|          | 0.00/403M [00:00<?, ?B/s]

data-00004-of-00009.arrow:   0%|          | 0.00/417M [00:00<?, ?B/s]

data-00005-of-00009.arrow:   0%|          | 0.00/503M [00:00<?, ?B/s]

data-00006-of-00009.arrow:   0%|          | 0.00/447M [00:00<?, ?B/s]

data-00007-of-00009.arrow:   0%|          | 0.00/526M [00:00<?, ?B/s]

data-00008-of-00009.arrow:   0%|          | 0.00/466M [00:00<?, ?B/s]

data-00000-of-00001.arrow:   0%|          | 0.00/496M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25443 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

apps.py: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/107M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0/39 [00:00<?, ?files/s]

(…)-00000-of-00039-e991a271dbfa9925.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

(…)-00001-of-00039-e092fe56fda18715.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

(…)-00002-of-00039-9cea23812e920e41.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

(…)-00003-of-00039-e3822fccad6e083a.parquet:   0%|          | 0.00/181M [00:00<?, ?B/s]

(…)-00004-of-00039-cefe355b4667b27e.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

(…)-00005-of-00039-b7580d2d846c2136.parquet:   0%|          | 0.00/174M [00:00<?, ?B/s]

(…)-00006-of-00039-65184bb9f7d61fde.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

(…)-00007-of-00039-05785de21e8b8429.parquet:   0%|          | 0.00/172M [00:00<?, ?B/s]

(…)-00008-of-00039-7246e6b7423b404f.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

(…)-00009-of-00039-b8c920f6629b57b2.parquet:   0%|          | 0.00/205M [00:00<?, ?B/s]

(…)-00010-of-00039-6de28ba20654f69b.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

(…)-00011-of-00039-5de236be5188959d.parquet:   0%|          | 0.00/164M [00:00<?, ?B/s]

(…)-00012-of-00039-da9476a39a1bdbb7.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

(…)-00013-of-00039-30b8c3829ee3b962.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

(…)-00014-of-00039-dc3ebb07a3cba8e4.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

(…)-00015-of-00039-19ccd7331d695677.parquet:   0%|          | 0.00/179M [00:00<?, ?B/s]

(…)-00016-of-00039-bf38b0908b322307.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

(…)-00017-of-00039-ae5533a2f822e6ef.parquet:   0%|          | 0.00/169M [00:00<?, ?B/s]

(…)-00018-of-00039-8c793837880f5507.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

(…)-00019-of-00039-d688fad5ee604390.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

(…)-00020-of-00039-5d59387098675b73.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

(…)-00021-of-00039-b257bf03d6876780.parquet:   0%|          | 0.00/181M [00:00<?, ?B/s]

(…)-00022-of-00039-1cfd39fa43c1917c.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

(…)-00023-of-00039-d078bcb55e45cbf0.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

(…)-00024-of-00039-f4e3da0e5661e6d1.parquet:   0%|          | 0.00/181M [00:00<?, ?B/s]

(…)-00025-of-00039-3f6ebfbaba5f4c70.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

(…)-00026-of-00039-7d4898300894cbbe.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

(…)-00027-of-00039-f8196766547533a2.parquet:   0%|          | 0.00/217M [00:00<?, ?B/s]

(…)-00028-of-00039-79a302af3c924863.parquet:   0%|          | 0.00/179M [00:00<?, ?B/s]

(…)-00029-of-00039-2b6615897d038115.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

(…)-00030-of-00039-4135cc54050afc22.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

(…)-00031-of-00039-40309dd907c042b7.parquet:   0%|          | 0.00/181M [00:00<?, ?B/s]

(…)-00032-of-00039-7b7d2068a3d9c359.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

(…)-00033-of-00039-53b0f749aacff9c1.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

(…)-00034-of-00039-a36ff0bff7d2a76f.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

(…)-00035-of-00039-d28f9be60314601f.parquet:   0%|          | 0.00/151M [00:00<?, ?B/s]

(…)-00036-of-00039-146e1a11c054aeab.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

(…)-00037-of-00039-995207c374a4e6f2.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

(…)-00038-of-00039-96a59dd6a98cd075.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

(…)-00000-of-00001-9c49eeff30aacaa8.parquet:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

(…)-00000-of-00001-5e672c5751f060d3.parquet:   0%|          | 0.00/51.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13328 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/165 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/117 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00011.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

train-00001-of-00011.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

train-00002-of-00011.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00003-of-00011.parquet:   0%|          | 0.00/67.5M [00:00<?, ?B/s]

train-00004-of-00011.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00005-of-00011.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

train-00006-of-00011.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

train-00007-of-00011.parquet:   0%|          | 0.00/281M [00:00<?, ?B/s]

train-00008-of-00011.parquet:   0%|          | 0.00/337M [00:00<?, ?B/s]

train-00009-of-00011.parquet:   0%|          | 0.00/111M [00:00<?, ?B/s]

train-00010-of-00011.parquet:   0%|          | 0.00/179M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/35.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9556 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/468 [00:00<?, ? examples/s]

>>> Đã load xong datasets tham chiếu.
>>> Đang stream OpenCodeReasoning-2. Bấm Stop (Ctrl+C) bất cứ lúc nào để lưu kết quả hiện tại.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/70 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/70 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

805504it [10:01, 1339.70it/s]


>>> Đã đạt giới hạn 200000 mẫu.
>>> Tổng cộng thu thập được: 200000 mẫu.
>>> Đang đẩy lên Hub: tiendung6b/ocr2-filtered-200k...


Uploading the dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

>>> THÀNH CÔNG! Dataset tại: https://huggingface.co/datasets/tiendung6b/ocr2-filtered-200k
