In [None]:
#!/usr/bin/env python3
"""
merge_high_quality_sft.py
Download, filter (min length), merge and upload several SFT datasets.
"""

import os
import json
from typing import List, Dict, Any
import datasets as ds
from datasets import Features, Sequence, Value
from transformers import AutoTokenizer

# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
# Hub access token read from the environment; the upload cell passes it to
# push_to_hub(), hence the write-scope requirement.
HF_TOKEN    = os.getenv("HF_TOKEN")          # Hugging Face token (write scope)
# NOTE(review): no cell visible in this notebook reads OUTPUT_REPO — the upload
# cell pushes to a hard-coded repo id. Confirm which one is intended.
OUTPUT_REPO = "YOUR_HF_USERNAME/merged-sft-mix"
# Threshold applied by load_and_filter(): samples whose prompt + response
# tokenise to fewer than this many tokens are dropped.
MIN_TOKENS  = 64                             # min prompt+answer tokens
# Tokenizer used for length counting and for chat-template rendering.
# NOTE(review): this Hub repo appears to be access-gated — confirm the token
# in use has been granted access before relying on it.
TOKENIZER   = "meta-llama/Llama-3.1-8B-Instruct"  # fast, permissive tokenizer

# Short name -> Hub repo id. load_and_filter() looks repos up by these keys.
# The trailing notes are the author's own observations about each source.
DATASET_SPECS = {
    "tulu3":                    "allenai/tulu-3-sft-mixture", # 939,343 rows 1.41GB
    "hermes3":                  "NousResearch/Hermes-3-Dataset",
    "perfectblend":             "mlabonne/open-perfectblend",
    "acereason":                "nvidia/AceReason-1.1-SFT", # 3,958,018 rows 2,668,741 math / 1,301,591 code 2.19GB - different format
    "moaa":                     "togethercomputer/gemma-2-9b-it-MoAA-DPO",  # DPO pairs → use chosen
    "orca_agentinstruct":       "microsoft/orca-agentinstruct-1M-v1", # same format as tulu3
    "o1open":                   "O1-OPEN/OpenO1-SFT", # different format
    "infinityinstruct":          "BAAI/Infinity-Instruct", # huge 25mn rows
    "megascience":               "MegaScience/MegaScience", # 1,253,230 rows 1.88GB - has reference answers that can be used by LLM to filter out bad answers
}

In [None]:
# Load every source as a single `Dataset` (not a `DatasetDict`) so that len(),
# indexing and .map() behave the same way for all of them further down.
tulu3 = ds.load_dataset("allenai/tulu-3-sft-mixture", split="train")
hermes3 = ds.load_dataset("NousResearch/Hermes-3-Dataset", split="train")
perfectblend = ds.load_dataset("mlabonne/open-perfectblend", split="train")
acereason = ds.load_dataset("nvidia/AceReason-1.1-SFT", split="train")
# MoAA (DPO pairs) is currently left out of the mix:
#moaa = ds.load_dataset("togethercomputer/gemma-2-9b-it-MoAA-DPO", split="train")

# orca-agentinstruct has no "train" split; it is published as one split per task:
# ['creative_content', 'text_modification', 'struct2text_flow', 'rc', 'rag', 'text_extraction', 'mcq', 'follow_up', 'analytical_reasoning', 'fermi', 'fs_cot_flow', 'code_', 'brain_teaser', 'text_classification', 'open_domain_qa']
orca_ai_code = ds.load_dataset("microsoft/orca-agentinstruct-1M-v1", split="code_")
orca_ai_textmod = ds.load_dataset("microsoft/orca-agentinstruct-1M-v1", split="text_modification")
orca_ai_textext = ds.load_dataset("microsoft/orca-agentinstruct-1M-v1", split="text_extraction")
orca_ai_analytical = ds.load_dataset("microsoft/orca-agentinstruct-1M-v1", split="analytical_reasoning")

o1open = ds.load_dataset("O1-OPEN/OpenO1-SFT", split="train")
# "7M" is the config (subset) name. Without split= this call returns a
# DatasetDict, and len() on it counts splits rather than rows.
infinityinstruct = ds.load_dataset("BAAI/Infinity-Instruct", "7M", split="train")
megascience = ds.load_dataset("MegaScience/MegaScience", split="train")


Resolving data files:   0%|          | 0/189 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/182 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/75 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/75 [00:00<?, ?files/s]

train-00000-of-00075.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

train-00001-of-00075.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

train-00002-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00003-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00004-of-00075.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

train-00005-of-00075.parquet:   0%|          | 0.00/81.5M [00:00<?, ?B/s]

train-00006-of-00075.parquet:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

train-00007-of-00075.parquet:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

train-00008-of-00075.parquet:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

train-00009-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00010-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00011-of-00075.parquet:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

train-00012-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00013-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00014-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00015-of-00075.parquet:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

train-00016-of-00075.parquet:   0%|          | 0.00/81.6M [00:00<?, ?B/s]

train-00017-of-00075.parquet:   0%|          | 0.00/80.7M [00:00<?, ?B/s]

train-00018-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00019-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00020-of-00075.parquet:   0%|          | 0.00/82.1M [00:00<?, ?B/s]

train-00021-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00022-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00023-of-00075.parquet:   0%|          | 0.00/82.1M [00:00<?, ?B/s]

train-00024-of-00075.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

train-00025-of-00075.parquet:   0%|          | 0.00/81.9M [00:00<?, ?B/s]

train-00026-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00027-of-00075.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

train-00028-of-00075.parquet:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

train-00029-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00030-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00031-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00032-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00033-of-00075.parquet:   0%|          | 0.00/80.6M [00:00<?, ?B/s]

train-00034-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00035-of-00075.parquet:   0%|          | 0.00/81.6M [00:00<?, ?B/s]

train-00036-of-00075.parquet:   0%|          | 0.00/81.6M [00:00<?, ?B/s]

train-00037-of-00075.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

train-00038-of-00075.parquet:   0%|          | 0.00/81.5M [00:00<?, ?B/s]

train-00039-of-00075.parquet:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

train-00040-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00041-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00042-of-00075.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

train-00043-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00044-of-00075.parquet:   0%|          | 0.00/82.2M [00:00<?, ?B/s]

train-00045-of-00075.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

train-00046-of-00075.parquet:   0%|          | 0.00/80.3M [00:00<?, ?B/s]

train-00047-of-00075.parquet:   0%|          | 0.00/80.7M [00:00<?, ?B/s]

train-00048-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00049-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00050-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00051-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00052-of-00075.parquet:   0%|          | 0.00/80.3M [00:00<?, ?B/s]

train-00053-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00054-of-00075.parquet:   0%|          | 0.00/81.8M [00:00<?, ?B/s]

train-00055-of-00075.parquet:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

train-00056-of-00075.parquet:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

train-00057-of-00075.parquet:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

train-00058-of-00075.parquet:   0%|          | 0.00/80.7M [00:00<?, ?B/s]

train-00059-of-00075.parquet:   0%|          | 0.00/80.7M [00:00<?, ?B/s]

train-00060-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00061-of-00075.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

train-00062-of-00075.parquet:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

train-00063-of-00075.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

train-00064-of-00075.parquet:   0%|          | 0.00/80.9M [00:00<?, ?B/s]

train-00065-of-00075.parquet:   0%|          | 0.00/81.5M [00:00<?, ?B/s]

train-00066-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00067-of-00075.parquet:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

train-00068-of-00075.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

train-00069-of-00075.parquet:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

train-00070-of-00075.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

train-00071-of-00075.parquet:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

train-00072-of-00075.parquet:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

train-00073-of-00075.parquet:   0%|          | 0.00/82.4M [00:00<?, ?B/s]

train-00074-of-00075.parquet:   0%|          | 0.00/39.7M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1253230 [00:00<?, ? examples/s]

In [None]:
len(tulu3), len(hermes3), len(perfectblend), len(acereason), len(orca_ai_code), len(orca_ai_textmod), len(orca_ai_textext), len(orca_ai_analytical), len(o1open), len(infinityinstruct), len(megascience)

(939343, 3970332, 100000, 50000, 50000, 25000, 77685)

In [None]:
#sample = tulu3[0]['messages']
sample = hermes3[0]
print(json.dumps(sample, indent=2, sort_keys=True))

[
  {
    "content": "Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to expose an application to internet.",
    "role": "user"
  },
  {
    "content": "Sure, here's an example Terraform HCL code that creates an AWS Autoscaling Group and an Application Load Balancer to expose an application to the internet:\n``` \n# Configure the AWS provider\nprovider \"aws\" {\n  region = \"us-east-1\"\n}\n\n# Create a security group to allow traffic to the ALB\nresource \"aws_security_group\" \"alb_sg\" {\n  name_prefix = \"alb_sg\"\n  ingress {\n    from_port = 80\n    to_port = 80\n    protocol = \"tcp\"\n    cidr_blocks = [\"0.0.0.0/0\"]\n  }\n}\n\n# Create an ALB and target group\nresource \"aws_lb\" \"alb\" {\n  name               = \"example-alb\"\n  internal           = false\n  load_balancer_type = \"application\"\n\n  subnets = [\"subnet-12345678\", \"subnet-87654321\"]\n\n  security_groups = [aws_security_group.alb_sg.id]\n\n  tags = {\n   

In [54]:
#acereason[0]

In [56]:
type(acereason), type(acereason[:10])

(datasets.arrow_dataset.Dataset, dict)

In [62]:
#acereason = ds.load_dataset("nvidia/AceReason-1.1-SFT", split="train")

# ------------- define transform --------------------------------------------
def build_messages(example: Dict[str, Any]) -> Dict[str, List[Dict[str, str]]]:
    """Convert one (input, output) row into a two-turn chat conversation.

    Args:
        example: A row with at least the keys ``"input"`` (the user prompt)
            and ``"output"`` (the assistant answer). Other keys are ignored.

    Returns:
        A dict holding a single ``"messages"`` column in the layout chat
        models expect::

            [{"role": "user", "content": ...},
             {"role": "assistant", "content": ...}]

    Raises:
        KeyError: If ``"input"`` or ``"output"`` is missing from ``example``.
    """
    return {
        "messages": [
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]},
        ]
    }

# Swap the raw input/output columns for a single chat-style `messages` column.
acereason = acereason.map(
    function=build_messages,
    desc="build messages",
    load_from_cache_file=False,   # recompute instead of reusing a cached result
    remove_columns=["input", "output"],
)

# Sanity check: dataset summary first, then the first converted conversation.
print(acereason, acereason[0]["messages"], sep="\n")

build messages:   0%|          | 0/3970332 [00:00<?, ? examples/s]

Dataset({
    features: ['category', 'source', 'messages'],
    num_rows: 3970332
})
[{'content': 'For sets $A$, $B$, and $C$, where $n(A)$, $n(B)$, and $n(C)$ denote the number of subsets of $A$, $B$, and $C$ respectively, and $\\left | A \\right | = \\left | B \\right | = 100$, if $n(A) + n(B) + n(C) = n(A \\cap B \\cap C)$, what is the minimum possible value of $\\left | A \\cap B \\cap C \\right |$?', 'role': 'user'}, {'content': '<think>\nOkay, let\'s try to tackle this problem. Hmm, so we have three sets A, B, and C. The problem states that n(A), n(B), and n(C) are the number of subsets of each set. Wait, but usually, the number of subsets of a set with size k is 2^k. So maybe n(A) is 2^|A|, n(B) is 2^|B|, and similarly for n(C). Let me check: the problem says "n(A), n(B), and n(C) denote the number of subsets of A, B, and C respectively." Yep, so that means n(A) = 2^{|A|}, n(B) = 2^{|B|}, n(C) = 2^{|C|}. \n\nGiven that |A| = |B| = 100. So n(A) = 2^100, n(B) = 2^100. The equation

In [None]:
dict(o1open[0]).keys()

dict_keys(['instruction', 'output'])

In [None]:
dict(o1open[0]).keys()

dict_keys(['instruction', 'output'])

In [43]:
tulu3[0]

{'id': 'oasst1_5921',
 'messages': [{'content': 'Create a snippet of Terraform HCL code that create an AWS autoscaling group, and an ALB in front to expose an application to internet.',
   'role': 'user'},
  {'content': 'Sure, here\'s an example Terraform HCL code that creates an AWS Autoscaling Group and an Application Load Balancer to expose an application to the internet:\n``` \n# Configure the AWS provider\nprovider "aws" {\n  region = "us-east-1"\n}\n\n# Create a security group to allow traffic to the ALB\nresource "aws_security_group" "alb_sg" {\n  name_prefix = "alb_sg"\n  ingress {\n    from_port = 80\n    to_port = 80\n    protocol = "tcp"\n    cidr_blocks = ["0.0.0.0/0"]\n  }\n}\n\n# Create an ALB and target group\nresource "aws_lb" "alb" {\n  name               = "example-alb"\n  internal           = false\n  load_balancer_type = "application"\n\n  subnets = ["subnet-12345678", "subnet-87654321"]\n\n  security_groups = [aws_security_group.alb_sg.id]\n\n  tags = {\n    Enviro

In [42]:
acereason[0]

{'category': 'math',
 'source': 'OpenMathReasoning',
 'input': 'For sets $A$, $B$, and $C$, where $n(A)$, $n(B)$, and $n(C)$ denote the number of subsets of $A$, $B$, and $C$ respectively, and $\\left | A \\right | = \\left | B \\right | = 100$, if $n(A) + n(B) + n(C) = n(A \\cap B \\cap C)$, what is the minimum possible value of $\\left | A \\cap B \\cap C \\right |$?',
 'output': '<think>\nOkay, let\'s try to tackle this problem. Hmm, so we have three sets A, B, and C. The problem states that n(A), n(B), and n(C) are the number of subsets of each set. Wait, but usually, the number of subsets of a set with size k is 2^k. So maybe n(A) is 2^|A|, n(B) is 2^|B|, and similarly for n(C). Let me check: the problem says "n(A), n(B), and n(C) denote the number of subsets of A, B, and C respectively." Yep, so that means n(A) = 2^{|A|}, n(B) = 2^{|B|}, n(C) = 2^{|C|}. \n\nGiven that |A| = |B| = 100. So n(A) = 2^100, n(B) = 2^100. The equation given is n(A) + n(B) + n(C) = n(A ∩ B ∩ C). So that 

In [None]:

# ------------------------------------------------------------------
# UTILITIES
# ------------------------------------------------------------------
# Shared tokenizer, loaded once; used for length counting and chat templating.
tok = AutoTokenizer.from_pretrained(TOKENIZER, use_fast=True)


def token_len(text: str) -> int:
    """Return how many token ids the shared tokenizer produces for *text*."""
    token_ids = tok.encode(text)
    return len(token_ids)

def load_and_filter(name: str, split: str = "train") -> ds.Dataset:
    """Load one dataset from DATASET_SPECS, normalise it, and drop short samples.

    Every sample is reduced to two string columns, ``prompt`` and
    ``response``; all other columns are discarded. Samples whose combined
    prompt + response is shorter than MIN_TOKENS tokens (as measured by
    ``token_len``) are then removed.

    Datasets that store a prompt/response pair are read from the columns
    listed in ``source_columns`` below. Datasets that instead carry a
    ``messages`` column (chat turns, possibly JSON-encoded) are split into
    "everything before the last turn" (rendered with the tokenizer's chat
    template) and "content of the last turn".

    Args:
        name: Key into DATASET_SPECS.
        split: Name of the split to load.

    Returns:
        The filtered dataset, with exactly the columns ``prompt`` and
        ``response``.

    Raises:
        KeyError: If ``name`` is not in DATASET_SPECS, or if the loaded
            dataset has neither the expected prompt/response columns nor a
            ``messages`` column.
    """
    print(f"📥 Loading {name} …")
    d = ds.load_dataset(DATASET_SPECS[name], split=split)

    # Where each dataset keeps its prompt / response. Reading straight from
    # the source columns avoids rename_column(), which raises when the target
    # name already exists (e.g. renaming "prompt" to "prompt").
    source_columns = {
        "rewild": ("prompt", "completion"),
        # NOTE(review): carried over unchanged from the previous version; the
        # perfectblend schema is not shown anywhere in this notebook — confirm.
        "perfectblend": ("instruction", "output"),
        "acereason": ("input", "output"),
        "o1open": ("instruction", "output"),
        # MoAA is stored as DPO pairs → use the "chosen" field
        "moaa": ("prompt", "chosen"),
    }
    prompt_col, response_col = source_columns.get(name, ("prompt", "response"))

    columns = d.column_names
    has_pair = prompt_col in columns and response_col in columns
    if not has_pair and "messages" not in columns:
        raise KeyError(
            f"{name}: expected columns ({prompt_col!r}, {response_col!r}) "
            f"or 'messages', found {columns}"
        )

    def render(turns: List[Dict[str, Any]]) -> str:
        """Flatten a list of chat turns into one string via the chat template."""
        return tok.apply_chat_template(turns, tokenize=False)

    def stringify(example: Dict[str, Any]) -> Dict[str, Any]:
        if has_pair:
            p, r = example[prompt_col], example[response_col]
            # If prompt/response are lists of turns, concat into single strings
            if isinstance(p, list):
                p = render(p)
            if isinstance(r, list):
                r = render(r)
        else:
            turns = example["messages"]
            if isinstance(turns, str):  # some sources store the turns as JSON text
                turns = json.loads(turns)
            turns = turns or []
            # Final turn is the response; everything before it is the prompt.
            p = render(turns[:-1]) if len(turns) > 1 else ""
            r = (turns[-1].get("content") or "") if turns else ""
        return {"prompt": str(p), "response": str(r)}

    d = d.map(stringify, remove_columns=d.column_names)

    # Length filter
    def long_enough(ex):
        return token_len(ex["prompt"] + ex["response"]) >= MIN_TOKENS

    total = len(d)  # capture before filtering so the report shows kept / total
    d = d.filter(long_enough, num_proc=os.cpu_count())
    print(f"✅ {name}: kept {len(d):,} / {total:,} samples")
    return d


In [None]:
# Run the load → normalise → length-filter pipeline for every configured
# source, collecting the results in DATASET_SPECS order.
datasets: List[ds.Dataset] = []
datasets.extend(load_and_filter(spec_name) for spec_name in DATASET_SPECS)

In [38]:
import json
from datasets import Dataset    # only for the type hints

def fix(example):
    """Normalise the ``messages`` column of one row.

    1. If ``messages`` is a JSON string, decode it into a Python object
       (normally a list of ``{"role": ..., "content": ...}`` dicts).
    2. Drop system turns whose content is empty, whitespace-only, or None,
       e.g. ``{"role": "system", "content": ""}``.

    Args:
        example: A row containing a ``"messages"`` entry.

    Returns:
        ``{"messages": <cleaned value>}`` — the column to overwrite. Values
        that are not a list after decoding are passed through unchanged.

    Raises:
        KeyError: If ``example`` has no ``"messages"`` key.
        json.JSONDecodeError: If ``messages`` is a string that is not valid JSON.
    """
    msgs = example["messages"]

    # 1) Convert JSON string ➜ Python
    if isinstance(msgs, str):
        msgs = json.loads(msgs)

    # 2) Filter out empty system prompts
    if isinstance(msgs, list):        # normal case
        msgs = [
            m for m in msgs
            # `or ""` covers an explicit None content, which .get()'s default
            # does not (the key exists, so the default is never used).
            if not (m.get("role") == "system" and (m.get("content") or "").strip() == "")
        ]

    # Return a dict with the column(s) to update
    return {"messages": msgs}

# Sources that already carry a `messages` column and only need clean-up.
datasets = [
    tulu3,
    orca_ai_code,
    orca_ai_textmod,
    orca_ai_textext,
    orca_ai_analytical,
]

# Apply fix() row by row to each source (pass num_proc=4 to .map() for
# multiprocessing). The loop variable is deliberately not called `ds`, so it
# cannot be confused with the `datasets` module alias.
new_datasets = [
    dataset.map(fix, desc="Converting messages")
    for dataset in datasets
]

Converting messages:   0%|          | 0/939343 [00:00<?, ? examples/s]

Converting messages:   0%|          | 0/100000 [00:00<?, ? examples/s]

Converting messages:   0%|          | 0/50000 [00:00<?, ? examples/s]

Converting messages:   0%|          | 0/50000 [00:00<?, ? examples/s]

Converting messages:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [39]:
new_datasets[1][0]['messages']

  'role': 'user'},
 {'content': 'Output:\n\nIn the given scenario, the `find_path` command is used to search for the `mylib.h` header file. Since `mylib.h` is not found, `MYLIB_INCLUDE_DIRS` will not be set, and this will be a critical failure for the `find_package_handle_standard_args` function, which expects `MYLIB_INCLUDE_DIRS` to be set to a valid path.\n\nThe `find_library` command is used to locate the `mylib` library and its extra component. The library names are determined based on the platform. For Windows, it looks for `mylib_win`, for Unix-like systems, it looks for `mylib_unix` or `mylib_unix_alt`, and for other platforms, it defaults to `mylib`. Since the libraries are found in non-standard locations, `MYLIB_LIBRARIES` and `MYLIB_EXTRA_LIBRARIES` will be set to those locations.\n\nThe `find_package_handle_standard_args` function is called to handle the results of the find commands. It checks if the required variables (`MYLIB_LIBRARIES`, `MYLIB_INCLUDE_DIRS`, `MYLIB_EXTRA_L

In [None]:
dict(o1open[0]).keys()

dict_keys(['instruction', 'output'])

In [None]:
dict(o1open[0]).keys()

dict_keys(['instruction', 'output'])

In [32]:
datasets[1][0]['messages']



In [None]:
# Shuffle reproducibly, then keep the first million rows as a
# messages-only subset.
acereason = acereason.shuffle(seed=42)
ace1m = acereason.select_columns(["messages"]).select(range(1_000_000))
print(ace1m.num_rows)
ace1m[0]

#ace1m.push_to_hub('voxmenthe/acereason-1m', private=False, token=HF_TOKEN)

In [73]:
#new_datasets.append(ace800k)
# Assemble the merge list explicitly instead of assigning into `new_datasets`
# by index. Writing to new_datasets[-1] only works if an extra element was
# appended in an earlier run; on a clean top-to-bottom run it silently
# overwrites the last orca split. Leaving `new_datasets` untouched also makes
# this cell safe to re-run.
tulu3_sample = (
    new_datasets[0]            # tulu3 after fix(), not the raw `tulu3`
    .shuffle(seed=42)
    .select(range(400_000))
)
orca_parts = new_datasets[1:5]  # code, text_modification, text_extraction, analytical_reasoning
acereason_sample = acereason.shuffle(seed=42).select(range(1_200_000))

to_merge = [tulu3_sample, *orca_parts, acereason_sample]

print("🔗 Concatenating …")
merged: ds.Dataset = ds.concatenate_datasets(to_merge)
print(f"📊 Total samples after merge: {len(merged):,}")

# TODO: add a provenance tag. Build it from the lengths of the datasets in
# `to_merge`, not by re-downloading every entry of DATASET_SPECS.

# Push to Hub
print("☁️ Uploading to Hugging Face Hub …")
merged.push_to_hub('voxmenthe/merged-sft-coding-mix2', private=False, token=HF_TOKEN)
print("🎉 Done!")

🔗 Concatenating …
📊 Total samples after merge: 1,825,000
☁️ Uploading to Hugging Face Hub …


Uploading the dataset shards:   0%|          | 0/70 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/440 [00:00<?, ?B/s]

🎉 Done!
