In [2]:
import os
import json
from datasets import load_dataset, load_from_disk, Dataset
from datasets.distributed import split_dataset_by_node
from transformers import AutoTokenizer
from glob import glob

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Qwen3-1.7B tokenizer and make sure it exposes a mask token; later
# cells repeat that token to mark the span the model should infill.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
if tokenizer.mask_token is None:
    # Add "<|mask|>" to the vocabulary as a special token ...
    tokenizer.add_tokens("<|mask|>", special_tokens=True)
    # ... and register it as the tokenizer's mask_token.
    # NOTE(review): replace_additional_special_tokens=False presumably keeps
    # the existing additional special tokens intact — confirm against the
    # transformers documentation.
    tokenizer.add_special_tokens(
        {"mask_token": "<|mask|>"}, replace_additional_special_tokens=False
    )

In [18]:
# Inspect which tokens the tokenizer uses for padding and end-of-sequence.
tokenizer.pad_token, tokenizer.eos_token

('<|endoftext|>', '<|im_end|>')

In [8]:
# Load the test split of the "HumanEval-RandomSpanInfillingLight" configuration
# of the loubnabnl/humaneval_infilling dataset.
dataset = load_dataset("loubnabnl/humaneval_infilling", name="HumanEval-RandomSpanInfillingLight", split="test")

In [6]:
# Record the token count of each canonical solution in a new
# "canonical_solution_length" column.
dataset = dataset.map(
    lambda example: {
        "canonical_solution_length": len(
            tokenizer(example["canonical_solution"])["input_ids"]
        )
    }
)

In [7]:
def assemble_query(x, min_infill_tokens=64, mask_token=None):
    """Build infilling queries of the form ``prompt + masks + suffix``.

    Designed for ``Dataset.map(..., batched=True)``: ``x`` is a batch whose
    columns are parallel lists.

    Args:
        x: Batch dict with lists under "prompt", "suffix" and
            "canonical_solution_length".
        min_infill_tokens: Lower bound on the number of mask tokens inserted
            per example; longer canonical solutions get one mask per token.
        mask_token: String repeated to form the masked span. When None, the
            notebook-level ``tokenizer.mask_token`` is used.

    Returns:
        ``{"query": [...]}`` with one assembled query string per example.

    Raises:
        ValueError: If no mask token is supplied and the tokenizer has none.
    """
    if mask_token is None:
        mask_token = tokenizer.mask_token
    if mask_token is None:
        # Fail with a clear message instead of the opaque TypeError that
        # ``None * int`` would raise further down.
        raise ValueError(
            "No mask token available: pass mask_token or register one on the tokenizer."
        )

    queries = [
        prompt + mask_token * max(min_infill_tokens, length) + suffix
        for prompt, suffix, length in zip(
            x["prompt"], x["suffix"], x["canonical_solution_length"]
        )
    ]
    return {"query": queries}

# Add the "query" column by applying assemble_query to the dataset in batches.
dataset = dataset.map(assemble_query, batched=True)

In [8]:
# Tokenize the assembled queries in batches. Every column that existed before
# tokenization is dropped, so only the tokenizer's outputs remain.
column_names = list(dataset.features.keys())
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch["query"]),
    batched=True,
    remove_columns=column_names,
)


Map: 100%|██████████| 164/164 [00:00<00:00, 2114.17 examples/s]


In [18]:
# Length (in tokens) of the longest tokenized query. Computed directly from
# the "input_ids" column instead of building a throwaway mapped dataset that
# only adds a "length" column.
max(len(input_ids) for input_ids in tokenized_dataset["input_ids"])

558

In [22]:
# Show the first value of the "query" column.
print(dataset["query"][0])


from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):<|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|><|mask|>
                if distance < threshold:
          

In [9]:
# Load the test split of the original HumanEval dataset and show its config name.
dataset_humaneval = load_dataset("openai/openai_humaneval", split="test")
dataset_humaneval.config_name

'openai_humaneval'

In [7]:
# Config name of whatever `dataset` currently refers to.
dataset.config_name

'HumanEval-RandomSpanInfillingLight'

In [20]:
# Names of the entries directly under the cleaned xgen data directory, taken
# as the last "/"-separated component of each glob match.
dataset_names = [
    path.rsplit("/", 1)[-1]
    for path in glob("/home/haolin.chen/sfr-text-diffusion-model-research/data/xgen_cleaned_data/*")
]
dataset_names

['DM_Mathematics',
 'Falcon-refinedweb',
 'Gutenberg',
 'RedPajama',
 'RedPajama_math',
 'Redpajama-Arxiv',
 'Wikipedia_en',
 'c4_2023-14',
 'cosmopedia_v2_parquet',
 'dclm-baseline-1.0-shuffled',
 'fineweb_edu_dedup',
 'open-web-math',
 'python_edu',
 'stackv2_Python_shuffled',
 'the-stack-v2-train-smol']

In [21]:
# Scan files to determine data types for each dataset.
#
# For every dataset directory, sample up to 10 entries, collect their file
# extensions, and record (extension, load_dataset builder name) in
# `dataset_types`. Datasets with no files or no supported extension are
# reported and skipped instead of raising or being dropped silently.

# Maps a file extension to the `load_dataset` builder used to read it.
loader_by_extension = {".jsonl": "json", ".json": "json", ".parquet": "parquet"}
dataset_types = {}

for dataset_name in dataset_names:
    dataset_path = f"/home/haolin.chen/sfr-text-diffusion-model-research/data/xgen_cleaned_data/{dataset_name}"
    files = glob(f"{dataset_path}/*")

    if not files:
        print(f"{dataset_name}: No files found")
        continue

    # Only regular files with a non-empty extension count; sorting makes the
    # chosen extension deterministic when a directory mixes several types.
    extensions = sorted(
        {
            os.path.splitext(file)[1]
            for file in files[:10]  # Check first 10 files
            if os.path.isfile(file)
        }
        - {""}
    )
    print(f"{dataset_name}: {extensions} ({len(files)} files)")

    supported = [ext for ext in extensions if ext in loader_by_extension]
    if not supported:
        # Covers both "no extension found" (previously an IndexError) and
        # "unknown extension" (previously skipped without any message).
        print(f"{dataset_name}: no supported file extension among {extensions}; skipping")
        continue

    extension = supported[0]
    dataset_types[dataset_name] = (extension, loader_by_extension[extension])

DM_Mathematics: ['.json'] (42 files)
Falcon-refinedweb: ['.json'] (1251 files)
Gutenberg: ['.json'] (110 files)
RedPajama: ['.json'] (1005 files)
RedPajama_math: ['.json'] (1668 files)
Redpajama-Arxiv: ['.json'] (216 files)
Wikipedia_en: ['.json'] (194 files)
c4_2023-14: ['.json'] (5001 files)
cosmopedia_v2_parquet: ['.parquet'] (104 files)
dclm-baseline-1.0-shuffled: ['.json'] (1001 files)
fineweb_edu_dedup: ['.parquet'] (234 files)
open-web-math: ['.json'] (418 files)
python_edu: ['.jsonl'] (63 files)
stackv2_Python_shuffled: ['.json'] (513 files)
the-stack-v2-train-smol: ['.json'] (838 files)


In [23]:
# Preview a single shard of "python_edu": stream its first matching file and
# print the first row. Every other dataset is skipped, and the loop stops once
# "python_edu" has been handled.
datasets = {}
for dataset_name, (extention, dataset_type) in dataset_types.items():
    if dataset_name != "python_edu":
        continue

    files = glob(f"/home/haolin.chen/sfr-text-diffusion-model-research/data/xgen_cleaned_data/{dataset_name}/*{extention}")
    try:
        datasets[dataset_name] = load_dataset(
            dataset_type,
            data_files=files[:1],
            streaming=True,
            split="train",
        )
        iterator = iter(datasets[dataset_name])
        row = next(iterator)
        print(dataset_name, datasets[dataset_name], row.keys(), row)
        print("--------------------------------")
    except Exception as err:
        # Best-effort preview: report the failure rather than abort the cell.
        print(f"{dataset_name}: {err}")
    break

python_edu IterableDataset({
    features: Unknown,
    n_shards: 1
}) dict_keys(['blob_id', 'repo_name', 'path', 'length_bytes', 'score', 'int_score', 'text']) {'blob_id': '55884a59514464a78f8002779532a7eb01b8331c', 'repo_name': 'sudajzp/jzp-s-python', 'path': '/FBNQ_py/Fib_circle.py', 'length_bytes': 854, 'score': 3.84375, 'int_score': 4, 'text': "#coding utf-8\n'''\n斐波那契数列-循环法\n'''\ndef Fib_circle():\n    while True:   # 去掉while循环，只用for循环\n        num_1 = 0\n        num_2 = 1\n        fib_array = [0] # 用于存储计算出的FB数列值\n        m = input('你想要查找的起始项：')\n        n = input('你想要查找的结束项：')\n        if m.isdigit() and n.isdigit():   # 在这个实现函数中，不要进行检验。每个函数只做一个事情\n            m = int(m) # 将输入化为整数型\n            n = int(n)\n            for i in range(n):\n                num_1, num_2 = num_2, num_1 + num_2\n                fib_array.append(num_1)\n            print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')\n            break\n        else:\n            print('请输入有效的正整数')\n\nif __name__ == '

In [None]:
def add_prefix(examples):
    """Copy the "text" column into "prefixed_text" and return the batch.

    Despite the name, no prefix is added: the value is copied unchanged.
    The input ``examples`` mapping is modified in place and returned.
    """
    examples["prefixed_text"] = examples["text"]
    return examples
# NOTE(review): the loading loop visible earlier in this notebook only fills
# datasets["python_edu"], so this "DM_Mathematics" lookup presumably raises
# KeyError unless that dataset is loaded elsewhere — confirm.
mapped_datasets = datasets["DM_Mathematics"].map(add_prefix, batched=True, remove_columns=["text"])

In [28]:
# Print the "text" field of the first five streamed "python_edu" rows, each
# followed by a separator line.
iterator = iter(datasets["python_edu"])

for i, row in enumerate(iterator):
    print(row["text"], "----" * 100, sep="\n")
    if i >= 4:
        break


#coding utf-8
'''
斐波那契数列-循环法
'''
def Fib_circle():
    while True:   # 去掉while循环，只用for循环
        num_1 = 0
        num_2 = 1
        fib_array = [0] # 用于存储计算出的FB数列值
        m = input('你想要查找的起始项：')
        n = input('你想要查找的结束项：')
        if m.isdigit() and n.isdigit():   # 在这个实现函数中，不要进行检验。每个函数只做一个事情
            m = int(m) # 将输入化为整数型
            n = int(n)
            for i in range(n):
                num_1, num_2 = num_2, num_1 + num_2
                fib_array.append(num_1)
            print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')
            break
        else:
            print('请输入有效的正整数')

if __name__ == '__main__':
    Fib_circle()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# Stream every path in `files` as a single "train" split using the JSON loader.
# NOTE(review): this rebinds `dataset`, which other cells use for the HumanEval
# infilling split — a distinct name would avoid order-dependent behavior.
dataset = load_dataset("json", data_files=files, streaming=True, split="train")

In [18]:
# Display the dataset's summary representation.
dataset

IterableDataset({
    features: Unknown,
    n_shards: 63
})

In [22]:
# Pull the first row out of `dataset`.
iterator = iter(dataset)
row = next(iterator)

In [25]:
# Print the "text" field of the current `row`.
print(row["text"])


#coding utf-8
'''
斐波那契数列-循环法
'''
def Fib_circle():
    while True:   # 去掉while循环，只用for循环
        num_1 = 0
        num_2 = 1
        fib_array = [0] # 用于存储计算出的FB数列值
        m = input('你想要查找的起始项：')
        n = input('你想要查找的结束项：')
        if m.isdigit() and n.isdigit():   # 在这个实现函数中，不要进行检验。每个函数只做一个事情
            m = int(m) # 将输入化为整数型
            n = int(n)
            for i in range(n):
                num_1, num_2 = num_2, num_1 + num_2
                fib_array.append(num_1)
            print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')
            break
        else:
            print('请输入有效的正整数')

if __name__ == '__main__':
    Fib_circle()



In [1]:
# NOTE(review): this reloads the tokenizer under a lower-case "qwen/" repo id
# and without the "<|mask|>" registration performed in the earlier tokenizer
# cell, so `tokenizer.mask_token` may be unset after this cell runs — confirm
# this is intended before re-running cells that build masked queries.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen3-1.7B")



In [2]:
# Display the tokenizer's configuration.
tokenizer

Qwen2TokenizerFast(name_or_path='qwen/Qwen3-1.7B', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized

In [15]:
import json


# Load the saved generations for the HumanEval infilling evaluation run.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open("evaluations/loubnabnl/humaneval_infilling/flex-qwen3-1b-gcs-pretrain-all-data-dataloader-no-split-512_135000_20250630_065407.json", "r", encoding="utf-8") as f:
    generations = json.load(f)

# Number of loaded generations.
len(generations)


164

In [16]:
# Attach the loaded generations as a "completion" column.
# NOTE(review): `dataset` is rebound to a streaming JSON dataset in another
# cell; this presumably expects the HumanEval infilling dataset — confirm the
# intended run order.
dataset = dataset.add_column("completion", generations)

In [17]:
# Export `dataset` to a .jsonl file under the same evaluations directory and
# base name as the generations file.
dataset.to_json("evaluations/loubnabnl/humaneval_infilling/flex-qwen3-1b-gcs-pretrain-all-data-dataloader-no-split-512_135000_20250630_065407.jsonl")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 185.48ba/s]


442614