In [1]:
import os
from datasets import load_dataset, load_from_disk
from datasets.distributed import split_dataset_by_node
from glob import glob

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_names = [f.split("/")[-1] for f in glob("/home/haolin.chen/sfr-tpu-us-central2-research/b.pang/cl_data/xgen3_3b_config_6_1/*")]
dataset_names

['DM_Mathematics',
 'Falcon-refinedweb',
 'Gutenberg',
 'RedPajama',
 'RedPajama_math',
 'Redpajama-Arxiv',
 'Wikipedia_en',
 'c4_2023-14',
 'cosmopedia_v2_parquet',
 'dclm-baseline-1.0-shuffled',
 'fineweb_edu_dedup',
 'open-web-math',
 'python_edu',
 'stackv2_Python_shuffled',
 'the-stack-v2-train-smol']

In [3]:
# Scan files to determine data types for each dataset
dataset_types = {}

for dataset_name in dataset_names:
    dataset_path = f"/home/haolin.chen/sfr-text-diffusion-model-research/data/xgen_cleaned_data/{dataset_name}"
    files = glob(f"{dataset_path}/*")
    
    if files:
        # Get file extensions
        extensions = set()
        for file in files[:10]:  # Check first 10 files
            if os.path.isfile(file):
                ext = os.path.splitext(file)[1]
                if ext:
                    extensions.add(ext)
        
        print(f"{dataset_name}: {list(extensions)} ({len(files)} files)")
        extension = list(extensions)[0]
        match extension:
            case ".jsonl":
                dataset_types[dataset_name] = (extension, "json")
            case ".json":
                dataset_types[dataset_name] = (extension, "json")
            case ".parquet":
                dataset_types[dataset_name] = (extension, "parquet")
    else:
        print(f"{dataset_name}: No files found")

DM_Mathematics: ['.json'] (42 files)
Falcon-refinedweb: ['.json'] (1251 files)
Gutenberg: ['.json'] (110 files)
RedPajama: ['.json'] (1005 files)
RedPajama_math: ['.json'] (1668 files)
Redpajama-Arxiv: ['.json'] (216 files)
Wikipedia_en: ['.json'] (194 files)
c4_2023-14: ['.json'] (5001 files)
cosmopedia_v2_parquet: ['.parquet'] (104 files)
dclm-baseline-1.0-shuffled: ['.json'] (1001 files)
fineweb_edu_dedup: ['.parquet'] (234 files)
open-web-math: ['.json'] (418 files)
python_edu: ['.jsonl'] (63 files)
stackv2_Python_shuffled: ['.json'] (513 files)
the-stack-v2-train-smol: ['.json'] (838 files)


In [4]:
dataset_types

{'DM_Mathematics': ('.json', 'json'),
 'Falcon-refinedweb': ('.json', 'json'),
 'Gutenberg': ('.json', 'json'),
 'RedPajama': ('.json', 'json'),
 'RedPajama_math': ('.json', 'json'),
 'Redpajama-Arxiv': ('.json', 'json'),
 'Wikipedia_en': ('.json', 'json'),
 'c4_2023-14': ('.json', 'json'),
 'cosmopedia_v2_parquet': ('.parquet', 'parquet'),
 'dclm-baseline-1.0-shuffled': ('.json', 'json'),
 'fineweb_edu_dedup': ('.parquet', 'parquet'),
 'open-web-math': ('.json', 'json'),
 'python_edu': ('.jsonl', 'json'),
 'stackv2_Python_shuffled': ('.json', 'json'),
 'the-stack-v2-train-smol': ('.json', 'json')}

In [6]:
datasets = {}
for dataset_name, (extention, dataset_type) in dataset_types.items():
    files = glob(f"/home/haolin.chen/sfr-text-diffusion-model-research/data/xgen_cleaned_data/{dataset_name}/*{extention}")
    try:
        datasets[dataset_name] = load_dataset(dataset_type, data_files=files[:1], streaming=True, split="train")
        iterator = iter(datasets[dataset_name])
        row = next(iterator)
        print(dataset_name, datasets[dataset_name], row.keys(), row)
        print("--------------------------------")
    except Exception as e:
        print(f"{dataset_name}: {e}")
    break

DM_Mathematics IterableDataset({
    features: Unknown,
    n_shards: 1
}) dict_keys(['text']) {'text': 'est common divisor of 6 and s.\n6\nLet a be 6/(-4) - (0 - 52/8). Suppose a*f + 50 = -2*q + 4*q, -2*q - 3*f = -50. Calculate the highest common factor of 10 and q.\n5\nSuppose 2*n + 4*p - 2769 = -3*n, 2*n - 1109 = -3*p. Let z = -238 + n. What is the highest common divisor of 35 and z?\n35\nSuppose 289 = 4*z - 287. Suppose 2*x - 18 = z. What is the highest common divisor of 54 and x?\n27\nLet y be (-10)/(40/(-658)) + 1/2. What is the greatest common divisor of y and 60?\n15\nLet h = 21790 + -21510. Let b = 9 + 31. Calculate the greatest common divisor of h and b.\n40\nLet p be 27 + (-2)/3*3. Suppose -5*z + 137 + 1154 = 4*g, -4*g - 1004 = -4*z. Let m = z + -55. What is the greatest common factor of p and m?\n25\nLet b be (-2 - (-1364)/22) + -13. Let y be -2*(-518)/2 + -1. Calculate the highest common divisor of b and y.\n47\nSuppose 0*k = 3*k + 15, 3*m + 4*k - 97 = 0. Let w = -2963 + 3

In [None]:
def add_prefix(examples):
    examples["prefixed_text"] = examples["text"]
    return examples
mapped_datasets = datasets["DM_Mathematics"].map(add_prefix, batched=True, remove_columns=["text"])

In [13]:
list(mapped_datasets.take(3))

TypeError: can only concatenate str (not "list") to str

In [9]:
row = next(iter(datasets["cosmopedia_v2_parquet"]))
print(row["text"])
print(row["prompt"])

 In today's ever-evolving world, technology has become an integral part of our lives, shaping the way we learn, work, and communicate. The COVID-19 pandemic has only accelerated this trend, forcing educational institutions worldwide to adapt quickly to remote learning models. As such, social studies integration in elementary education can greatly benefit from incorporating digital tools like those offered by Ruangguru's Online School. Let's explore how educators can effectively leverage these resources to create engaging and meaningful learning experiences for young students.

Firstly, let's define what we mean by social studies integration. Social studies encompasses various disciplines that help students understand their communities, societies, and the wider world around them. These subjects may include history, geography, civics, economics, sociology, and anthropology. By integrating social studies into the curriculum, we aim to foster critical thinking skills, promote cultural awar

In [17]:
dataset = load_dataset("json", data_files=files, streaming=True, split="train")

In [18]:
dataset

IterableDataset({
    features: Unknown,
    n_shards: 63
})

In [22]:
iterator = iter(dataset)
row = next(iterator)

In [25]:
print(row["text"])

#coding utf-8
'''
斐波那契数列-循环法
'''
def Fib_circle():
    while True:   # 去掉while循环，只用for循环
        num_1 = 0
        num_2 = 1
        fib_array = [0] # 用于存储计算出的FB数列值
        m = input('你想要查找的起始项：')
        n = input('你想要查找的结束项：')
        if m.isdigit() and n.isdigit():   # 在这个实现函数中，不要进行检验。每个函数只做一个事情
            m = int(m) # 将输入化为整数型
            n = int(n)
            for i in range(n):
                num_1, num_2 = num_2, num_1 + num_2
                fib_array.append(num_1)
            print(f'你要查找的数列为{list(enumerate(fib_array[m:], m))}')
            break
        else:
            print('请输入有效的正整数')

if __name__ == '__main__':
    Fib_circle()

