In [47]:
from datasets import load_dataset, Dataset, DatasetDict

import os
# Process-wide environment switches, set before any tokenizer or CUDA work in this notebook.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# Token length of every chunk produced by `tokenize`; also embedded in the tokenized parquet file names.
context_length = 1024
# Number of documents pulled from the raw stream (before the train/valid split);
# also embedded in the on-disk parquet paths.
n_samples = 2_000_000

In [2]:
# Machine-specific scratch directory; no code visible in this notebook reads it.
DATA_PATH = "/nobackup1/wyf/"

# Open the train split of DCLM-baseline as a stream (streaming=True) instead of
# downloading it. Author's note: resolving the data files takes roughly 30s.
raw_dataset = load_dataset(
    path="mlfoundations/dclm-baseline-1.0",
    streaming=True,
    split="train",
)

Resolving data files:   0%|          | 0/27838 [00:00<?, ?it/s]

In [None]:
from tqdm import tqdm
import sys

def filter_dataset(dataset, n_samples: int = None):
    """Keep only the ``text`` column of ``dataset`` and reverse every string in it.

    The reversal is the essential preprocessing step of this notebook: the
    model is trained on character-reversed documents.

    Args:
        dataset: Object exposing ``select_columns`` and ``map`` (called in
            that order).
        n_samples: Accepted for backward compatibility; not used here.

    Returns:
        The result of ``dataset.select_columns(["text"]).map(...)``, where the
        mapped function replaces ``text`` with its reversed form.
    """
    def _reverse_text(sample):
        # A slice with step -1 walks the string back to front.
        return {"text": sample["text"][::-1]}

    text_only = dataset.select_columns(["text"])
    return text_only.map(_reverse_text)
    
# 1k examples: 4.0s

print("Generating split datasets...")

# Pull the first n_samples records out of the stream into memory, with a progress bar.
raw_dataset_with_tqdm = [x for x in tqdm(raw_dataset.take(n_samples), total=n_samples)]

# The comprehension above already produced a list, so pass it straight to
# from_list; wrapping it in list(...) again would only duplicate the whole
# container of n_samples records.
split_datasets = (
    Dataset.from_list(raw_dataset_with_tqdm)
        .train_test_split(test_size=0.1, seed=0)  # fixed seed -> reproducible 90/10 split
)

# Reverse the text of both halves; the held-out "test" half is exposed as "valid".
datasets = DatasetDict({
    "train": filter_dataset(split_datasets["train"]),
    "valid": filter_dataset(split_datasets["test"]),
})

Generating split datasets...


 42%|████▏     | 832082/2000000 [12:12<16:34, 1173.83it/s] 

In [None]:
# Write every split of `datasets` to its own parquet file under ./data/dclm_<n_samples>/.
for split_name in datasets:
    dataset = datasets[split_name]
    dataset.to_parquet(f"./data/dclm_{n_samples}/{split_name}.parquet")

In [48]:
# Rebuild `datasets` from the parquet files written for this n_samples, so the
# streaming/splitting cells do not have to be re-run.
datasets = DatasetDict(
    {
        split_name: Dataset.from_parquet(f"./data/dclm_{n_samples}/{split_name}.parquet")
        for split_name in ("train", "valid")
    }
)

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

In [52]:
import random

# Spot check: print one random training document, un-reversed so it is human-readable.
train_split = datasets["train"]
# Draw the index from the split's real length. n_samples is the pre-split count
# (the train split holds only the training portion), and randint includes its
# upper bound, so the previous randint(0, n_samples) could index past the end.
sample_index = random.randrange(len(train_split))
print(train_split[sample_index]["text"][::-1])

Big Thief

Two Hands

  • AllMusic Rating
  • User Ratings (0)
  • Your Rating
  • 在线a久草一级a做爰视频免费观看久草在线新免费观看海外网:此次香港暴力示威有明显“颜色革命”特征


    　　“你要跑？”何愁有橫跨一步擋住了側門。 　　“沒有，去了鴻臚寺，鬼谷子今日在鴻臚寺講授《本經陰符七術》。”在线a久草 　　“為什麼這些人名都被勾掉了？”一级a做爰视频免费观看 　　這兩句詩，雲瑯記得很清楚。 　　雲瑯放下茶杯道︰“首先，御史大夫一定要弄明白一件事，長門宮，與雲氏之所以會興子錢，不是為了牟利。”久草在线新免费观看 　　蘭英，蘭喬性子粗野，如果想要徹底的融入雲氏，無論如何，也要有漢家女子的模樣才好。 Following quickly on the heels of the spacey, artful U.F.O.F. -- by five months, to be exact -- Big Thief's fourth long-player, Two Hands, was recorded just days after its contrasting sister album. However, while U.F.O.F. was tracked at a wooded facility outside of Seattle, the band deliberately moved to the 100-plus-degree environs of a desert studio west of El Paso for Two Hands. The humid-versus-dry distinction makes for a convenient musical simile, as Two Hands commits to a crisper, more jagged sound on a rawer set of indie rock songs. Though less improvised-sounding on the whole than its predecessor, the loose Two Hands was recorded live w

In [None]:
# 7.4s on 1k examples
# 3m 30s on 200k examples

from transformers import AutoTokenizer, LlamaTokenizer
from tokenizers import SentencePieceBPETokenizer
from tqdm import tqdm

def text_iterator(batch_size: int = 1_000):
    """Yield every training document, one string at a time, with a progress bar.

    Rows are read from ``datasets["train"]`` in slices of ``batch_size``
    instead of through ``datasets["train"]["text"]``, which reads the whole
    text column before the first item is yielded.

    Args:
        batch_size: Number of rows fetched per slice. Only affects memory use
            and read granularity; the yielded strings and their order are the
            same for any positive value.

    Yields:
        str: The ``text`` field of each training row, in dataset order.
    """
    train_split = datasets["train"]
    total_rows = len(train_split)
    with tqdm(total=total_rows) as progress:
        for start in range(0, total_rows, batch_size):
            # Slicing a Dataset returns a dict of column lists for just these rows.
            batch_texts = train_split[start:start + batch_size]["text"]
            yield from batch_texts
            progress.update(len(batch_texts))

# Train a SentencePiece-style BPE tokenizer from scratch on the (reversed)
# training texts produced by text_iterator().
spm_tokenizer = SentencePieceBPETokenizer()
training_options = dict(
    vocab_size=52_000,
    min_frequency=5,
    limit_alphabet=500,
    show_progress=True,
)
spm_tokenizer.train_from_iterator(text_iterator(), **training_options)

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained `spm_tokenizer` in the transformers fast-tokenizer interface
# and declare which strings act as the special tokens.
# NOTE(review): nothing visible in this notebook configures a post-processor, so
# the BOS/EOS strings are presumably NOT inserted automatically when text is
# encoded — confirm before relying on them being present in training data.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=spm_tokenizer,
    bos_token="<s>",           # beginning-of-sequence marker
    eos_token="</s>",          # end-of-sequence marker
    unk_token="<unk>",         # stand-in for pieces missing from the vocabulary
    pad_token="<pad>",         # filler used when padding shorter sequences
)
# The target directory name is hard-coded and does not follow `n_samples`.
tokenizer.save_pretrained("./tokenizers/spm_200k")

In [None]:
from transformers import PreTrainedTokenizerFast

# Reload the tokenizer from the directory used by `save_pretrained` in this
# notebook, so the tokenizer-training cells do not have to be re-run.
tokenizer = PreTrainedTokenizerFast.from_pretrained("./tokenizers/spm_200k")

In [None]:
# USES CONTEXT LENGTH
def tokenize(element):
    """Turn a batch of texts into token-id chunks of exactly ``context_length``.

    The global ``tokenizer`` is called with truncation and
    ``return_overflowing_tokens=True`` at ``max_length=context_length``; every
    returned chunk whose reported length differs from ``context_length`` is
    discarded.

    Args:
        element: Mapping with a ``"text"`` entry, passed to the tokenizer as-is.

    Returns:
        dict: ``{"input_ids": [...]}`` holding only the full-length chunks.
    """
    encoded = tokenizer(
        element["text"],
        max_length=context_length,
        truncation=True,
        return_length=True,
        return_overflowing_tokens=True,
    )
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

In [None]:
# Author's timing notes: 25s for 1k examples, 4m 40s for 10k, 7m 50s for 200k.
def _tokenize_split(split_name):
    """Run `tokenize` over one split in batches of 32 rows, dropping the raw text column."""
    return datasets[split_name].map(
        tokenize,
        batched=True,
        batch_size=32,
        remove_columns=["text"],
    )

tokenized_dataset = _tokenize_split("train")
tokenized_dataset_valid = _tokenize_split("valid")

In [45]:
# Persist both tokenized splits as parquet (author's note: about 3.0s).
# File names encode both n_samples and context_length.
tokenized_prefix = f"./data/dclm_{n_samples}_tokenized_{context_length}"
tokenized_dataset.to_parquet(f"{tokenized_prefix}.parquet")
tokenized_dataset_valid.to_parquet(f"{tokenized_prefix}_valid.parquet")

NameError: name 'tokenized_dataset' is not defined

In [46]:
# Reload the tokenized splits from parquet. Needs `Dataset`, `n_samples` and
# `context_length` from the first cell of the notebook to be defined.
tokenized_prefix = f"./data/dclm_{n_samples}_tokenized_{context_length}"
tokenized_dataset = Dataset.from_parquet(f"{tokenized_prefix}.parquet")
tokenized_dataset_valid = Dataset.from_parquet(f"{tokenized_prefix}_valid.parquet")

NameError: name 'Dataset' is not defined

In [None]:
# Summarise the tokenized training set: its repr, the row count, and the token
# total (every row holds exactly context_length tokens, see `tokenize`).
print(tokenized_dataset)
n_rows = tokenized_dataset.num_rows
print(f"Produced dataset of {n_rows:,} rows, {context_length} tokens each")
print(f"Total tokens: {n_rows * context_length:,}")

In [None]:
# Sanity-check the tokenizer before sizing the model.
# len(tokenizer) is the value the model config uses for vocab_size.
print(f"Tokenizer vocab size: {len(tokenizer)}")
# tokenizer.vocab_size is a property of the tokenizer, not of the model config,
# and can differ from len(tokenizer); the old "Model config vocab size" label
# was misleading.
print(f"Tokenizer base vocab size (excluding added tokens): {tokenizer.vocab_size}")
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"PAD token ID: {tokenizer.pad_token_id}")

# Check a sample tokenization
sample_text = "hello world"
tokens = tokenizer(sample_text)
print(f"Sample tokens: {tokens}")

In [None]:
# 3.2s to initialize model

from transformers import LlamaConfig, LlamaForCausalLM
import torch

# Architecture preset: "2B" selects the smaller shape, any other value the larger one.
model_size = "2B"

config = LlamaConfig(
    vocab_size=len(tokenizer),  # includes tokens added on top of the base vocabulary
    max_position_embeddings=8192,
    hidden_size=2048 if model_size == "2B" else 3072,
    intermediate_size=16384 if model_size == "2B" else 24576,
    num_hidden_layers=18 if model_size == "2B" else 28,
    num_attention_heads=8 if model_size == "2B" else 16,
    num_key_value_heads=1 if model_size == "2B" else 16,
    rms_norm_eps=1e-5,
    tie_word_embeddings=False,
    rope_scaling=None,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Seed the weight initialisation so repeated runs start from the same model.
torch.manual_seed(0)

# Construct the model directly on the GPU. The previous version built it on the
# "meta" device and then called to_empty(device="cuda"), which only allocates
# storage: every parameter and buffer was left as uninitialised memory, so
# training started from arbitrary values. Building under the CUDA device context
# runs the normal weight initialisation and buffer computation on real tensors.
with torch.device("cuda"):
    model = LlamaForCausalLM(config)
print("Initialized model on cuda")

In [None]:
# Count the elements of every parameter tensor and report the total in millions.
# NOTE: this rebinds `model_size`, which the model-config cell uses as the "2B"
# preset string, to an integer parameter count.
model_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# (Re)assign the special-token strings on the tokenizer, in the same order as
# before: pad, bos, eos, unk.
special_tokens = {
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
}
for attribute, token in special_tokens.items():
    setattr(tokenizer, attribute, token)

# mlm=False selects the collator's causal (non-masked) language-modelling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# 0.1s to initialize training args

from transformers import Trainer, TrainingArguments

# Training hyper-parameters, grouped by concern. According to the original
# author's notes these are meant to mirror the LEDOM training recipe; that
# cannot be verified from this notebook.
training_settings = {
    "output_dir": "reverse-model-2B",

    # Batch size. NOTE(review): the author's notes mention a LEDOM global batch
    # of 1024 sequences, but the values here give 1 sequence per device per
    # optimizer step — confirm this is intended.
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 1,

    # Evaluation / logging cadence, in optimizer steps.
    "eval_strategy": "steps",
    "eval_steps": 5000,
    "logging_steps": 1,

    # A single pass over the tokenized training set.
    "num_train_epochs": 1,

    # AdamW optimizer.
    "optim": "adamw_torch",
    "learning_rate": 2e-4,
    "weight_decay": 0.1,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-8,

    # Cosine learning-rate schedule with a 2000-step warmup.
    "lr_scheduler_type": "cosine",
    "warmup_steps": 2000,

    # Gradient clipping norm.
    "max_grad_norm": 1.0,

    # Train in bfloat16 rather than float16.
    "bf16": True,
    "fp16": False,

    # Checkpoint every 5000 steps and keep at most 3.
    "save_steps": 5_000,
    "save_total_limit": 3,
    "save_only_model": True,
    "load_best_model_at_end": False,

    # Data loading.
    "dataloader_num_workers": 2,
    "remove_unused_columns": False,
}

args = TrainingArguments(**training_settings)

# Wire the model, its training configuration, the tokenized train/validation
# splits, the causal-LM collator and the tokenizer into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Ask PyTorch to release cached, currently unused CUDA memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run the training loop configured on `trainer`.
# Author's timing note: about 1 minute for 1k samples (2.2M tokens).
trainer.train()

## Test text generation

In [1]:
import os
import torch
from transformers import pipeline

# Directory the Trainer in this notebook writes checkpoints to (its output_dir).
# The previous version listed "./reverse-model" and then discarded the result in
# favour of a hard-coded path, so the cell could fail on a directory it never used.
base_dir = "reverse-model-2B"

# Checkpoint to load. Set to None to pick the highest-numbered
# "checkpoint-<step>" directory found under base_dir.
pinned_checkpoint = "checkpoint-1600"

if pinned_checkpoint is not None:
    checkpoint_name = pinned_checkpoint
else:
    # Order by numeric step: a plain string sort would place
    # "checkpoint-10000" before "checkpoint-5000".
    steps_to_dirs = {}
    for entry in os.listdir(base_dir):
        prefix, _, step = entry.rpartition("-")
        if prefix == "checkpoint" and step.isdecimal() and os.path.isdir(os.path.join(base_dir, entry)):
            steps_to_dirs[int(step)] = entry
    if not steps_to_dirs:
        raise FileNotFoundError(f"No checkpoint-<step> subdirectories found in {base_dir}")
    checkpoint_name = steps_to_dirs[max(steps_to_dirs)]

first_checkpoint = os.path.join(base_dir, checkpoint_name)
# Fail early with a clear message instead of a less obvious loading error.
if not os.path.isdir(first_checkpoint):
    raise FileNotFoundError(f"Checkpoint directory not found: {first_checkpoint}")

print(f"Using model from: {first_checkpoint}")

# Device selection: first CUDA device when available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Load the text-generation pipeline from the chosen checkpoint.
pipe = pipeline(
    "text-generation",
    model=first_checkpoint,
    device=device,
)

Using model from: reverse-model-2B/checkpoint-1600


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [39]:
# Second pipeline over the same checkpoint, using nucleus (top-p) sampling.
# top_p only takes effect in sampling modes, so do_sample=True is required;
# without it decoding stays greedy and top_p is ignored.
pipe1 = pipeline(
    "text-generation",
    model=first_checkpoint,
    device=device,
    do_sample=True,
    top_p=0.99,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
from transformers import PreTrainedTokenizerFast

# Load the saved tokenizer so the generated text can be inspected token by token
# in this section without re-running the training part of the notebook.
tokenizer = PreTrainedTokenizerFast.from_pretrained("./tokenizers/spm_200k")

In [44]:
# The prompt is written in normal reading order and reversed before it is sent
# to the model. `text` keeps the raw (still reversed) generation.
prompt = "is a blue flower."
generations = pipe1(prompt[::-1], num_return_sequences=1)
text = generations[0]["generated_text"]

# Flip the generation back so it reads left to right.
print("=== BEGIN GENERATED TEXT [REVERSED] ===")
print(text[::-1].strip())

=== BEGIN GENERATED TEXT [REVERSED] ===
...................................................................................................................................................................................................................................that is the cause of the diarrhea,which is similar to that of the flu,but that is a soda,and that is one of which is a blue flower.


In [5]:
# The prompt is written in normal reading order and reversed before it is sent
# to the model. `text` keeps the raw (still reversed) generation.
prompt = "And that is why the sky is blue."
generations = pipe(prompt[::-1], num_return_sequences=1)
text = generations[0]["generated_text"]

# Flip the generation back so it reads left to right.
print("=== BEGIN GENERATED TEXT [REVERSED] ===")
print(text[::-1].strip())

=== BEGIN GENERATED TEXT [REVERSED] ===
do - we just have to know that these are the things that we need to communicate with each other.

    I think part of the problem is that we need to act on it,and to see that it's not theirs.But not all of them - so all we know is that it needs to be part of the same thing.

    The other side of it is that it is not in the sense that it is what it is,or something that happens to be part of the situation.

    This seems to be correct in my final paragraph,and you would be trying to imagine how difficult it is for it to pick up something that has something in it - meaning that it would be unable to allow it to do anything.

    It doesn't matter to me.
    It cannot be done without it in order for it to be associated with it,then there is no reason for it to be needed.

    But that is out of the question.

    The problem with that is that it has problems with it - and not that it exists.

    There is no such thing.

    And all of that is done

In [7]:
# Split the raw (still reversed) generation into tokenizer pieces, then show
# how many pieces there are followed by the pieces themselves.
tokens = tokenizer.tokenize(text)
print(len(tokens), tokens, sep="\n")

265
['▁.eulb', '▁si', '▁yks', '▁eht', '▁yhw', '▁si', '▁taht', '▁dnA', '▁', '▁', '▁', '▁\n\n.', 'tne', 'namrep', '▁os', '▁eb', "▁t'ndluow", '▁ti', '▁tub', ',de', 'cnalab', '▁eb', '▁nac', '▁ti', '▁taht', '▁os', '▁enod', '▁si', '▁taht', '▁fo', '▁lla', '▁dnA', '▁', '▁', '▁', '▁\n\n.', 'gniht', '▁hcus', '▁on', '▁si', '▁erehT', '▁', '▁', '▁', '▁\n\n.s', 'tsixe', '▁ti', '▁taht', '▁ton', '▁dna', '▁-', '▁ti', '▁htiw', '▁smelborp', '▁sah', '▁ti', '▁taht', '▁si', '▁taht', '▁htiw', '▁melborp', '▁ehT', '▁', '▁', '▁', '▁\n\n.', 'noitseuq', '▁eht', '▁fo', '▁tuo', '▁si', '▁taht', '▁tuB', '▁', '▁', '▁', '▁\n\n.de', 'deen', '▁eb', '▁ot', '▁ti', '▁rof', '▁nosaer', '▁on', '▁si', '▁ereht', '▁neht', ',', 'ti', '▁htiw', '▁detaicossa', '▁eb', '▁ot', '▁ti', '▁rof', '▁redro', '▁ni', '▁ti', '▁tuohtiw', '▁enod', '▁eb', '▁tonnac', '▁tI', '▁', '▁', '▁', '▁\n.', 'em', '▁ot', '▁rettam', "▁t'nseod", '▁tI', '▁', '▁', '▁', '▁\n\n.', 'gnihtyna', '▁od', '▁ot', '▁ti', '▁wolla', '▁ot', '▁elbanu', '▁eb', '▁dluow', '▁ti', '▁t