In [None]:
from datasets import load_dataset, Dataset, DatasetDict

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

n_samples = 10_000_000_000
context_length = 4096

DATA_DIR = "/home/wyf/orcd/pool/causal-llm/data"
TOKENIZER_DIR = "/home/willi/reverse-model-fineweb-2B/checkpoint-200/"
MODEL_DIR = "/home/willi/reverse-model-fineweb-2B/checkpoint-200/"

dataset = "fineweb-10BT"
model_name = f"reverse-{dataset}-ctx-{context_length}-2B"

In [None]:
# Takes like 30s to load (it's bad)
raw_dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    split="train",
    cache_dir="/home/wyf/orcd/pool/hf-datasets/",
    name="sample-10BT",
)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

001_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

002_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

003_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

004_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

005_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

006_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

007_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

008_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

009_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

010_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

011_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

012_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

013_00000.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9672101 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

In [None]:
from tqdm import tqdm
import sys

def filter_dataset(dataset, n_samples: int = None):
    return (
        dataset
            .select_columns(["text"])
            .map(lambda s: {"text": s["text"][::-1]})
    )
    
# 1k examples: 4.0s

print("Generating split datasets...")
raw_dataset_with_tqdm = [x for x in tqdm(raw_dataset.take(n_samples), total=n_samples)]
split_datasets = (
    Dataset.from_list(list(raw_dataset_with_tqdm))
        .train_test_split(test_size=0.005, seed=0)
)
datasets = DatasetDict({
    "train": filter_dataset(split_datasets["train"]),
    "valid": filter_dataset(split_datasets["test"]),
})

In [2]:
for split_name, dataset in datasets.items():
    dataset.to_parquet(f"{DATA_DIR}/fineweb_{n_samples}/{split_name}.parquet")

NameError: name 'datasets' is not defined

## Import dataset

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# Dataset is too big to fit into memory so we stream
split_datasets = DatasetDict({
    "train": Dataset.load_from_disk(f"{DATA_DIR}/{dataset}/train"),
    "valid": Dataset.load_from_disk(f"{DATA_DIR}/{dataset}/valid"),
})

Loading dataset from disk:   0%|          | 0/92 [00:00<?, ?it/s]

In [7]:
print(list(datasets["train"].take(1))[0]["text"][500::-1])

a pihsnoitaler lanosrep ,efil ,tnemevom si ereht gnieb lanrete s'doG nI -
.pihsnoitaler ni efil lanosrep si doG fo efil lanrete eht taht mriffa ot si enuirt si doG taht ssefnoc oT .1
:)28-67 segap morf setouq selipmoc swollof tahw( stnemetats eerht htiw rammarg siht sezirammus eH .)67p( "efil derahs dna ,ytilautum ,ytinummoc setaerc dna srehto ot flesti fo sevig yleerf taht evol enivid suordnow fo rammarg" a su gnivig sa htiaf nairatinirt fo skaeps eroilgiM
evol enivid fo rammarg eht :ytinirT ehT


In [None]:
# Train tokenizer (7.4s on 1k examples)
# 3m 30s on 200k examples

from transformers import AutoTokenizer, LlamaTokenizer
from tokenizers import SentencePieceBPETokenizer
from tqdm import tqdm

# TODO: take sample
def text_iterator():
    for x in tqdm(datasets["train"]["text"]):
        yield x

spm_tokenizer = SentencePieceBPETokenizer()
spm_tokenizer.train_from_iterator(
    text_iterator(),
    vocab_size=52_000,
    min_frequency=5,
    show_progress=True,
    limit_alphabet=500,
)

In [None]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=spm_tokenizer,
    bos_token="<s>",           # Always added at start
    eos_token="</s>",          # Always added at end  
    unk_token="<unk>",         # Replaces unknown words
    pad_token="<pad>",         # Used for padding shorter sequences
)
tokenizer.save_pretrained("./tokenizers/fineweb_spm_1M")

In [8]:
# Load pretrained tokenizer
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(f"{TOKENIZER_DIR}/fineweb_spm_1M")

In [59]:
# USES CONTEXT LENGTH

context_length = 4096
print(f"Context length: {context_length}")

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

Context length: 4096


In [85]:
# 25s to parse 1k examples
# 4m 40s to parse 10k examples
# 7m 50s to parse 200k examples

tokenized_dataset_train = split_datasets["train"].take(32).map(
    tokenize, batched=True, remove_columns=["text"], batch_size=32)
tokenized_dataset_valid = split_datasets["valid"].take(32).map(
    tokenize, batched=True, remove_columns=["text"], batch_size=32)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [None]:
# Takes a hot minute to save b/c streaming
# 30min for 2M fineweb examples
# 3m for 1/9 of that

print("Saving training dataset...")
tokenized_dataset_train.to_parquet(
    f"{DATA_DIR}/{dataset}/tokenized_{context_length}_train.parquet")
print("Saving valid dataset...")
tokenized_dataset_valid.to_parquet(
    f"{DATA_DIR}/{dataset}/tokenized_{context_length}_valid.parquet")

Saving training dataset...


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saving valid dataset...


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

229432

In [87]:
# Load tokenized datasets
tokenized_dataset = load_dataset(
    "parquet",
    data_files={
        "train": f"{DATA_DIR}/{dataset}/tokenized_{context_length}_train.parquet",
        "valid": f"{DATA_DIR}/{dataset}/tokenized_{context_length}_valid.parquet",
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

In [14]:
print(tokenized_dataset)
print(f"Produced dataset of {tokenized_dataset.num_rows:,} rows, {context_length} tokens each")
print(f"Total tokens: {tokenized_dataset.num_rows * context_length:,}")

Dataset({
    features: ['input_ids'],
    num_rows: 6
})
Produced dataset of 6 rows, 4906 tokens each
Total tokens: 29,436


In [15]:
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model config vocab size: {tokenizer.vocab_size}")
print(f"BOS token ID: {tokenizer.bos_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"PAD token ID: {tokenizer.pad_token_id}")

# Check a sample tokenization
sample_text = "hello world"
tokens = tokenizer(sample_text)
print(f"Sample tokens: {tokens}")

Tokenizer vocab size: 52003
Model config vocab size: 52000
BOS token ID: 52000
EOS token ID: 52001
PAD token ID: 52002
Sample tokens: {'input_ids': [1078, 1143, 1055, 2648, 77, 69], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [88]:
# 3.2s to initialize model

from transformers import LlamaConfig, LlamaForCausalLM
import torch

model_size = "2B"

config = LlamaConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=8192,
    hidden_size=2048 if model_size == "2B" else 3072,
    intermediate_size=16384 if model_size == "2B" else 24576,
    num_hidden_layers=18 if model_size == "2B" else 28,
    num_attention_heads=8 if model_size == "2B" else 16,
    num_key_value_heads=1 if model_size == "2B" else 16,
    rms_norm_eps=1e-5,
    tie_word_embeddings=False,
    rope_scaling=None,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.device("meta"):
    model = LlamaForCausalLM(config)
    print(f"Initialized model on meta device")

model = model.to_empty(device="cuda")

Initialized model on meta device


In [89]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 2194.9M parameters


In [90]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = "<pad>"
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [94]:
# 0.1s to initialize training args

from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=model_name,
    
    # Batch size settings - LEDOM uses global batch size of 1024 sequences
    per_device_train_batch_size=2,  # Micro-batch size per GPU
    per_device_eval_batch_size=1,   # Used in their fine-tuning setup
    gradient_accumulation_steps=1, # To achieve global batch size (adjust based on GPU count)

    eval_strategy="steps",        # Evaluate every N steps
    eval_steps=5000,     # Eval every N steps  
    logging_steps=1,  # More frequent logging to match their monitoring
    
    # Training duration - LEDOM trained for ~51,900 iterations for 7B model
    num_train_epochs=1,  # Keep as 1 epoch since they trained on 435B tokens once
    
    # Optimizer settings - match LEDOM exactly
    optim="adamw_torch",
    learning_rate=2e-4,           # Peak learning rate: 2×10⁻⁴ 
    weight_decay=0.1,             # Matches their setting
    adam_beta1=0.9,               # Adam β₁
    adam_beta2=0.95,              # Adam β₂  
    adam_epsilon=1e-8,            # Adam ε
    
    # Learning rate schedule - LEDOM uses cosine with specific warmup
    lr_scheduler_type="cosine",
    warmup_steps=2000,            # LEDOM uses 2000 warmup iterations
    
    # Gradient settings
    max_grad_norm=1.0,            # Gradient clipping norm
    
    # Precision - LEDOM uses BF16, not FP16
    bf16=True,                    # Use BF16 instead of FP16
    fp16=False,                   # Disable FP16
    
    # Checkpointing
    save_steps=5_000,
    save_total_limit=3,           # Reasonable limit for storage
    save_only_model=True,
    
    # Additional LEDOM-specific settings
    dataloader_num_workers=2,     # For efficiency
    remove_unused_columns=False,  # Keep all data columns
    
    # Disable features not used in LEDOM training
    load_best_model_at_end=False,
    report_to=None,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_valid,
)

print(len(tokenized_dataset_train[0]["input_ids"]))

  trainer = Trainer(


4096


In [95]:
torch.cuda.empty_cache()

In [96]:
# 1m for 1k samples (2.2M tokens)
trainer.train()

Step,Training Loss,Validation Loss


OSError: [Errno 122] Disk quota exceeded

## Test text generation

In [4]:
import torch
from transformers import pipeline

# Device selection
device = 0 if torch.cuda.is_available() else -1

In [6]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(f"{TOKENIZER_DIR}")

In [7]:
# Load the pipeline
pipe = pipeline(
    "text-generation",
    model="/home/willi/reverse-model-fineweb-2B/checkpoint-200/",
    tokenizer=tokenizer,
    device=device,
    top_p=1.0,
    temperature=1.0,
    clean_up_tokenization_spaces=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [10]:
test_text = """
 is the best dessert.
"""[::-1].strip()

In [11]:
text = pipe(test_text, num_return_sequences=1)[0]["generated_text"]

print(f"=== BEGIN GENERATED TEXT [REVERSED] ===")
print(text[::-1])

=== BEGIN GENERATED TEXT [REVERSED] ===
; I could hear it if it has been able to make it with a base; and no one could ever see it, in which every one puts it away; saying, "It was certainly true, and I do seem to give it and yet does it when I put it behind it. Here, then, how can I like to put it up? If he cut it with it. And would it make it out of it and or look upon it, and let it do this we never see it, nor would it make a thing or turn it to it, let it save. To be able to see it blow. How would it have heard about it. The sun? And shall it give it again: it shall put it in to it, and turn it not to; but now there is one, and attn't it take it on it, and whether it be the sun? Now if I see it in it, so far as it places it in it there; it does turn it into its grave, and let it lie in it or does it lie in the ground, and so it turns, without slope, or light; it is ever created in it, and is put into it. But how can it not be so. But that one falls out of it, and carries with it m

In [14]:
# Test direct token generation
import torch

# Load model directly (not pipeline)
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    f"{MODEL_DIR}/reverse-model-2B-fineweb-2000000-batchsize-20/checkpoint-22109",
    device_map="cuda"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
def generate_text(prompt, max_tokens=50):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            temperature=0.7,
        )
    
    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated[::-1]  # Reverse back to normal

# Test it
result = generate_text("is the first planet in the solar system."[::-1], max_tokens=None)
print(result)

planets than any other body in our solar system. So it is interesting to look at this planet because it is the first planet in the solar system.
