In [None]:

# ============================================================
# 0. Clean install of Hugging Face stack
# ============================================================
!pip install -q "transformers==4.44.2" "datasets==2.21.0" "accelerate==0.34.2"



In [None]:
"""
Streaming language modeling with poem_sentiment + GPT-Neo

This lab builds a streaming language modeling pipeline that:
  - Loads the `poem_sentiment` dataset in streaming mode.
  - Converts each example into: <sentiment_LABEL> verse_text.
  - Tokenizes the text with a GPT-Neo tokenizer.
  - Uses a rolling buffer to create fixed-length blocks with overlap (stride).
  - Feeds streaming batches into a GPT-Neo causal language model
    and computes loss, perplexity, and simple throughput stats.
"""

from datasets import load_dataset
from transformers import AutoTokenizer, GPTNeoForCausalLM
from torch.utils.data import IterableDataset, DataLoader
import torch
import time


In [None]:
# ============================================================
# 2. Load dataset in streaming mode
# ============================================================
# Columns provided by poem_sentiment: ["id", "verse_text", "label"].
# streaming=True requests an iterable dataset that is read lazily instead of
# being materialised up front.
stream_dataset = load_dataset("poem_sentiment", split="train", streaming=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

In [None]:
# ============================================================
# 3. Tokenizer setup (GPT-Neo + special sentiment tokens)
# ============================================================
model_name = "EleutherAI/gpt-neo-125M"

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# One marker token per sentiment class (<sentiment_0> ... <sentiment_3>);
# these are the prefixes written into the text stream.
special_tokens = ["<sentiment_%d>" % class_id for class_id in range(4)]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

4

In [None]:
# ============================================================
# 4. Build LM text field and tokenize
# ============================================================
def add_label_prefix(example):
    """
    Build the language-modeling text for one example.

    Reads ``example["label"]`` (converted with ``int``) and
    ``example["verse_text"]`` and returns a one-key dict
    ``{"text_for_lm": "<sentiment_LABEL> " + verse_text}``.
    """
    sentiment_tag = "<sentiment_{}> ".format(int(example["label"]))
    return {"text_for_lm": sentiment_tag + example["verse_text"]}

# Step 1: add the "text_for_lm" field (sentiment tag + verse) to every example.
labeled_stream = stream_dataset.map(add_label_prefix)


def tokenize_function(example):
    """Run the module-level ``tokenizer`` on the example's ``text_for_lm`` field."""
    lm_text = example["text_for_lm"]
    return tokenizer(lm_text)


# Step 2: tokenize the prefixed text of every example.
tokenized_stream = labeled_stream.map(tokenize_function)


In [None]:
# ============================================================
# 5. Rolling buffer → fixed-length blocks (with overlap)
# ============================================================
# Number of tokens in every emitted block.
block_size = 128
# Offset between the starts of consecutive blocks; neighbouring blocks therefore
# share block_size - stride tokens.
stride = 96

def group_texts_streaming(dataset_iter, block_size, stride=None):
    """
    Lazily cut a stream of tokenized examples into fixed-length blocks.

    The ``input_ids`` of successive examples are concatenated into a rolling
    buffer. Whenever the buffer holds at least ``block_size`` tokens, its first
    ``block_size`` tokens are emitted and the buffer advances by ``stride``
    tokens, so consecutive blocks share ``block_size - stride`` tokens.
    Tokens left over when the input ends (fewer than ``block_size``) are not
    emitted.

    Args:
        dataset_iter: iterable of mappings that each have an ``"input_ids"`` entry.
        block_size: number of tokens per emitted block.
        stride: advance between block starts; ``None`` means ``block_size``
            (no overlap).

    Yields:
        dict with ``"input_ids"`` (list of ``block_size`` tokens) and
        ``"attention_mask"`` (list of ``block_size`` ones).

    Raises:
        ValueError: if ``stride`` is outside ``[1, block_size]``. Because this
            is a generator, the check runs when iteration starts.
    """
    step = block_size if stride is None else stride
    if step < 1 or step > block_size:
        raise ValueError("stride must be in [1, block_size]")

    pending = []
    for record in dataset_iter:
        pending += record["input_ids"]
        # Drain every full block currently available before reading more input.
        while len(pending) >= block_size:
            block = pending[:block_size]
            del pending[:step]
            yield {"input_ids": block, "attention_mask": [1] * block_size}


In [None]:
# ============================================================
# 6. IterableDataset wrapper
# ============================================================
class StreamingLMIterableDataset(IterableDataset):
    """
    Torch ``IterableDataset`` view over a tokenized example stream.

    Iterating the dataset delegates to ``group_texts_streaming`` with the
    stored stream, block size and stride.
    """

    def __init__(self, hf_iterable_dataset, block_size, stride=None):
        # Only stores the configuration; nothing is read from the stream here.
        self.dataset, self.block_size, self.stride = (
            hf_iterable_dataset,
            block_size,
            stride,
        )

    def __iter__(self):
        """Return a fresh block generator over the wrapped stream."""
        blocks = group_texts_streaming(
            self.dataset,
            block_size=self.block_size,
            stride=self.stride,
        )
        return blocks

# Wrap the tokenized stream so it can be fed to a torch DataLoader as
# fixed-length, overlapping blocks.
grouped_iterable_dataset = StreamingLMIterableDataset(
    hf_iterable_dataset=tokenized_stream, block_size=block_size, stride=stride
)


In [None]:
# ============================================================
# 7. Collate function and DataLoader
# ============================================================
def collate_fn(batch):
    """
    Stack a list of block dicts into a batch of ``torch.long`` tensors.

    Each item must provide ``"input_ids"`` and ``"attention_mask"`` lists of
    equal length across the batch. The returned ``"labels"`` tensor is an
    independent copy of ``"input_ids"`` (causal LM: the targets are the inputs).
    """

    def _stack(field):
        # Gather one field from every item into a 2-D integer tensor.
        return torch.tensor([item[field] for item in batch], dtype=torch.long)

    ids = _stack("input_ids")
    mask = _stack("attention_mask")
    return {"input_ids": ids, "attention_mask": mask, "labels": ids.clone()}

# Batches of 4 blocks each, assembled by collate_fn.
train_loader = DataLoader(
    dataset=grouped_iterable_dataset, batch_size=4, collate_fn=collate_fn
)


In [None]:
# ============================================================
# 8. Load GPT-Neo model
# ============================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPTNeoForCausalLM.from_pretrained(model_name)

# Remember how many rows are pretrained before growing the embedding matrix
# to make room for the added special tokens.
num_pretrained_tokens = model.get_input_embeddings().weight.shape[0]
model.resize_token_embeddings(len(tokenizer))

# The rows added by the resize start out randomly initialised. Because the LM
# head reuses these rows as output weights, untrained rows can receive logits
# far above those of real tokens and absorb most of the probability mass,
# which inflates the loss on every position (the per-token losses of ~25
# recorded for this notebook are the typical symptom). Setting each new row to
# the mean of the pretrained rows keeps the output distribution close to the
# pretrained one.
if len(tokenizer) > num_pretrained_tokens:
    with torch.no_grad():
        input_weight = model.get_input_embeddings().weight
        input_weight[num_pretrained_tokens:] = input_weight[
            :num_pretrained_tokens
        ].mean(dim=0, keepdim=True)

        # When the output projection is a separate tensor (weights not tied),
        # it needs the same treatment; when it is tied, the assignment above
        # already covered it.
        output_layer = model.get_output_embeddings()
        if output_layer is not None and output_layer.weight is not input_weight:
            output_weight = output_layer.weight
            output_weight[num_pretrained_tokens:] = output_weight[
                :num_pretrained_tokens
            ].mean(dim=0, keepdim=True)

model.to(device)
# Inference mode (disables dropout); the loop below only evaluates.
model.eval()


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [None]:
# ============================================================
# 9. Run streaming batches and print summary
# ============================================================
print("Streaming batches from poem_sentiment with GPT-Neo:")

num_batches_to_show = 3   # upper bound; the stream may end earlier
num_batches_seen = 0      # batches actually evaluated
total_seqs = 0
total_tokens = 0          # input tokens fed to the model
predicted_tokens = 0      # positions that contribute to the loss
loss_sum = 0.0            # sum of per-position losses over all batches
decoded_example_shown = False
# perf_counter is a monotonic clock intended for measuring intervals.
start = time.perf_counter()

with torch.no_grad():
    for i, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        bs, seq_len = input_ids.shape
        total_seqs += bs
        total_tokens += bs * seq_len

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        batch_loss = outputs.loss.item()

        # The causal-LM loss is a mean over the shifted positions; since no
        # label is masked here that is bs * (seq_len - 1) positions. Weighting
        # by that count keeps the overall average correct even if a batch is
        # smaller than the others.
        batch_predicted = bs * (seq_len - 1)
        loss_sum += batch_loss * batch_predicted
        predicted_tokens += batch_predicted
        num_batches_seen += 1

        print(f"Batch {i} -> shape={input_ids.shape}, loss={batch_loss:.4f}")

        if not decoded_example_shown:
            print("\nDecoded example from first block:\n")
            print(tokenizer.decode(input_ids[0]))
            print("\n---\n")
            decoded_example_shown = True

        if i + 1 >= num_batches_to_show:
            break

# Note: the elapsed time also covers data loading, printing and decoding.
elapsed = time.perf_counter() - start

if predicted_tokens > 0:
    avg_loss = loss_sum / predicted_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
else:
    # Nothing was evaluated: report "not a number" rather than a made-up value.
    avg_loss = float("nan")
    perplexity = float("nan")

print("\n==================== Summary ====================")
print("Dataset         : poem_sentiment")
print(f"Model           : {model_name}")
print(f"Block size      : {block_size}")
print(f"Stride          : {stride} (overlap={block_size - stride} tokens)")
print(f"Batches         : {num_batches_seen}")
print(f"Total sequences : {total_seqs}")
print(f"Total tokens    : {total_tokens}")
print(f"Average loss    : {avg_loss:.4f}")
print(f"Approx perplexity: {perplexity:.2f}")
if elapsed > 0:
    print(f"Tokens / second : {total_tokens / elapsed:.1f}")
print("=================================================\n")


Streaming batches from poem_sentiment with GPT-Neo:
Batch 0 -> shape=torch.Size([4, 128]), loss=26.2703

Decoded example from first block:

<sentiment_1> with pale blue berries. in these peaceful shades--<sentiment_2> it flows so long as falls the rain,<sentiment_0> and that is why, the lonesome day,<sentiment_3> when i peruse the conquered fame of heroes, and the victories of mighty generals, i do not envy the generals,<sentiment_3> of inward strife for truth and liberty.<sentiment_3> the red sword sealed their vows!<sentiment_2> and very venus of a pipe.<sentiment_2> who the man, who, called a brother.<sentiment_0> and so on. then a worthless gaud or two,<sentiment_2> to hide the orb of truth--and every throne<sentiment_2> the call's more urgent when he journeys

---

Batch 1 -> shape=torch.Size([4, 128]), loss=26.2085
Batch 2 -> shape=torch.Size([4, 128]), loss=24.8933

Dataset         : poem_sentiment
Model           : EleutherAI/gpt-neo-125M
Block size      : 128
Stride          :