In [1]:
# Imports

# --- Standard Library ---
import os
import random
import time
import math
from pathlib import Path
from collections import Counter

# --- Third-Party Libraries ---
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split

# --- TensorFlow / Keras ---
import tensorflow as tf
from tensorflow.keras import Input, Sequential, layers, models
from tensorflow.keras.datasets import imdb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import CosineDecay, ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Layers
from tensorflow.keras.layers import (
    Embedding, GlobalAveragePooling1D, Dense, 
    LSTM, GRU, Dropout, SpatialDropout1D, Bidirectional, Lambda
)

# Preprocessing
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# NOTE(review): Transformers documents USE_TF / USE_TORCH as its backend switches; it is not
# clear that the library reads this variable name at all — confirm before relying on it.
os.environ["TRANSFORMERS_NO_TORCH"] = "1"


# --- Reproducibility Settings ---
random_seed = 42

'''
# OS-level determinism
os.environ['PYTHONHASHSEED'] = '0'        # Disable hash randomization
os.environ['TF_DETERMINISTIC_OPS'] = '1'  # Make TF ops deterministic (where possible)
os.environ['TF_CUDNN_DETERMINISM'] = '1'  # CuDNN deterministic (if using GPU)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow INFO and WARNING messages

# Set seeds
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
'''

# --- Utility Function ---
def format_hms(seconds):
    """Format an elapsed duration as a zero-padded ``HH:MM:SS`` string.

    The hours field is not capped at 23, so durations of a day or more stay
    readable (90061 seconds -> "25:01:01") instead of wrapping around the way a
    clock time does.

    Args:
        seconds: Elapsed time in seconds (int or float). Fractional seconds
            are truncated; negative values are treated as zero.

    Returns:
        The duration formatted as "HH:MM:SS".
    """
    whole_seconds = max(0, int(seconds))  # a negative duration has no meaningful rendering
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

'''
# Example usage for timing code:

start_time = time.time()

# <your code here>

print("Execution Time:", format_hms(time.time() - start_time))
'''
None

In [6]:
# If needed (comment out if your env already has these)
# Note: 'sentencepiece' is required for T5 tokenization.
# %pip install -q transformers torch sentencepiece accelerate


### BERT-style masked language modeling (bidirectional)

Teaching notes:

BERT uses masked language modeling (bidirectional context).

Different pretraining objective vs GPT → different strengths (representation learning, classification, QA heads, etc.).

### BERT Classifies the IMDB Movie Review Dataset

In [None]:
!pip -q install transformers datasets accelerate scikit-learn

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# 1) Data: the DistilBERT tokenizer and the "imdb" dataset
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
ds = load_dataset("imdb")


def tok(ex):
    """Tokenize the "text" field of a batch, truncating with max_length=256."""
    encoded = tokenizer(ex["text"], truncation=True, max_length=256)
    return encoded


# Map tok over every split in batches; the raw "text" column is dropped from the result.
ds_tok = ds.map(tok, batched=True, remove_columns=["text"])

# 2) Model: DistilBERT with a 2-label classification head, plus a padding collator
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


# 3) Metrics
def metrics(eval_pred):
    """Return accuracy and F1 for a (logits, labels) pair.

    The predicted class is the argmax of the logits along the last axis.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return {"accuracy": accuracy, "f1": f1}

# 4) Train
# Evaluation and checkpointing both happen once per epoch; load_best_model_at_end=True
# relies on the two strategies matching so a checkpoint exists for every evaluated epoch.
args = TrainingArguments(
    output_dir="imdb-distilbert",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,              # 2–3 is usually enough
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy` (Transformers >= 4.41)
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
)

# Per-epoch evaluation uses a fixed (seed=42) 5,000-example sample of the test split to save time.
# NOTE(review): `args` sets load_best_model_at_end=True, so the "best" checkpoint is chosen on
# this test-split sample — consider carving a validation split out of train instead.
eval_subset = ds_tok["test"].shuffle(seed=42).select(range(5000))  # speed-up eval

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=metrics,
)

trainer.train()

# Final evaluation on the complete test split (left as the cell's last expression so it is displayed).
trainer.evaluate(ds_tok["test"])


### BERT Classifies Sentences as Sequential or Random

In [None]:
# !pip -q install transformers datasets nltk accelerate scikit-learn

import random, nltk, numpy as np
nltk.download('punkt')
# Recent NLTK releases load sentence-tokenizer data from 'punkt_tab' rather than 'punkt';
# fetching both keeps nltk.sent_tokenize working across NLTK versions.
nltk.download('punkt_tab')

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

# 1) Build sentence pairs from raw documents
docs = [
    # Put any longer texts here (articles, chapters). Add multiple docs for variety.
    """Natural language processing is widely used in industry. It powers search, chatbots, and content moderation. 
    Modern models rely on large-scale pretraining. Fine-tuning adapts them to specific tasks.""",
    """Decision trees split data using impurity metrics. Random forests reduce variance by bagging multiple trees.
    Boosting methods like XGBoost focus on hard-to-predict examples. Regularization helps avoid overfitting."""
]

# Split each document into sentences separately so that a "consecutive" pair can never
# straddle two unrelated documents.
doc_sentences = [nltk.sent_tokenize(d) for d in docs]
sentences = [s for doc in doc_sentences for s in doc]  # flat list; negatives index into it

# Dedicated RNG: the negative pairs are reproducible and the global `random` state is untouched.
rng = random.Random(42)

pairs, labels = [], []

# Positive pairs: sentence k and sentence k+1 of the same document
for doc in doc_sentences:
    for k in range(len(doc) - 1):
        pairs.append((doc[k], doc[k + 1]))
        labels.append(1)

# Negative pairs: one random mismatch for every sentence that anchors a positive pair
offset = 0  # position in `sentences` of the current document's first sentence
for doc in doc_sentences:
    for k in range(len(doc) - 1):
        i = offset + k
        # Exclude the anchor itself (no self-pairs) and its true next sentence.
        candidates = [j for j in range(len(sentences)) if j != i and j != i + 1]
        if not candidates:
            continue  # corpus too small to supply a mismatch for this anchor
        pairs.append((sentences[i], sentences[rng.choice(candidates)]))
        labels.append(0)
    offset += len(doc)

# 2) Hugging Face dataset: one column per pair member plus the label, split 75/25 (seed=42)
data = {
    "s1": [first for first, _second in pairs],
    "s2": [second for _first, second in pairs],
    "label": labels,
}
ds = Dataset.from_dict(data).train_test_split(test_size=0.25, seed=42)

# 3) Tokenize as sentence pairs
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tok(batch):
    """Tokenize the "s1"/"s2" columns together as sentence pairs (truncation on, max_length=256)."""
    encoded = tokenizer(batch["s1"], batch["s2"], truncation=True, max_length=256)
    return encoded


collator = DataCollatorWithPadding(tokenizer=tokenizer)
# The raw sentence columns are dropped once tokenized.
ds_tok = ds.map(tok, batched=True, remove_columns=["s1", "s2"])

# 4) Model + metrics: 2-label classification head on the checkpoint named by `model_name`
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


def metrics(pred):
    """Return accuracy and F1 for a (logits, labels) pair, predicting the argmax class."""
    scores, y_true = pred
    y_pred = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return {"accuracy": accuracy, "f1": f1}

# 5) Train
# load_best_model_at_end=True requires the save and evaluation strategies to match. Without an
# explicit save_strategy the default ("steps") conflicts with per-epoch evaluation and
# TrainingArguments raises a ValueError, so checkpoints are saved per epoch as well.
args = TrainingArguments(
    output_dir="sent-follow-distilbert",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy` (Transformers >= 4.41)
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
)

# Train on the "train" split and evaluate on the "test" split produced by train_test_split.
train_split = ds_tok["train"]
eval_split = ds_tok["test"]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=metrics,
)

trainer.train()

# Evaluate on the eval_dataset given above (last expression, so the metrics are displayed).
trainer.evaluate()


In [7]:
import os

# Make Transformers use TensorFlow and ignore PyTorch.
# These variables are read when `transformers` is first imported, so this cell only has an
# effect if it runs before any `import transformers` in the kernel session.
# NOTE: with PyTorch disabled, the PyTorch-based `Trainer` cells above cannot run in the same session.
os.environ["TRANSFORMERS_NO_PYTORCH"] = "1"   # original flag, kept for backward compatibility
os.environ["USE_TF"] = "1"                    # backend switches that Transformers documents
os.environ["USE_TORCH"] = "0"


In [8]:
import importlib.util  # explicit: `import importlib` alone does not guarantee the `util` submodule is loaded
import tensorflow as tf
import transformers

# Environment check; the trailing comments record the values this notebook was run with.
print("TF:", tf.__version__)  # should be 2.17.0
print("tf_keras importable?", importlib.util.find_spec("tf_keras") is not None)  # True
print("Transformers:", transformers.__version__)  # 4.44.2


TF: 2.17.0
tf_keras importable? True
Transformers: 4.44.2


In [20]:
# Lower the Transformers log verbosity to errors only for the cells that follow.
# Note: from here on the name `logging` refers to transformers' logging module,
# not the standard-library `logging`.
from transformers import logging
logging.set_verbosity_error()


In [21]:
from transformers import pipeline

# Fill-mask pipeline on bert-base-uncased with the TensorFlow backend.
fill = pipeline("fill-mask", model="bert-base-uncased", framework="tf")  # or distilbert

# Ask for the three highest-scoring candidates for the [MASK] position.
top_candidates = fill("The capital of France is [MASK].", top_k=3)
print(top_candidates)


[{'score': 0.4167880415916443, 'token': 3000, 'token_str': 'paris', 'sequence': 'the capital of france is paris.'}, {'score': 0.07141624391078949, 'token': 22479, 'token_str': 'lille', 'sequence': 'the capital of france is lille.'}, {'score': 0.06339256465435028, 'token': 10241, 'token_str': 'lyon', 'sequence': 'the capital of france is lyon.'}]


In [35]:
from transformers import pipeline

# Text-generation pipeline on distilgpt2 with the TensorFlow backend.
gen = pipeline("text-generation", model="distilgpt2", framework="tf")

# Sample (do_sample=True, temperature=0.8) up to 100 new tokens continuing the prompt.
prompt_text = "The three economists shared the prize for research"
continuations = gen(prompt_text, max_new_tokens=100, do_sample=True, temperature=0.8)
print(continuations[0]["generated_text"])


The three economists shared the prize for research on the economics of capitalism at the Oxford Business School: "The authors' findings demonstrate that capitalism is not an irrational pursuit of individual value and therefore should not be pursued by government and private individuals."


In [37]:
# Same prompt and settings as the previous cell. Because do_sample=True draws the
# continuation at random, rerunning produces a different text (compare the two outputs).
from transformers import pipeline
gen = pipeline("text-generation", model="distilgpt2", framework="tf")
print(gen("The three economists shared the prize for research",
          max_new_tokens=100, do_sample=True, temperature=0.8)[0]["generated_text"])


The three economists shared the prize for research-supported research in recent years. He is now a Harvard News Fellow.


### Baseline T5-Small

In [28]:
# Source: The New York Times — https://www.nytimes.com/2025/10/13/business/nobel-prize-economics.html

text = "The Nobel Memorial Prize in Economic Sciences was awarded on Monday to Joel Mokyr of Northwestern University, Philippe Aghion of Collège de France, INSEAD and the London School of Economics and Peter Howitt of Brown University for their work on how innovation drives economic growth. \
The three economists shared the prize for research that explains the relationship between technological progress and sustained economic growth that has improved living standards, health and quality of life for people around the world. The prize committee said that their work would help ensure that growth was maintained and could be steered in the direction to support humankind. \
For most of human history, there was very little economic growth, John Hassler, the chair of the prize committee, said in a ceremony announcing the award. Despite important discoveries that improved living conditions, growth leveled off. But over the past two centuries, that changed. 'Sustained economic growth, driven by a continuous stream of technological innovations and improvements, has replaced stagnation,' Mr. Hassler said. \
The award for the economists’ work comes as artificial intelligence has become an increasingly dominant force in the global economy and has the potential to spur another technology-driven boom. But other policies are expected to restrain economic growth, such as the Trump administration’s tariffs and other protectionist policies, like China’s curbs on exports of rare earth minerals and battery-making equipment. \
Mr. Mokyr was awarded half the prize for his work in explaining how sustained economic growth became the norm. \
He showed that for innovations to succeed and become a self-generating process, people needed a scientific explanation for why the breakthroughs worked. Before the industrial revolution, a lack of this knowledge made it difficult to build on new discoveries, the committee said. \
Mr. Mokyr’s work, such as his book 'A Culture of Growth: Origins of the Modern Economy,' has also emphasized the importance of society being open to new ideas and allowing change. Mr. Mokyr has tended to take an optimistic view on the potential for more economic growth. \
Mr. Aghion and Mr. Howitt shared the other half of the award for what the committee described as 'the theory of sustained growth through creative destruction.' They built a mathematical model for growth, with creative destruction as a core element. \
The committee described creative destruction as 'an endless process in which new and better products replace the old.' They used the example of the telephone, in which each new version made the previous one obsolete, from the rotary dial phone in the early 1900s through to today’s smartphones. \
Mr. Aghion and Mr. Howitt’s work shows how economic growth can continue despite companies being sidelined by the innovation of other firms. Their work can support policymakers in designing research and development policies, the committee said. \
The laureates’ work shows 'we should not take progress for granted,' Kerstin Enflo, a member of the Nobel committee, said during a news conference. \
'Instead, society must keep an eye on the factors that generate and sustain economic growth,' she added. 'These are science-based innovation, creative destruction and a society open for change.' \
Mr. Aghion said that the prize came as 'a huge surprise.' \
'I can’t find the words to express what I feel,' he added by telephone at the news conference. He also warned against forces like protectionism and tariffs that obstruct growth and are gaining traction. \
Trade barriers and deglobalization make markets more fragmented and reduce opportunities to exchange ideas, he said."


In [38]:
from transformers import pipeline

# Baseline summarizer: t5-small with the TensorFlow backend. Later cells reuse `summ`.
summ = pipeline("summarization", model="t5-small", framework="tf")

baseline_output = summ(text, max_length=100, min_length=10)
print(baseline_output[0]["summary_text"])


the three economists shared the prize for research that explains the relationship between technological progress and sustained economic growth that has improved living standards, health and quality of life for people around the world . the prize committee said that their work would ensure that growth was maintained and could be steered in the direction to support humankind .


## 1) Decoding knobs (quality vs speed)


See the **Appendix** for more details!

In [39]:

# summ = pipeline("summarization", model="t5-small", framework="tf")

# Length limits shared by both decoding strategies below.
length_limits = {"max_length": 120, "min_length": 40}

# Beam search (deterministic, usually better)
beam_output = summ(text, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3,
                   **length_limits)
print(beam_output[0]["summary_text"])

# Sampling (more diverse; can drift)
sampled_output = summ(text, do_sample=True, top_p=0.9, top_k=50, temperature=0.8,
                      no_repeat_ngram_size=3, **length_limits)
print(sampled_output[0]["summary_text"])

the three economists shared the prize for research that explains the relationship between technological progress and sustained economic growth that has improved living standards, health and quality of life for people around the world . the prize committee said that their work would ensure that growth was maintained and could be steered in the direction to support humankind .
the three economists shared the prize for research that explains the relationship between technological progress and sustained economic growth that has improved living standards, health and quality of life for people around the world . the prize committee said that their work would ensure that growth was maintained and could be steered in the direction to support humankind .


## 2) Style controls (T5 follows instructions well)

In [None]:
def t5(prompt):
    """Summarize the notebook-level `text` with `prompt` placed in front of it.

    Uses the shared `summ` pipeline with 4 beams, min_length=20 and max_length=80,
    and returns the summary string of the first result.

    NOTE(review): for T5 checkpoints the summarization pipeline may already prepend its
    own "summarize: " prefix, and t5-small is not an instruction-tuned model — confirm
    that these prompts actually change the output.
    """
    prompted_input = f"{prompt} {text}"
    outputs = summ(prompted_input, max_length=80, min_length=20, num_beams=4)
    return outputs[0]["summary_text"]


style_prompts = (
    "summarize:",                          # default
    "summarize in 1 sentence:",
    "summarize in 3 bullet points:",       # will often produce bullets
    "write a headline:",
    "summarize for a 9th-grade reader:",
)
for instruction in style_prompts:
    print(t5(instruction))

## 3) Headline + bullets in one go

In [None]:
# Two separate generations over the same article: a short headline (max_length=20) ...
headline_output = summ(f"write a headline: {text}", max_length=20, num_beams=4)
headline = headline_output[0]["summary_text"]

# ... and a longer bullet-style summary (max_length=90).
bullets_output = summ(f"summarize in 3 bullet points: {text}", max_length=90, num_beams=4)
bullets = bullets_output[0]["summary_text"]

print(headline, "\n", bullets)

## 4) Long articles (chunk + combine)

In [None]:
def chunk(xs, n=512, chars_per_token=3):
    """Split a long text into pieces sized for a model with an ~n-token context.

    Token counts are approximated by characters: each piece holds at most
    ``n * chars_per_token`` characters (1536 with the defaults, close to the
    previous fixed 1500). Unlike plain slicing, pieces break at whitespace, so
    words are not cut in half; only a single word longer than the limit is split.

    Args:
        xs: The text to split.
        n: Approximate token budget per piece (T5-small context is ~512 tokens).
        chars_per_token: Rough characters-per-token ratio used to turn ``n``
            into a character limit.

    Returns:
        A list of text pieces in original order; empty for empty or
        whitespace-only input. Runs of whitespace (including newlines) are
        collapsed to single spaces and dropped at piece boundaries.
    """
    import textwrap  # local import so this cell runs on its own

    max_chars = max(1, int(n * chars_per_token))  # textwrap requires a positive width
    return textwrap.wrap(xs, width=max_chars, break_long_words=True, break_on_hyphens=False)

# Stage 1: summarize every chunk of the article on its own.
partials = []
for piece in chunk(text):
    piece_output = summ("summarize: " + piece, max_length=80, num_beams=4)
    partials.append(piece_output[0]["summary_text"])

# Stage 2: summarize the space-joined partial summaries into one final summary.
combined = " ".join(partials)
final = summ("summarize: " + combined, max_length=120, num_beams=4)[0]["summary_text"]
print(final)

## 5) Model swaps (quick comparisons)

In [None]:
# Faster/smarter variants to try.
# The list is deliberately not called `models`: that name is already bound to
# `tensorflow.keras.models` by the imports cell, and rebinding it would silently shadow the module.
summarizer_checkpoints = [
  "t5-small",                 # baseline
  "google/flan-t5-small",     # better instruction-following
  "facebook/bart-large-cnn",  # strong for news
  "sshleifer/distilbart-cnn-12-6"  # faster BART
]
for checkpoint in summarizer_checkpoints:
    # A fresh pipeline per checkpoint; only the first 160 characters of each summary are shown.
    candidate = pipeline("summarization", model=checkpoint, framework="tf")
    summary = candidate(text, max_length=120, min_length=40, num_beams=4)[0]["summary_text"]
    print(checkpoint, "→", summary[:160], "…")

*(For very long docs, look at `allenai/led-base-16384` or `google/pegasus-cnn_dailymail`.)*

## 6) Quick sanity checks (length, redundancy, faithfulness)

In [None]:
# Beam-search summary with 3-gram repetition blocking; `s` is reused by the ROUGE cell below.
checked_output = summ(text, max_length=120, min_length=40, num_beams=4, no_repeat_ngram_size=3)
s = checked_output[0]["summary_text"]

n_chars = len(s)
n_periods = s.count(".")  # the number of periods stands in for the sentence count
print("chars:", n_chars, "| sentences:", n_periods)

Optional: compare two settings with a lightweight metric:

In [None]:
# pip install rouge-score
from rouge_score import rouge_scorer

# ROUGE-1 and ROUGE-L with stemming, scoring the summary `s` against the full article `text`.
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
r = scorer.score(text, s)

# Keep only the F-measure of each metric for a compact printout.
f_measures = {}
for metric_name, result in r.items():
    f_measures[metric_name] = result.fmeasure
print(f_measures)


## Appendix: Beam Search and Sampling in T5


Here’s what each argument is doing and how to pick values—first for beam search (deterministic), then for sampling (stochastic).

### Shared length controls

* `max_length=120`: Hard cap on output length in **tokenizer tokens**. Pick a ceiling that fits your UI or rubric; smaller = faster, safer; larger = risk of drift.
* `min_length=40`: Forces at least this many tokens before the model is allowed to stop. Use to avoid ultra-short summaries (raise if you see 1–2 sentence outputs).

### Anti-repetition

* `no_repeat_ngram_size=3`: Disallows any 3-gram from appearing twice. 2–4 are common; higher reduces loops but can overconstrain phrasing.

### Beam search block

* `num_beams=4`: How many hypotheses are explored in parallel. More beams (e.g., 6–8) → usually better coverage/faithfulness, but slower and sometimes more generic.
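* `length_penalty=1.0`: Exponent applied to the sequence length when each beam's (negative) log-probability score is normalized by length.

  * Higher values (e.g., `>1.0`) favor longer outputs → longer, more detailed.
  * Lower values (e.g., `<1.0`; negative values push hardest) favor shorter outputs → shorter, punchier.
  * Start at `1.0`; try `0.8–0.9` if outputs ramble; try `1.2–2.0` if they’re too terse.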

### Sampling block

* `do_sample=True`: Switches from greedy/beam decoding to probabilistic sampling.
* `temperature=0.8`: Scales logits before sampling.

  * `<1` (e.g., 0.7–0.9) = safer, crisper.
  * `>1` (e.g., 1.1–1.3) = more varied, more risk.
* `top_p=0.9` (nucleus sampling): Sample only from the smallest set of tokens whose cumulative probability ≥ 0.9. Lower to 0.8 to be safer; raise to 0.95 for more variety.
* `top_k=50`: Also restrict to the top-50 tokens by probability. When both `top_p` and `top_k` are set, the candidate set is the intersection—this adds an extra safety brake. Typical `top_k` range: 40–100.

### Quick recipes

* **Max faithfulness / less drift (deterministic):** `num_beams=6–8`, `length_penalty=1.0–1.2`, keep `no_repeat_ngram_size=3`, tune `min_length`.
* **Balanced variety (sampling):** `temperature=0.8–0.9`, `top_p=0.9`, `top_k=50–80`, keep `no_repeat_ngram_size=3`.
* **More creative / riskier:** `temperature≈1.0–1.2`, `top_p=0.92–0.95`, `top_k=0 or 100+` (or drop `top_k`), possibly lower `min_length`.

### Troubleshooting heuristics

* Too short → raise `min_length` or raise `length_penalty` (higher values favor longer beams).
* Repetitive → increase `no_repeat_ngram_size` or lower `top_p`/`temperature`.
* Generic/boring (beam) → try sampling block or reduce `num_beams`.
* Off-topic (sampling) → lower `temperature`, lower `top_p`, keep `top_k` moderate.

That’s it—tune length first, then repetition, then exploration (beams or sampling) depending on faithfulness vs. variety.


* `num_beams`: 1–6 (higher = slower, usually better)
* `no_repeat_ngram_size`: 2–4 to curb repetition
* `length_penalty`: higher values favor longer outputs, lower (or negative) values favor shorter ones; `1.0` is the default
* `do_sample` + `top_p`/`top_k`/`temperature`: enable stochastic summaries

---

## 2) Style controls (T5 follows instructions well)

```python
def t5(prompt): return summ(f"{prompt} {text}", max_length=80, min_length=20, num_beams=4)[0]["summary_text"]

print(t5("summarize:"))                          # default
print(t5("summarize in 1 sentence:"))
print(t5("summarize in 3 bullet points:"))       # will often produce bullets
print(t5("write a headline:"))
print(t5("summarize for a 9th-grade reader:"))
```

---

## 3) Headline + bullets in one go

```python
headline = summ("write a headline: " + text, max_length=20, num_beams=4)[0]["summary_text"]
bullets  = summ("summarize in 3 bullet points: " + text, max_length=90, num_beams=4)[0]["summary_text"]
print(headline, "\n", bullets)
```

---

## 4) Long articles (chunk + combine)

```python
def chunk(xs, n=512):  # T5-small context is ~512 tokens; rough char fallback
    step = 1500
    return [xs[i:i+step] for i in range(0, len(xs), step)]

partials = [summ("summarize: " + c, max_length=80, num_beams=4)[0]["summary_text"]
            for c in chunk(text)]
final = summ("summarize: " + " ".join(partials), max_length=120, num_beams=4)[0]["summary_text"]
print(final)
```

---

## 5) Model swaps (quick comparisons)

```python
# Faster/smarter variants to try:
models = [
  "t5-small",                 # baseline
  "google/flan-t5-small",     # better instruction-following
  "facebook/bart-large-cnn",  # strong for news
  "sshleifer/distilbart-cnn-12-6"  # faster BART
]
for m in models:
    p = pipeline("summarization", model=m, framework="tf")
    print(m, "→", p(text, max_length=120, min_length=40, num_beams=4)[0]["summary_text"][:160], "…")
```

*(For very long docs, look at `allenai/led-base-16384` or `google/pegasus-cnn_dailymail`.)*

---

## 6) Quick sanity checks (length, redundancy, faithfulness)

```python
s = summ(text, max_length=120, min_length=40, num_beams=4,
         no_repeat_ngram_size=3)[0]["summary_text"]
print("chars:", len(s), "| sentences:", s.count("."))
```

Optional: compare two settings with a lightweight metric:

```python
# pip install rouge-score
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(["rouge1","rougeL"], use_stemmer=True)
r = scorer.score(text, s)
print({k: v.fmeasure for k,v in r.items()})
```

---

**TODO:** Wrap these into a “Summarization Lab” section for the transformer notebook (with prompts students can run and answer).