**Importing libraries and configurations**

In [9]:

import os, math, random, numpy as  np, torch
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, pipeline)



SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

model_id   = "gpt2"          # or "distilgpt2" for faster runs
out_dir    = "runs/jokes_gpt2"  #output directory of runs
block_size = 256             # reduce if you hit OOM (e.g., 128/192)
train_bs   = 16              # reduce if OOM (8)
eval_bs    = 16
epochs     = 3
lr         = 5e-5
use_fp16   = torch.cuda.is_available()  # safe on your 4070

os.makedirs(out_dir, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
device

'cuda'


**Load dataset of Reddit jokes & create a single text fields**

In [10]:
from datasets import load_dataset, DatasetDict

# Load the dataset (big Reddit jokes dump)
ds = load_dataset("SocialGrep/one-million-reddit-jokes", split="train")

# Merge title + body into one "text" field
def merge(rec):
    title = rec.get("title") or ""
    body = rec.get("body") or ""
    txt = (title + "\n" + body).strip()
    return {"text": txt}

# Apply the merge and clean up
ds = ds.map(merge, remove_columns=ds.column_names)
ds = ds.filter(lambda x: isinstance(x["text"], str) and len(x["text"]) > 20)

# 90/10 train/val split
split = ds.train_test_split(test_size=0.1, seed=42)
ds = DatasetDict({"train": split["train"], "validation": split["test"]})

print(ds)
print(ds["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 812593
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 90289
    })
})
{'text': "What do you call a white supremacist that doesn't eat meat?"}


**Tokenizer BPE & Tokenization**

In [11]:

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token  # GPT-2 has no pad token by default

def tokenize(batch):
    return tok(batch["text"], truncation=True, max_length=block_size)

tokenized = ds.map(tokenize, batched=True, remove_columns=ds["train"].column_names)
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 812593
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 90289
    })
})

**Load model + collator**

In [12]:
model = AutoModelForCausalLM.from_pretrained(model_id)
model.resize_token_embeddings(len(tok))  # in case PAD was added

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

**Training setup + fine-tuning**

In [17]:
args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    gradient_accumulation_steps=1,
    num_train_epochs=epochs,
    learning_rate=lr,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    eval_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    fp16=use_fp16,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    seed=SEED
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=collator,
    tokenizer=tok,
)

trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,3.7637,3.955393
1000,3.6495,3.809327
1500,3.5135,3.71668
2000,3.4653,3.649855
2500,3.4629,3.601258
3000,3.4403,3.583766
3500,3.3741,3.551235
4000,3.3732,3.530006
4500,3.378,3.506742
5000,3.3163,3.495456


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=101575, training_loss=3.0503543618850277, metrics={'train_runtime': 33762.6425, 'train_samples_per_second': 24.068, 'train_steps_per_second': 3.009, 'total_flos': 9771230898432000.0, 'train_loss': 3.0503543618850277, 'epoch': 1.0})

**Evaluate → Perplexity**

In [18]:
eval_res = trainer.evaluate()
ppl = math.exp(eval_res["eval_loss"])
print(eval_res)
print(f"Perplexity: {ppl:.2f}")

{'eval_loss': 3.1083250045776367, 'eval_runtime': 137.2822, 'eval_samples_per_second': 657.689, 'eval_steps_per_second': 82.218, 'epoch': 1.0}
Perplexity: 22.38


**Save checkpoint**

In [19]:
trainer.save_model(out_dir)       # saves config + tokenizer + model weights
tok.save_pretrained(out_dir)

torch.save(model.state_dict(), os.path.join(out_dir, "pytorch_model_weights_only.pt"))
print("Saved to:", out_dir)

Saved to: runs/jokes_gpt2


**Generation helper (sampling: temperature / top-k / top-p)**

**Samples**

In [28]:

samples_path = os.path.join(out_dir, "samples.txt")

with open(samples_path, "w", encoding="utf-8") as f:
    prompts = [
        "Why did the chicken cross the road?",
        "Write a one-line dad joke about GPUs:",
        "A software engineer and a hardware engineer walk into a bar and"
    ]
    for p in prompts:
        outs = gen(p, do_sample=True, top_k=50, top_p=0.9, temperature=0.9, max_new_tokens=80, num_return_sequences=3,
                   pad_token_id=tok.eos_token_id)
        f.write(f"\n\n# Prompt: {p}\n")
        for i, o in enumerate(outs, 1):
            f.write(f"\n[{i}] {o['generated_text']}\n")
samples_path

'runs/jokes_gpt2\\samples.txt'

In [None]:


import os, re, glob, torch, random
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

random.seed(42); torch.manual_seed(42)

# 1) pick weights (best checkpoint if present)
out_dir = globals().get("out_dir", "runs/jokes_gpt2")
model_path = getattr(globals().get("trainer", None), "state", None)
model_path = getattr(model_path, "best_model_checkpoint", None) or \
             (sorted(glob.glob(os.path.join(out_dir, "checkpoint-*")), key=os.path.getmtime)[-1]
              if glob.glob(os.path.join(out_dir, "checkpoint-*")) else out_dir)

# 2) load model/tokenizer (GPU if available)
tok = globals().get("tokenizer") or globals().get("tok") or AutoTokenizer.from_pretrained(model_path)
if tok.pad_token is None: tok.pad_token = tok.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_path).to(device).eval()

# 3) cleaning + scoring
TAG = re.compile(r"\[(?:NSFW|NSFL|Long|OC|Serious|Spoiler|Mature|Political)\]", re.I)
URL = re.compile(r"https?://\S+")
def clean(s: str) -> str:
    s = TAG.sub("", URL.sub("", s)).replace("�","").strip()
    return s.split("\n", 1)[0].strip()

def score(s: str) -> float:
    L = len(s)
    return -abs(L-90) + (10 if s.endswith(('.', '!', '?', '…')) else 0)

# 4) hard block bad words/tokens
bad_words = ["NSFW","NSFL","x-post","crosspost","subreddit","r/","/r/","upvote","downvote","mod","OC","Meta"]
bad_ids = [tok.encode(w, add_special_tokens=False) for w in bad_words]
bad_ids = [ids for ids in bad_ids if len(ids) > 0]
nl_id = tok.encode("\n", add_special_tokens=False)[0]

def gen_one_topic(topic: str, attempts: int = 10):
    system = "Write a clean, witty ONE-LINER joke. Avoid profanity or adult content."
    prompt = f"{system}\nTopic: {topic}\nJoke:"
    inputs = tok(prompt, return_tensors="pt").to(device)

    cands = []
    for _ in range(attempts):
        out_ids = model.generate(
            **inputs,
            do_sample=True, temperature=0.8, top_k=50, top_p=0.9,
            repetition_penalty=1.2, no_repeat_ngram_size=4,
            max_new_tokens=32,
            bad_words_ids=bad_ids,
            eos_token_id=[tok.eos_token_id, nl_id],  # stop at newline
        )
        new_ids = out_ids[0, inputs["input_ids"].shape[1]:]
        text = tok.decode(new_ids, skip_special_tokens=True)
        text = clean(text)
        if 12 <= len(text) <= 160 and not any(tag in text.lower() for tag in ["nsfw","nsfl","r/","x-post"]):
            cands.append(text)

    cands = sorted(set(cands), key=score, reverse=True)
    return cands[:3] or ["(try another topic)"]

# ---- Simple Gradio app (wider output, no scrolling) ----
def ui_fn(topic):
    jokes = gen_one_topic(topic, attempts=12)
    return "\n".join(f"- {j}" for j in jokes)

css = """
#out_md { min-height: 320px; font-size: 1.05rem; }
.gradio-container { max-width: 980px !important; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# Fine-tuned Joke GPT-2\n\nClean one-liner jokes. Try a topic!")
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Topic", placeholder="GPUs, exams, roommates...", lines=1)
            submit = gr.Button("Submit", variant="primary")
            clear = gr.Button("Clear")
        with gr.Column(scale=2):
            out = gr.Markdown(label="Output", elem_id="out_md")
    submit.click(ui_fn, inputs=inp, outputs=out)
    clear.click(lambda: ("", ""), None, [inp, out])

demo.launch()


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


