In [1]:
# Cell 1: Install required packages (run once)
!pip install -q transformers datasets evaluate rouge_score sentencepiece accelerate


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [3]:
# Cell 2: Mount Google Drive and set dataset paths
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define root directory (your files are directly in MyDrive)
DRIVE_ROOT = '/content/drive/MyDrive'

# Dataset paths
TRAIN_CSV = os.path.join(DRIVE_ROOT, 'train.csv')
VAL_CSV = os.path.join(DRIVE_ROOT, 'validation.csv')
TEST_CSV = os.path.join(DRIVE_ROOT, 'test.csv')

# Check if files exist
print("TRAIN path:", TRAIN_CSV)
print("Exists:", os.path.exists(TRAIN_CSV))
print("VAL exists:", os.path.exists(VAL_CSV))
print("TEST exists:", os.path.exists(TEST_CSV))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
TRAIN path: /content/drive/MyDrive/train.csv
Exists: True
VAL exists: True
TEST exists: True


In [4]:
# Cell 3: Load CSV files into pandas and limit size for quick experiments
import pandas as pd

# CHANGE these limits if you want more/less data
LIMIT_TRAIN = 2000   # set <= 3902 as you mentioned train.csv has 3902 rows
LIMIT_VAL   = 500
LIMIT_TEST  = 500

train_df = pd.read_csv(TRAIN_CSV).iloc[:LIMIT_TRAIN].reset_index(drop=True)
val_df   = pd.read_csv(VAL_CSV).iloc[:LIMIT_VAL].reset_index(drop=True)
test_df  = pd.read_csv(TEST_CSV).iloc[:LIMIT_TEST].reset_index(drop=True)

print("Train:", len(train_df), " Val:", len(val_df), " Test:", len(test_df))
print("Columns in train:", train_df.columns.tolist())
train_df.head(2)


Train: 2000  Val: 500  Test: 500
Columns in train: ['id', 'article', 'highlights']


Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


In [5]:
# Cell 4: Standardize column names to 'text' and 'summary'
# If your CSV already uses 'article' and 'highlights' this will rename them.
col_map = {}
if 'article' in train_df.columns:
    col_map['article'] = 'text'
if 'highlights' in train_df.columns:
    col_map['highlights'] = 'summary'
if 'id' in train_df.columns and 'text' not in train_df.columns:
    # some datasets may include 'article' only or 'text' already.
    pass

# apply renaming for all dfs
train_df = train_df.rename(columns=col_map)
val_df   = val_df.rename(columns=col_map)
test_df  = test_df.rename(columns=col_map)

# quick sanity check
print("Train columns now:", train_df.columns.tolist())
assert 'text' in train_df.columns and 'summary' in train_df.columns, "Expected columns 'text' and 'summary'"


Train columns now: ['id', 'text', 'summary']


In [6]:
# Cell 5: Convert to HF Dataset (keeps only 'text' and 'summary')
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df[['text','summary']].copy())
val_ds   = Dataset.from_pandas(val_df[['text','summary']].copy())
test_ds  = Dataset.from_pandas(test_df[['text','summary']].copy())

print(train_ds)
print(val_ds)
print(test_ds)


Dataset({
    features: ['text', 'summary'],
    num_rows: 2000
})
Dataset({
    features: ['text', 'summary'],
    num_rows: 500
})
Dataset({
    features: ['text', 'summary'],
    num_rows: 500
})


In [7]:
# Cell 6: Load T5 tokenizer and define preprocessing function
from transformers import AutoTokenizer

T5_MODEL = "t5-small"       # change to 't5-base' or larger if you have resources
t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)

MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128
PREFIX = "summarize: "      # T5 benefits from a task prefix

def preprocess_function_t5(examples):
    inputs = [PREFIX + doc for doc in examples['text']]
    model_inputs = t5_tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize targets (labels)
    with t5_tokenizer.as_target_tokenizer():
        labels = t5_tokenizer(examples['summary'], max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
# Cell 7: Tokenize the datasets for T5 (this may take a minute)
train_t5 = train_ds.map(preprocess_function_t5, batched=True, remove_columns=['text','summary'])
val_t5   = val_ds.map(preprocess_function_t5, batched=True, remove_columns=['text','summary'])
test_t5  = test_ds.map(preprocess_function_t5, batched=True, remove_columns=['text','summary'])

print(train_t5.features)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': List(Value('int64'))}


In [9]:
# Cell 8: Setup ROUGE metric and helper functions for evaluation
import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

def make_compute_metrics(tokenizer):
    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        # replace -100 in labels (Trainer uses -100) with pad_token_id for decoding
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        result = {k: round(v * 100, 4) for k, v in result.items()}
        prediction_lens = [np.count_nonzero(p != tokenizer.pad_token_id) for p in preds]
        result["gen_len"] = np.mean(prediction_lens)
        return result
    return compute_metrics


Downloading builder script: 0.00B [00:00, ?B/s]

In [12]:
# Cell 9: Prepare model, trainer helper, and training arguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import os

def create_trainer_t5(model_name, train_dataset, eval_dataset, tokenizer, output_dir, num_train_epochs=2):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # adjust batch sizes if you get OOM
    args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        predict_with_generate=True,
        logging_steps=100,
        num_train_epochs=num_train_epochs,
        fp16=True,                  # set True if GPU + compatible
        load_best_model_at_end=True,
        metric_for_best_model="rouge1",
        push_to_hub=False
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    compute_metrics = make_compute_metrics(tokenizer)

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    return trainer


In [13]:
# Cell 10: Fine-tune T5. WARNING: training takes time. Adjust num_train_epochs or limits for speed.
OUTPUT_DIR = "/content/drive/MyDrive/t5_summarization_small"  # saved to Drive
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create trainer
trainer = create_trainer_t5(
    model_name=T5_MODEL,
    train_dataset=train_t5,
    eval_dataset=val_t5,
    tokenizer=t5_tokenizer,
    output_dir=OUTPUT_DIR,
    num_train_epochs=2   # change to 1 for quick tests
)

# Start training (uncomment next line to run training)
trainer.train()
# After training finishes, the best model will be in OUTPUT_DIR (Trainer saves automatically).


  trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mf223435[0m ([33mf223435-fast-nuces[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.153,1.862644,24.9881,12.1074,20.8072,20.812,20.0
2,2.0526,1.860251,25.2384,12.2185,20.9541,20.9471,20.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2000, training_loss=2.070425079345703, metrics={'train_runtime': 444.6709, 'train_samples_per_second': 8.995, 'train_steps_per_second': 4.498, 'total_flos': 539654814892032.0, 'train_loss': 2.070425079345703, 'epoch': 2.0})

In [14]:
# Cell 11: Save the fine-tuned model and tokenizer to Drive (run after training completes)
trainer.save_model(OUTPUT_DIR)
t5_tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved to:", OUTPUT_DIR)


Saved to: /content/drive/MyDrive/t5_summarization_small


In [15]:
# Cell 12: Inference example. If you trained and saved model to OUTPUT_DIR it will use that; otherwise uses pre-trained T5.
from transformers import pipeline, AutoModelForSeq2SeqLM

# If you trained and saved model, use OUTPUT_DIR; else fallback to 't5-small'
model_path = OUTPUT_DIR if os.path.exists(os.path.join(OUTPUT_DIR, "pytorch_model.bin")) or os.path.exists(os.path.join(OUTPUT_DIR, "flax_model.msgpack")) else "t5-small"

summarizer = pipeline("summarization", model=model_path, tokenizer=t5_tokenizer, device=0 if __import__('torch').cuda.is_available() else -1)

sample_text = train_ds[0]['text']
print("ORIGINAL (first 1200 chars):\n", sample_text[:1200], "\n\nGENERATED SUMMARY:")
print(summarizer(sample_text[:1200], max_length=120, min_length=30, do_sample=False)[0]['summary_text'])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ORIGINAL (first 1200 chars):
 By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a c

In [16]:
# Cell 13: Evaluate the (trained or pretrained) model on the test set and print ROUGE scores.
import math
from transformers import AutoModelForSeq2SeqLM

# load model (trained if available otherwise pretrained t5-small)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.eval()
device = 0 if __import__('torch').cuda.is_available() else -1
if device == 0:
    model = model.to("cuda")

batch_size = 8
preds = []
refs = test_df['summary'].tolist()
texts = test_df['text'].tolist()

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    inputs = t5_tokenizer([PREFIX + t for t in batch_texts], return_tensors='pt', truncation=True, padding=True, max_length=MAX_INPUT_LENGTH)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    if device == 0:
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_TARGET_LENGTH)
    decoded = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    preds.extend(decoded)

# compute rouge
res = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
res = {k: round(v*100,4) for k,v in res.items()}
print("ROUGE results on test set:", res)


ROUGE results on test set: {'rouge1': np.float64(36.8966), 'rouge2': np.float64(15.8695), 'rougeL': np.float64(26.1984), 'rougeLsum': np.float64(31.9057)}


In [65]:
streamlit_code = r"""
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

st.title("T5 Summarization Demo")

MODEL_PATH = r"{model_path}"

@st.cache_resource
def load_summarizer(model_path):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        # Use GPU if available, else CPU
        device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline('summarization', model=model, tokenizer=tokenizer, device=device)
        return summarizer
    except Exception as e:
        st.error(f"Error loading model: {{e}}")
        return None

st.write("Using model:", MODEL_PATH)
st.write("Device:", "GPU" if torch.cuda.is_available() else "CPU")

summarizer = load_summarizer(MODEL_PATH)

if summarizer:
    text = st.text_area("Paste article text", height=300)

    col1, col2 = st.columns(2)
    with col1:
        max_length = st.slider("Max summary length", 50, 200, 120)
    with col2:
        min_length = st.slider("Min summary length", 10, 100, 30)

    if st.button("Summarize") and text.strip():
        with st.spinner("Generating summary..."):
            try:
                # Truncate to max tokens the model can handle
                summary = summarizer(text[:1024], max_length=max_length,
                                   min_length=min_length, do_sample=False)[0]['summary_text']
                st.subheader("Generated Summary")
                st.write(summary)
            except Exception as e:
                st.error(f"Error during summarization: {{e}}")
"""

# Write file with OUTPUT_DIR inserted
app_path = os.path.join(DRIVE_ROOT, "streamlit_t5_app.py")
with open(app_path, "w") as f:
    f.write(streamlit_code.format(model_path=OUTPUT_DIR))

print("✓ Streamlit app saved to:", app_path)
print("Run with: streamlit run", app_path)

✓ Streamlit app saved to: /content/drive/MyDrive/streamlit_t5_app.py
Run with: streamlit run /content/drive/MyDrive/streamlit_t5_app.py


In [66]:
!pip install streamlit pyngrok




In [64]:
# All Streamlit processes ko band karein
!pkill -f streamlit

# Ya phir specific port ko free karein
!fuser -k 8501/tcp

# Ab nayi app run karein

8501/tcp:            33320


In [68]:
from pyngrok import ngrok

# Ngrok ko kill karein
ngrok.kill()

# Thoda wait karein
import time
time.sleep(2)

# Phir se start karein
public_url = ngrok.connect(8501)
print("Streamlit app:", public_url)

Streamlit app: NgrokTunnel: "https://nutty-estrella-phthisical.ngrok-free.dev" -> "http://localhost:8501"


In [69]:
# Install required packages
!pip install streamlit pyngrok

# Run in background
!nohup streamlit run {app_path} --server.port 8501 &

# Create public URL with ngrok
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print("Streamlit app running at:", public_url)

nohup: appending output to 'nohup.out'
Streamlit app running at: NgrokTunnel: "https://nutty-estrella-phthisical.ngrok-free.dev" -> "http://localhost:8501"


In [45]:
from pyngrok import ngrok
ngrok.set_auth_token("35PfgOQEG4yXLBuZbnFMhCrUcSy_7uTeuWuKrqXbyttwW4jp")
