In [1]:
!pip install -q transformers accelerate sentencepiece requests


In [2]:
import requests
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)


In [5]:
import requests

url = "https://datasets-server.huggingface.co/rows"

params = {
    "dataset": "EdinburghNLP/xsum",  # ðŸ”´ INI YANG TADI SALAH
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 10
}

response = requests.get(url, params=params)
print(response.status_code)
print(response.json().keys())


200
dict_keys(['features', 'rows', 'num_rows_total', 'num_rows_per_page', 'partial'])


In [7]:
data = response.json()


In [8]:
rows = data["rows"]


In [9]:
documents = [r["row"]["document"] for r in rows]
summaries = [r["row"]["summary"] for r in rows]


In [10]:
print("DOCUMENT SAMPLE:\n", documents[0][:300])
print("\nREFERENCE SUMMARY:\n", summaries[0])


DOCUMENT SAMPLE:
 The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many

REFERENCE SUMMARY:
 Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.


In [11]:
data = response.json()["rows"]

print(len(data))
print(data[0]["row"].keys())


10
dict_keys(['document', 'summary', 'id'])


In [12]:
documents = [x["row"]["document"] for x in data]
summaries = [x["row"]["summary"] for x in data]

print("DOCUMENT SAMPLE:\n", documents[0][:300])
print("\nREFERENCE SUMMARY:\n", summaries[0])


DOCUMENT SAMPLE:
 The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many

REFERENCE SUMMARY:
 Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.


In [13]:
model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model ready on", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model ready on cuda


In [14]:
train_data = [
    {
        "input_text": "summarize: " + documents[i],
        "target_text": summaries[i]
    }
    for i in range(len(documents))
]

len(train_data)


10

In [15]:
def preprocess(example):
    inputs = tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    labels = tokenizer(
        example["target_text"],
        truncation=True,
        padding="max_length",
        max_length=64
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_data = [preprocess(x) for x in train_data]
print(tokenized_data[0].keys())


KeysView({'input_ids': [21603, 10, 37, 423, 583, 13, 1783, 16, 20126, 16496, 6, 80, 13, 8, 844, 6025, 4161, 6, 19, 341, 271, 14841, 5, 7057, 161, 19, 4912, 16, 1626, 5981, 11, 186, 7540, 16, 1276, 15, 2296, 7, 5718, 2367, 14621, 4161, 57, 4125, 387, 5, 15059, 7, 30, 8, 4653, 4939, 711, 747, 522, 17879, 788, 12, 1783, 44, 8, 15763, 6029, 1813, 9, 7472, 5, 1404, 1623, 11, 5699, 277, 130, 4161, 57, 18368, 16, 20126, 16496, 227, 8, 2473, 5895, 15, 147, 89, 22411, 139, 8, 1511, 5, 1485, 3271, 3, 21926, 9, 472, 19623, 5251, 8, 616, 12, 15614, 8, 1783, 5, 37, 13818, 10564, 15, 26, 3, 9, 3, 19513, 1481, 6, 18368, 186, 1328, 2605, 30, 7488, 1887, 3, 18, 8, 711, 2309, 9517, 89, 355, 5, 3966, 1954, 9233, 15, 6, 113, 293, 7, 8, 16548, 13363, 106, 14022, 84, 47, 14621, 4161, 6, 243, 255, 228, 59, 7828, 8, 1249, 18, 545, 11298, 1773, 728, 8, 8347, 1560, 5, 611, 6, 255, 243, 72, 1709, 1528, 161, 228, 43, 118, 4006, 91, 12, 766, 8, 3, 19513, 1481, 410, 59, 5124, 5, 96, 196, 17, 19, 1256, 68, 27, 103, 

In [16]:
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.data[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.data[idx]["attention_mask"]),
            "labels": torch.tensor(self.data[idx]["labels"]),
        }

train_dataset = SimpleDataset(tokenized_data)
len(train_dataset)


10

In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=1,
    fp16=False,
    logging_steps=5,
    save_steps=50,
    report_to="none"
)


In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

print("Trainer siap")


Trainer siap


  trainer = Trainer(


In [19]:
trainer.train()


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
5,27.21


TrainOutput(global_step=5, training_loss=27.209988403320313, metrics={'train_runtime': 12.817, 'train_samples_per_second': 0.78, 'train_steps_per_second': 0.39, 'total_flos': 1858905047040.0, 'train_loss': 27.209988403320313, 'epoch': 1.0})

In [20]:
test_text = documents[1]

inputs = tokenizer(
    "summarize: " + test_text,
    return_tensors="pt",
    truncation=True
).to(device)

output = model.generate(**inputs, max_new_tokens=80)

print(tokenizer.decode(output[0], skip_special_tokens=True))


A fire has killed two tour groups in Northern Ireland, according to a man who was a teenager.
