In [1]:
!pip install datasets



In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import DataLoader, Dataset

In [3]:
# Load the dataset
dataset_path = "/content/Corpus_all.csv"
try:
    data = pd.read_csv(dataset_path, encoding="utf-8")
except UnicodeDecodeError:
    data = pd.read_csv(dataset_path, encoding="ISO-8859-1")

In [4]:
print(data.head())

                                            Document  \
0  412.106.1 English is not an official language ...   
1  944.021.1 English is not an official language ...   
2  Title and commencement 1 This order may be cit...   
3  Citation and commencement 1 This order may be ...   
4  Title, commencement and interpretation 1 1 Thi...   

                                             Summary  
0  The Swiss Federal University for Vocational Ed...  
1  The EAER Ordinance on the Declaration for Timb...  
2  The North West Water Authority (Solway Firth) ...  
3  The Trafford Park Development Corporation (Are...  
4  The North West Water Authority (Returns of Eel...  


In [5]:
# Extract documents and summaries
data = data.dropna(subset=["Document", "Summary"])  # Ensure no missing values
data.reset_index(drop=True, inplace=True)

input_texts = data["Document"].tolist()
target_texts = data["Summary"].tolist()

In [6]:
print(input_texts[0])
print(target_texts[0])

412.106.1 English is not an official language of the Swiss Confederation. This translation is provided for information purposes only, has no legal force and may not be relied on in legal proceedings. Ordinance on the Swiss Federal University for Vocational Education and Training(SFUVET Ordinance)of 18 June 2021 (Status as of 1 August 2021)The Swiss Federal Council,on the basis of Article 35 of the SFUVET Act of 25 September 20201,ordains:1 SR 412.106Art. 1 Registered location The Swiss Federal University for Vocational Education and Training (SFUVET) shall be based in Zollikofen.Art. 2 Regional campuses SFUVET shall offer its services through three regional campuses: one in the German-speaking region, one in the French-speaking region and one in the Italian-speaking region of Switzerland.Art. 3 Federal Council's strategic objectives The Federal Department of Economic Affairs, Education and Research (EAER) shall submit SFUVET's strategic objectives drafted by the Federal Council to the 

In [7]:
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Define the custom dataset class
class CustomDataset(Dataset):
    def __init__(self, tokenizer, input_texts, target_texts, max_input_length=512, max_target_length=128):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_encoding = self.tokenizer(
            self.input_texts[idx],
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            self.target_texts[idx],
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze(),
        }



In [9]:
# Prepare the dataset and dataloader
dataset = CustomDataset(tokenizer, input_texts, target_texts)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Set up training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)



In [26]:
# Training loop
from tqdm import tqdm

epochs = 50
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(dataloader)}")

100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 1/50, Loss: 0.5971155092120171


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 2/50, Loss: 0.5570804625749588


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 3/50, Loss: 0.5687284097075462


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 4/50, Loss: 0.5516206175088882


100%|██████████| 4/4 [00:02<00:00,  1.40it/s]


Epoch 5/50, Loss: 0.5512058585882187


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Epoch 6/50, Loss: 0.568045124411583


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


Epoch 7/50, Loss: 0.5512906014919281


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


Epoch 8/50, Loss: 0.5341450348496437


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Epoch 9/50, Loss: 0.5396833419799805


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


Epoch 10/50, Loss: 0.5248352885246277


100%|██████████| 4/4 [00:02<00:00,  1.40it/s]


Epoch 11/50, Loss: 0.5292097255587578


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 12/50, Loss: 0.5038350224494934


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 13/50, Loss: 0.5366717278957367


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 14/50, Loss: 0.5139552280306816


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 15/50, Loss: 0.5003674253821373


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 16/50, Loss: 0.4981839954853058


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 17/50, Loss: 0.5173837020993233


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 18/50, Loss: 0.4758581519126892


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 19/50, Loss: 0.48524101823568344


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 20/50, Loss: 0.4832470044493675


100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


Epoch 21/50, Loss: 0.460527628660202


100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


Epoch 22/50, Loss: 0.4704485759139061


100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


Epoch 23/50, Loss: 0.45412173867225647


100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


Epoch 24/50, Loss: 0.4613063260912895


100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


Epoch 25/50, Loss: 0.46500978618860245


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 26/50, Loss: 0.47520340979099274


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 27/50, Loss: 0.4539526626467705


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 28/50, Loss: 0.4499318301677704


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 29/50, Loss: 0.4426821395754814


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 30/50, Loss: 0.4326743558049202


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 31/50, Loss: 0.43978531658649445


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 32/50, Loss: 0.43243376165628433


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 33/50, Loss: 0.4322118014097214


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 34/50, Loss: 0.43673650175333023


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 35/50, Loss: 0.41552528738975525


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 36/50, Loss: 0.427302710711956


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 37/50, Loss: 0.397468626499176


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 38/50, Loss: 0.4234248846769333


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 39/50, Loss: 0.41360220313072205


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 40/50, Loss: 0.39866943657398224


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 41/50, Loss: 0.405057817697525


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 42/50, Loss: 0.3879104405641556


100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Epoch 43/50, Loss: 0.3792262151837349


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 44/50, Loss: 0.3860490992665291


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 45/50, Loss: 0.3839370757341385


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 46/50, Loss: 0.38085732609033585


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 47/50, Loss: 0.38162772357463837


100%|██████████| 4/4 [00:02<00:00,  1.42it/s]


Epoch 48/50, Loss: 0.3881320059299469


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]


Epoch 49/50, Loss: 0.3768928050994873


100%|██████████| 4/4 [00:02<00:00,  1.43it/s]

Epoch 50/50, Loss: 0.3801989331841469





In [27]:
# Evaluation
def evaluate_model(dataset, tokenizer, model):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=16)
    predictions = []
    references = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=128)
            predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
            references.extend([tokenizer.decode(ref, skip_special_tokens=True) for ref in batch["labels"]])

    return predictions, references

In [28]:
# Run evaluation
predictions, references = evaluate_model(dataset, tokenizer, model)

Evaluating: 100%|██████████| 4/4 [00:07<00:00,  1.90s/it]


In [29]:
# Calculate BLEU Score
from nltk.translate.bleu_score import corpus_bleu

def calculate_bleu(predictions, references):
    tokenized_predictions = [pred.split() for pred in predictions]
    tokenized_references = [[ref.split()] for ref in references]
    return corpus_bleu(tokenized_references, tokenized_predictions)

bleu_score = calculate_bleu(predictions, references)
print(f"BLEU Score: {bleu_score}")

BLEU Score: 0.700581421159652
