In [3]:
from datasets import Dataset
import json

# Load your dataset (ensure you have the correct path)
def load_json_data(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]
    return data

# Load the train data
train_data = load_json_data('/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_metrics_np_summary_prompt_category.json')

# Convert list of dictionaries to a dictionary of lists for the dataset format
def convert_to_dict_of_lists(data):
    dict_data = {}
    for entry in data:
        for key, value in entry.items():
            if key not in dict_data:
                dict_data[key] = []
            dict_data[key].append(value)
    return dict_data

train_dict = convert_to_dict_of_lists(train_data)

# Create the dataset from the converted dictionary
train_dataset = Dataset.from_dict(train_dict)

# Optionally print the dataset to verify
print(train_dataset)


Dataset({
    features: ['input', 'input_metrics', 'summary', 'id', 'prompt', 'input_noprompt'],
    num_rows: 286
})


In [6]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset

import evaluate
import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version

os.environ["NCCL_DEBUG"] = "INFO"

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
#check_min_version("4.25.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizer which require lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    summary_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id."
                "Useful for multilingual models like mBART where the first generated token"
                "needs to be the target language token (Usually it is the target language token)"
            )
        },
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


summarization_name_mapping = {
    "amazon_reviews_multi": ("review_body", "review_title"),
    "big_patent": ("description", "abstract"),
    "cnn_dailymail": ("article", "highlights"),
    "orange_sum": ("text", "summary"),
    "pn_summary": ("article", "summary"),
    "psc": ("extract_text", "summary_text"),
    "samsum": ("dialogue", "summary"),
    "thaisum": ("body", "summary"),
    "xglue": ("news_body", "news_title"),
    "xsum": ("document", "summary"),
    "wiki_summary": ("article", "highlights"),
    "multi_news": ("document", "summary"),
}



In [12]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [3]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)
 

In [4]:
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = PeftModel.from_pretrained(base_model, "RMWeerasinghe/flan-t5-base-prompt_tuning-cnn-dailymail")

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade
# install git-fls for pushing model and logs to the hugging face hub

Collecting py7zr
  Using cached py7zr-0.21.1-py3-none-any.whl (67 kB)
  Using cached py7zr-0.21.0-py3-none-any.whl (67 kB)
  Using cached py7zr-0.20.8-py3-none-any.whl (67 kB)
  Using cached py7zr-0.20.7-py3-none-any.whl (66 kB)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load the model and tokenizer
model_id = "google/flan-t5-small"  # You can change this to a larger model if needed
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Step 2: Preprocess the dataset
# Load your dataset
dataset_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_modified.json"

import json

with open(dataset_path, 'r') as file:
    data = [json.loads(line) for line in file]

# Convert data into a HuggingFace dataset
data = [{'input': item['input'], 'output': item['output'], 'prompt': item['prompt']} for item in data]
dataset = Dataset.from_dict({
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data],
    "prompt": [item["prompt"] for item in data],
})

# Split the dataset into training and evaluation sets
train_dataset, eval_dataset = train_test_split(dataset, test_size=0.2)

# Step 3: Tokenize the inputs and outputs
def preprocess_function(examples):
    # Tokenizing input and output
    inputs = tokenizer(examples['input'], max_length=512, padding='max_length', truncation=True)
    outputs = tokenizer(examples['output'], max_length=128, padding='max_length', truncation=True)

    # Add labels to the input
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply the tokenization to the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Step 4: Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_finetuned",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",  # Optional: directory for storing logs
)

# Step 5: Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)

# Step 6: Train the model
trainer.train()

# Optionally, save the fine-tuned model
trainer.save_model("./flan_t5_finetuned_model")


Downloading model.safetensors: 100%|██████████| 308M/308M [00:16<00:00, 18.5MB/s] 


AttributeError: 'dict' object has no attribute 'map'

In [None]:
from huggingface_hub import notebook_login
 
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_modified.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (prompt + input as the input text)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset:
        # Concatenate the prompt and input to form the input text
        input_text = entry["prompt"] + entry["input"]
        output_text = entry["output"]  # Target output for the model
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-1b5a05f9096b368e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 499.98it/s]


In [None]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_modified.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (prompt + input as the input text)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset:
        # Concatenate the prompt and input to form the input text
        input_text = entry["prompt"] + entry["input"]
        output_text = entry["output"]  # Target output for the model
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

# Format the dataset
train_data = format_for_finetuning(dataset['train'])

# Save the formatted dataset to a new JSON file in the same directory
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

with open(formatted_data_path, "w") as outfile:
    json.dump(train_data, outfile, indent=4)

print(f"Formatted dataset saved to: {formatted_data_path}")


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-1b5a05f9096b368e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 497.31it/s]

Formatted dataset saved to: /home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json





In [None]:
import json

# Path to the newly saved formatted dataset
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load the formatted dataset from the file
with open(formatted_data_path, "r") as infile:
    formatted_data = json.load(infile)

# Print the first few entries to preview the formatted dataset
for i, entry in enumerate(formatted_data[:5]):  # Preview first 5 entries
    print(f"Entry {i+1}:")
    print(f"Input: {entry['input']}")
    print(f"Output: {entry['output']}")
    print("-" * 50)


Entry 1:
Input: Write highlights for this article for a middle school student:

(CNN) -- Six people, including three children, were killed when their plane crashed into rugged mountains east of Phoenix on a Thanksgiving eve trip, authorities said Thursday. Pinal County Sheriff Paul Babeu identified the dead as a father and his three children, plus two other men. All were Arizona residents and knew each other well, he said. The twin-engine Rockwell 690A airplane slammed into a steep cliff near the top of the Superstition Mountains at 6:31 p.m. on Wednesday, strewing debris for hundreds of yards down the 80-degree incline, Babeu told reporters. Recovery efforts by more than 50 deputies and volunteers involved collect and preserving the remains, the sheriff said. "No one could survive that crash," he said. The plane had flown from Safford to Mesa, where it picked up the siblings -- ages 9, 8 and 6 -- for the Thanksgiving holiday, Babeu said. "All of these families are just obviously heart

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7fc621c6-5d06-451f-9781-f26858644263)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-base/resolve/main/spiece.model


In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [2]:
from datasets import load_dataset

# Path to your dataset
train_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load dataset from the local file
custom_dataset = load_dataset("json", data_files={"train": train_data_path})

# Check dataset sizes
print(f"Train dataset size: {len(custom_dataset['train'])}")

# Preview a random sample
from random import randrange

sample = custom_dataset['train'][randrange(len(custom_dataset['train']))]
print(f"Input (Prompt + Article): \n{sample['input']}\n---------------")
print(f"Output (Summary): \n{sample['output']}\n---------------")


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-f3b513332d46f951/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Train dataset size: 286
Input (Prompt + Article): 
Write highlights for this article for a middle school student:

By . Brian Marjoribanks . Ally McCoist insists Sir David Murray is owed an apology from people both ‘in and out of football’ after the ‘needless’ collapse of Rangers in 2012. Last week Murray pointed the finger at the taxman after a judge kicked out HMRC’ s appeal in their long-running case against the Ibrox club. The former owner and chairman said the so-called Big Tax Case case had put off potential buyers and led to Craig Whyte’s disastrous takeover, while fans groups have already called for the SFA, SPFL and other club owners to be held to account over the decision to relegate Rangers when they went bust. Leader: Rangers boss Ally McCoist with players Lee McCulloch (left) and Richard Foster . Still got it: McCoist kicks a ball around during training in California . On Tuesday McCoist said he would ‘never forget’ the role played by leading figures in and around the Scot

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets, load_dataset

# Model ID for FLAN-T5
model_id = "google/flan-t5-small"

# Load tokenizer for FLAN-T5
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Path to the formatted dataset
train_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load the formatted dataset
data_files = {"train": train_path}
print("Loading dataset...")
dataset = load_dataset("json", data_files=data_files)

# Compute maximum input and output lengths for efficient batching
print("Calculating max input and target lengths...")

# Tokenize the concatenated dataset for inputs
tokenized_inputs = dataset["train"].map(
    lambda x: tokenizer(x["input"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Tokenize the concatenated dataset for targets
tokenized_targets = dataset["train"].map(
    lambda x: tokenizer(x["output"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

# Preprocessing function
def preprocess_function(sample, padding="max_length"):
    # Prepare inputs with a task prefix (e.g., "summarize:")
    inputs = ["summarize: " + item for item in sample["input"]]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True)

    # Replace padding token ID with -100 to ignore during loss computation
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Loading tokenizer...
Loading dataset...


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-f3b513332d46f951/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Calculating max input and target lengths...


Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Max target length: 186
Tokenizing dataset...


Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [7]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# Load FLAN-T5 model
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Download necessary NLTK resources
nltk.download("punkt")

# Load ROUGE metric
metric = evaluate.load("rouge")

# Helper function to postprocess text for evaluation
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects newlines after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

# Compute metrics function for evaluation
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Define a DataCollator for Seq2Seq tasks
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length", label_pad_token_id=-100)

# Print confirmation
print("Model, tokenizer, metric, and data collator loaded successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/surenoobster/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model, tokenizer, metric, and data collator loaded successfully.


In [10]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_base_output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    evaluation_strategy="no",
    report_to=["tensorboard"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)

# Free memory before training
import torch
torch.cuda.empty_cache()

trainer.train()
trainer.save_model("./final_model")


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 5.77 GiB total capacity; 5.50 GiB already allocated; 11.75 MiB free; 5.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [16]:
pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [11]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
else:
    print("GPU is not available!")


GPU is available!
Device Name: NVIDIA GeForce RTX 4050 Laptop GPU
CUDA Device Count: 1
