In [3]:
from datasets import Dataset
import json

# Load your dataset (ensure you have the correct path)
def load_json_data(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]
    return data

# Load the train data
train_data = load_json_data('/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_metrics_np_summary_prompt_category.json')

# Convert list of dictionaries to a dictionary of lists for the dataset format
def convert_to_dict_of_lists(data):
    dict_data = {}
    for entry in data:
        for key, value in entry.items():
            if key not in dict_data:
                dict_data[key] = []
            dict_data[key].append(value)
    return dict_data

train_dict = convert_to_dict_of_lists(train_data)

# Create the dataset from the converted dictionary
train_dataset = Dataset.from_dict(train_dict)

# Optionally print the dataset to verify
print(train_dataset)


Dataset({
    features: ['input', 'input_metrics', 'summary', 'id', 'prompt', 'input_noprompt'],
    num_rows: 286
})


In [6]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset

import evaluate
import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version

os.environ["NCCL_DEBUG"] = "INFO"

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
#check_min_version("4.25.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizer which require lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    summary_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id."
                "Useful for multilingual models like mBART where the first generated token"
                "needs to be the target language token (Usually it is the target language token)"
            )
        },
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


summarization_name_mapping = {
    "amazon_reviews_multi": ("review_body", "review_title"),
    "big_patent": ("description", "abstract"),
    "cnn_dailymail": ("article", "highlights"),
    "orange_sum": ("text", "summary"),
    "pn_summary": ("article", "summary"),
    "psc": ("extract_text", "summary_text"),
    "samsum": ("dialogue", "summary"),
    "thaisum": ("body", "summary"),
    "xglue": ("news_body", "news_title"),
    "xsum": ("document", "summary"),
    "wiki_summary": ("article", "highlights"),
    "multi_news": ("document", "summary"),
}



In [6]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [3]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)
 

In [4]:
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = PeftModel.from_pretrained(base_model, "RMWeerasinghe/flan-t5-base-prompt_tuning-cnn-dailymail")

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [4]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/train_prompt_category.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (using `input` as input text and `summary` as the output)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset["train"]:  # Access the 'train' split
        # Use 'input' as the input text
        input_text = entry["input"]
        output_text = entry["summary"]  # Use 'summary' as the target output
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

# Format the dataset
formatted_dataset = format_for_finetuning(dataset)

# Convert the formatted data into a Hugging Face Dataset for further processing
formatted_dataset = Dataset.from_list(formatted_dataset)

# Save the formatted dataset to a JSON file (optional)
output_file = "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json"
formatted_dataset.to_json(output_file)

print(f"Formatted dataset saved to: {output_file}")


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-82b87a9f9b0089e0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Formatted dataset saved to: /home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json


In [3]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/train_prompt_category.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (prompt + input as the input text)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset:
        # Concatenate the prompt and input to form the input text
        input_text = entry["prompt"] + entry["input"]
        output_text = entry["output"]  # Target output for the model
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

# Format the dataset
train_data = format_for_finetuning(dataset['train'])

# Save the formatted dataset to a new JSON file in the same directory
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json"

with open(formatted_data_path, "w") as outfile:
    json.dump(train_data, outfile, indent=4)

print(f"Formatted dataset saved to: {formatted_data_path}")


Downloading and preparing dataset json/default to /home/surenoobster/.cache/huggingface/datasets/json/default-82b87a9f9b0089e0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/surenoobster/.cache/huggingface/datasets/json/default-82b87a9f9b0089e0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

KeyError: 'output'

In [5]:
import json

# Path to the newly saved formatted dataset
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json"

# Load the formatted dataset from the file
with open(formatted_data_path, "r") as infile:
    formatted_data = json.load(infile)

# Print the first few entries to preview the formatted dataset
for i, entry in enumerate(formatted_data[:5]):  # Preview first 5 entries
    print(f"Entry {i+1}:")
    print(f"Input: {entry['input']}")
    print(f"Output: {entry['output']}")
    print("-" * 50)


JSONDecodeError: Extra data: line 2 column 1 (char 3639)

In [1]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [13]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [6]:
from datasets import load_dataset

# Path to your dataset
train_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json"

# Load dataset from the local file
custom_dataset = load_dataset("json", data_files={"train": train_data_path})

# Check dataset sizes
print(f"Train dataset size: {len(custom_dataset['train'])}")

# Preview a random sample
from random import randrange

sample = custom_dataset['train'][randrange(len(custom_dataset['train']))]
print(f"Input (Prompt + Article): \n{sample['input']}\n---------------")
print(f"Output (Summary): \n{sample['output']}\n---------------")


Downloading and preparing dataset json/default to /home/surenoobster/.cache/huggingface/datasets/json/default-b19da6c87aedebd4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/surenoobster/.cache/huggingface/datasets/json/default-b19da6c87aedebd4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Train dataset size: 4884
Input (Prompt + Article): 
Write highlights for this article for a high school student:

A teenager accused of shooting and stabbing an elderly stranger to death had penned a detailed list of plans including 'select prey' and 'enjoy kill', according to reports. Maxwell Winkler, 17, was found riding his bike through a wooded area near to the murder scene in Fishers, Indiana on Monday, two days after Henry Kim, 73, was found killed in Windermere Park. After they took the high school student into custody, police went to his mother's house and found the notes inside a wallet, according to court documents seen by the Indianapolis Star. 'Prepare site and prepare tools for cleanup,' the notes read. 'Select prey. Wait and act inauspicious (play with phone and wave) till prey passes by to strike or surprise attack using bike near site. 'Sick': High school student Maxwell Winkler, 17, left, has been accused of shooting and stabbing 73-year-old Henry Kim, right, to death 

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets, load_dataset

# Model ID for FLAN-T5
model_id = "google/flan-t5-small"

# Load tokenizer for FLAN-T5
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Path to the formatted dataset
train_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/18dec_retried_finalboss/read_for_finetune_flanT5.json"

# Load the formatted dataset
data_files = {"train": train_path}
print("Loading dataset...")
dataset = load_dataset("json", data_files=data_files)

# Compute maximum input and output lengths for efficient batching
print("Calculating max input and target lengths...")

# Tokenize the concatenated dataset for inputs
tokenized_inputs = dataset["train"].map(
    lambda x: tokenizer(x["input"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Tokenize the concatenated dataset for targets
tokenized_targets = dataset["train"].map(
    lambda x: tokenizer(x["output"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

# Preprocessing function
def preprocess_function(sample, padding="max_length"):
    # Prepare inputs with a task prefix (e.g., "summarize:")
    inputs = ["summarize: " + item for item in sample["input"]]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True)

    # Replace padding token ID with -100 to ignore during loss computation
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Loading tokenizer...
Loading dataset...


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-b19da6c87aedebd4/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Calculating max input and target lengths...


Map:   0%|          | 0/4884 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/4884 [00:00<?, ? examples/s]

Max target length: 512
Tokenizing dataset...


Map:   0%|          | 0/4884 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [8]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# Load FLAN-T5 model
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Download necessary NLTK resources
nltk.download("punkt")

# Load ROUGE metric
metric = evaluate.load("rouge")

# Helper function to postprocess text for evaluation
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects newlines after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

# Compute metrics function for evaluation
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Define a DataCollator for Seq2Seq tasks
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length", label_pad_token_id=-100)

# Print confirmation
print("Model, tokenizer, metric, and data collator loaded successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/surenoobster/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model, tokenizer, metric, and data collator loaded successfully.


In [13]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_base_output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    evaluation_strategy="no",
    report_to=["tensorboard"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)

# Free memory before training
import torch
torch.cuda.empty_cache()

trainer.train()
trainer.save_model("./final_model_CATEGORY")
# Save the tokenizer
tokenizer.save_pretrained("./final_model_CATEGORY")






  0%|          | 0/3050 [00:00<?, ?it/s]

{'loss': 0.0, 'learning_rate': 4.452459016393443e-05, 'epoch': 0.82}
{'loss': 0.0, 'learning_rate': 3.632786885245902e-05, 'epoch': 1.64}
{'loss': 0.0, 'learning_rate': 2.813114754098361e-05, 'epoch': 2.46}
{'loss': 0.0, 'learning_rate': 1.99344262295082e-05, 'epoch': 3.28}
{'loss': 0.0, 'learning_rate': 1.1737704918032788e-05, 'epoch': 4.1}
{'loss': 0.0, 'learning_rate': 3.540983606557377e-06, 'epoch': 4.91}
{'train_runtime': 1560.7708, 'train_samples_per_second': 15.646, 'train_steps_per_second': 1.954, 'train_loss': 0.0, 'epoch': 5.0}


('./final_model_CATEGORY/tokenizer_config.json',
 './final_model_CATEGORY/special_tokens_map.json',
 './final_model_CATEGORY/spiece.model',
 './final_model_CATEGORY/added_tokens.json',
 './final_model_CATEGORY/tokenizer.json')

In [10]:
pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [11]:
import torch
print(torch.cuda.device_count())  # Number of GPUs available


1


In [12]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
else:
    print("GPU is not available!")


GPU is available!
Device Name: NVIDIA GeForce RTX 4050 Laptop GPU
CUDA Device Count: 1


In [9]:
import torch
print(torch.cuda.device_count())  # Number of GPUs available


1


In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load your fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./final_model_CATEGORY")
tokenizer = AutoTokenizer.from_pretrained("./final_model_CATEGORY")

# Input text to summarize (as "Prompt + Article")
input_text = """
Los Angeles (CNN)It's more than just one state's internal problem. The historic California drought hurts the rest of the union, too. That's because California is a breadbasket to the nation, growing more than a third of its vegetables and nearly two-thirds of its fruits and nuts. Here's why we should heed the ongoing drought in the most populous state, a slowly expanding natural disaster now in its fourth year that this week prompted Gov. Jerry Brown to announce a mandatory 25% cutback in water consumption in all cities. In 2014, one expert predicted consumers would pay more for some groceries because of the California drought. He was often right, according to statistics gathered by Timothy Richards, agribusiness professor at Arizona State University. Prices rose last year for these items on your kitchen table: . \u2022 Berries rose in price by about 80 cents per clamshell to $3.88 . \u2022 Broccoli by 11 cents per pound to $1.89. \u2022 Grapes by 64 cents a pound to $3.06 . \u2022 Melons by 24 cents a pound to $1.23. \u2022 Packaged salad by 23 cents a bag to $2.91. \u2022 Peppers by 26 cents a pound to $2.39. Though fruits and vegetable prices fell in February, overall prices are expected to rise this year, because of inflation, U.S. Department of Agriculture economist Annemarie Kuhns said. Fresh fruit prices are projected to rise between 2.5% and 3.5%, and vegetables between 2% and 3%, close to historical average increases, Kuhns said. Whether the California drought will affect food prices again this year is unknown, thanks to a strong dollar. The greenback's strength allows producers to import crops that may be withering under the absence of West Coast rain or other misfortunes elsewhere in the nation, Kuhns said. Moreover, the drop in oil prices also eases the cost of transporting food from California to the other 49 states, she said. What economists don't know yet is whether farmers will plant fewer crops because of the drought. Those decisions are now being made in the field and could boost supermarket prices, she said. \"The drought in California does have the potential to impact the price we pay for fresh fruit and fresh vegetables and dairy and fresh eggs we pay at the counter,\" Kuhns said. \"We are not sure what the exact impact will be.\" The reality is there's a major drought throughout the West and Southwest. While not as bad as California, Texas and Oklahoma are also seeing extreme and exceptional drought -- the two worst categories -- in several parts of their states, the U.S. Drought Monitor said this week. Overall, the Western drought affects more than 52 million people, the monitor says. As a result, consumers paid a whopping extra 12.1% for beef and veal in 2014, the USDA reports. Straining under a drought that began in 2012, ranchers in Texas and Oklahoma last year saw smaller grazing pastures, paid more for feed, and experienced difficulties accessing water to cool their cattle. So the cattlemen began culling their herds, Kuhns said. This year's beef and veal prices should rise only by 6% at most, still higher than the 4.1% historical average, the feds project. But beef prices offer an object lesson about the drought. \"There's other areas being affected,\" Kuhns said. It's called the Golden State for the gold rush of yore, but let's face it: the rest of the nation flocks to California for vacation because of another golden reason. Its year-round sunshine. So the next time you take a holiday in California, you'll find a few changes around here, thanks to the drought. Like asking for a glass of water at a restaurant. You won't find water waiting for you on the table. Eateries now \"can only serve water to customers on request,\" the State Water Resources Control Board declared in March under expanded emergency regulations. Tourists can also expect to hear a lot of requests at hotels about whether they want their linens and towels laundered daily. These requests are mandatory under the new regulations. And they'll see fewer homes running decorative fountains. Because much of the snowpack in the Sierra Nevada has alarmingly disappeared, many ski resorts shut down early this year, including at Lake Tahoe, and some are now building zip lines, mountain bike trails and wedding venues to keep tourists coming, the Sacramento Bee reported. \"If the drought continues through next winter and we do not conserve more, the consequences could be even more catastrophic than they already are,\" State Water Board Chair Felicia Marcus said in March. But what about those yummy California wines, you ask? Guess what. They're only getting better -- because of the drought. Yes, you read that right. The 2014 wine grape harvest was \"third in a string of great vintages this decade,\" the Wine Institute says. \"California vintners and growers across the state are grateful for another excellent vintage, despite an ongoing drought and earthquake that rocked south Napa in late August just as crush was getting underway,\" the institute said in a statement last year. \"A mild winter and spring caused early bud break, although the overall length of the growing season was similar to past years.\" Wine grapes use relatively low water, said institute spokeswoman Gladys Horiuchi. \"Yes, drought years tend to produce terrific quality,\" she added. \"With the record high California wine grape harvests in 2012, 2013 and 2014, there is a good supply of California wine.\" That may be the only thing to toast about this drought.","summary":"Americans paid more for some fruits and vegetables last year because of the drought .\nTourists will now have to ask for a glass of water at a California restaurant .\nPerhaps the only good thing is another \"great\" wine grape harvest last year ."
"""

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs["input_ids"], max_length=150, num_beams=4, early_stopping=True)

# Decode and format the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary in the requested format
print("Requested Output:")
print(summary)


Requested Output:
California is a breadbasket to the nation, growing more than a third of its vegetables and nearly two-thirds of its fruits and nuts. Here's why we should heed the ongoing drought in the most populous state, a slowly expanding natural disaster now in its fourth year that this week prompted Gov. Jerry Brown to announce a mandatory 25% cutback in water consumption in all cities. In 2014, one expert predicted consumers would pay more for some groceries because of the California drought.


In [9]:
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example tensor operation on GPU
tensor = torch.randn(1000, 1000).to(device)
result = tensor @ tensor
print(result)


tensor([[ 16.7388,  21.6648, -48.9924,  ..., -32.6129, -14.0957, -53.3370],
        [ 29.6425,  23.5408,   1.3587,  ..., -21.5251, -24.3179, -33.2913],
        [  5.2434, -22.0096, -11.9205,  ...,  -0.9432,  20.6076,  -1.3561],
        ...,
        [  7.8429,  20.9002,  58.5818,  ..., -66.3809,  26.0704,  23.6928],
        [  4.7462,  35.3645,  45.0954,  ..., -16.6990,  -2.6634,   5.6799],
        [ 42.1958,  -9.2703,  35.9708,  ...,   8.2285,  38.3363,  20.0297]])


In [10]:
print(torch.cuda.memory_summary())



KeyError: 'allocated_bytes.all.current'

In [29]:
import torch
print(torch.__version__)
print(torch.version.cuda)


1.13.1+cu117
11.7


In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path = "/home/surenoobster/Documents/controllable-readability-summarization/Finetuning_flant5/final_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
input_text = "Summarize: write it for  Artificial intelligence is transforming industries worldwide."

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)

# Generate predictions
outputs = model.generate(
    inputs["input_ids"],
    max_length=50,  # Adjust based on your task
    num_beams=4,    # Optional: Use beam search for better results
    early_stopping=True
)

# Decode the generated tokens
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", predicted_text)


Generated Text: Artificial intelligence is transforming industries worldwide.


In [15]:
# Check the CUDA_VISIBLE_DEVICES environment variable
!echo $CUDA_VISIBLE_DEVICES

# Alternatively, set it within the notebook
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to GPU 0
!echo $CUDA_VISIBLE_DEVICES


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0


In [16]:
!echo $CUDA_VISIBLE_DEVICES


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0


In [17]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
