In [3]:
from datasets import Dataset
import json

# Load your dataset (ensure you have the correct path)
def load_json_data(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]
    return data

# Load the train data
train_data = load_json_data('/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_metrics_np_summary_prompt_category.json')

# Convert list of dictionaries to a dictionary of lists for the dataset format
def convert_to_dict_of_lists(data):
    dict_data = {}
    for entry in data:
        for key, value in entry.items():
            if key not in dict_data:
                dict_data[key] = []
            dict_data[key].append(value)
    return dict_data

train_dict = convert_to_dict_of_lists(train_data)

# Create the dataset from the converted dictionary
train_dataset = Dataset.from_dict(train_dict)

# Optionally print the dataset to verify
print(train_dataset)


Dataset({
    features: ['input', 'input_metrics', 'summary', 'id', 'prompt', 'input_noprompt'],
    num_rows: 286
})


In [6]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset

import evaluate
import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version

os.environ["NCCL_DEBUG"] = "INFO"

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
#check_min_version("4.25.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizer which require lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    lang: Optional[str] = field(default=None, metadata={"help": "Language id for summarization."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    text_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
    )
    summary_column: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "An optional input evaluation data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
            )
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (rouge) on (a jsonlines or csv file)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default="", metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )

    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the decoder_start_token_id."
                "Useful for multilingual models like mBART where the first generated token"
                "needs to be the target language token (Usually it is the target language token)"
            )
        },
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


summarization_name_mapping = {
    "amazon_reviews_multi": ("review_body", "review_title"),
    "big_patent": ("description", "abstract"),
    "cnn_dailymail": ("article", "highlights"),
    "orange_sum": ("text", "summary"),
    "pn_summary": ("article", "summary"),
    "psc": ("extract_text", "summary_text"),
    "samsum": ("dialogue", "summary"),
    "thaisum": ("body", "summary"),
    "xglue": ("news_body", "news_title"),
    "xsum": ("document", "summary"),
    "wiki_summary": ("article", "highlights"),
    "multi_news": ("document", "summary"),
}



In [6]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [3]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)
 

In [4]:
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = PeftModel.from_pretrained(base_model, "RMWeerasinghe/flan-t5-base-prompt_tuning-cnn-dailymail")

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [8]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_modified.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (prompt + input as the input text)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset:
        # Concatenate the prompt and input to form the input text
        input_text = entry["prompt"] + entry["input"]
        output_text = entry["output"]  # Target output for the model
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-1b5a05f9096b368e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
import json
from datasets import load_dataset, Dataset

# Load your dataset from a local JSON file (adjust paths as needed)
data_files = {
    "train": "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/train_modified.json",
}

# Load dataset from local files
dataset = load_dataset("json", data_files=data_files)

# Function to format dataset for fine-tuning (prompt + input as the input text)
def format_for_finetuning(dataset):
    formatted_data = []
    for entry in dataset:
        # Concatenate the prompt and input to form the input text
        input_text = entry["prompt"] + entry["input"]
        output_text = entry["output"]  # Target output for the model
        
        # Add formatted entry to the list
        formatted_data.append({
            "input": input_text,
            "output": output_text
        })
    return formatted_data

# Format the dataset
train_data = format_for_finetuning(dataset['train'])

# Save the formatted dataset to a new JSON file in the same directory
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

with open(formatted_data_path, "w") as outfile:
    json.dump(train_data, outfile, indent=4)

print(f"Formatted dataset saved to: {formatted_data_path}")


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-1b5a05f9096b368e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Formatted dataset saved to: /home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json


In [10]:
import json

# Path to the newly saved formatted dataset
formatted_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load the formatted dataset from the file
with open(formatted_data_path, "r") as infile:
    formatted_data = json.load(infile)

# Print the first few entries to preview the formatted dataset
for i, entry in enumerate(formatted_data[:5]):  # Preview first 5 entries
    print(f"Entry {i+1}:")
    print(f"Input: {entry['input']}")
    print(f"Output: {entry['output']}")
    print("-" * 50)


Entry 1:
Input: Write highlights for this article for a middle school student:

(CNN) -- Six people, including three children, were killed when their plane crashed into rugged mountains east of Phoenix on a Thanksgiving eve trip, authorities said Thursday. Pinal County Sheriff Paul Babeu identified the dead as a father and his three children, plus two other men. All were Arizona residents and knew each other well, he said. The twin-engine Rockwell 690A airplane slammed into a steep cliff near the top of the Superstition Mountains at 6:31 p.m. on Wednesday, strewing debris for hundreds of yards down the 80-degree incline, Babeu told reporters. Recovery efforts by more than 50 deputies and volunteers involved collect and preserving the remains, the sheriff said. "No one could survive that crash," he said. The plane had flown from Safford to Mesa, where it picked up the siblings -- ages 9, 8 and 6 -- for the Thanksgiving holiday, Babeu said. "All of these families are just obviously heart

In [11]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [13]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
model_id="google/flan-t5-base"
 
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [2]:
from datasets import load_dataset

# Path to your dataset
train_data_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load dataset from the local file
custom_dataset = load_dataset("json", data_files={"train": train_data_path})

# Check dataset sizes
print(f"Train dataset size: {len(custom_dataset['train'])}")

# Preview a random sample
from random import randrange

sample = custom_dataset['train'][randrange(len(custom_dataset['train']))]
print(f"Input (Prompt + Article): \n{sample['input']}\n---------------")
print(f"Output (Summary): \n{sample['output']}\n---------------")


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-223d9c197d3e5077/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Train dataset size: 286
Input (Prompt + Article): 
Write highlights for this article for a high school student:

(CNN) -- Monday's 2009 Major League Baseball home opener for the New York Mets at their new ballpark promises to reignite controversy -- and not just over the team's suspect pitching. Some Mets fans feel the $400 million stadium-name deal is a bad one for all parties in these economic times. Though the Mets played exhibition games there earlier this month, they will officially debut their new home, Citi Field, against the visiting San Diego Padres. The $800 million-plus Queens facility is undoubtedly a state-of-the-art baseball complex with many modern upgrades from the Mets' home for the past 44 years, Shea Stadium, which is still in the process of being demolished a few hundreds yards away. One lifelong Mets fan summed up his emotions about saying goodbye to Shea and hello to Citi by saying: "It was time for a new stadium, but I'm going to miss Shea very much." The controv

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import concatenate_datasets, load_dataset

# Model ID for FLAN-T5
model_id = "google/flan-t5-small"

# Load tokenizer for FLAN-T5
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Path to the formatted dataset
train_path = "/home/surenoobster/Documents/controllable-readability-summarization/src/15Dec_realise/formatted_train.json"

# Load the formatted dataset
data_files = {"train": train_path}
print("Loading dataset...")
dataset = load_dataset("json", data_files=data_files)

# Compute maximum input and output lengths for efficient batching
print("Calculating max input and target lengths...")

# Tokenize the concatenated dataset for inputs
tokenized_inputs = dataset["train"].map(
    lambda x: tokenizer(x["input"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Tokenize the concatenated dataset for targets
tokenized_targets = dataset["train"].map(
    lambda x: tokenizer(x["output"], truncation=True),
    batched=True,
    remove_columns=["input", "output"]
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

# Preprocessing function
def preprocess_function(sample, padding="max_length"):
    # Prepare inputs with a task prefix (e.g., "summarize:")
    inputs = ["summarize: " + item for item in sample["input"]]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["output"], max_length=max_target_length, padding=padding, truncation=True)

    # Replace padding token ID with -100 to ignore during loss computation
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Loading tokenizer...
Loading dataset...


Found cached dataset json (/home/surenoobster/.cache/huggingface/datasets/json/default-223d9c197d3e5077/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/surenoobster/.cache/huggingface/datasets/json/default-223d9c197d3e5077/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-d4a255714efdcc5a.arrow
Loading cached processed dataset at /home/surenoobster/.cache/huggingface/datasets/json/default-223d9c197d3e5077/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-e47ff5bd2bad3f8e.arrow
Loading cached processed dataset at /home/surenoobster/.cache/huggingface/datasets/json/default-223d9c197d3e5077/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-6492fc3843c42381.arrow


Calculating max input and target lengths...
Max source length: 512
Max target length: 186
Tokenizing dataset...
Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [4]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# Load FLAN-T5 model
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Initialize tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Download necessary NLTK resources
nltk.download("punkt")

# Load ROUGE metric
metric = evaluate.load("rouge")

# Helper function to postprocess text for evaluation
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects newlines after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

# Compute metrics function for evaluation
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

# Define a DataCollator for Seq2Seq tasks
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length", label_pad_token_id=-100)

# Print confirmation
print("Model, tokenizer, metric, and data collator loaded successfully.")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/surenoobster/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model, tokenizer, metric, and data collator loaded successfully.


In [11]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=None,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_base_output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=False,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=2,
    evaluation_strategy="no",
    report_to=["tensorboard"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)

# Free memory before training
import torch
torch.cuda.empty_cache()

trainer.train()
trainer.save_model("./final_model")
# Save the tokenizer
tokenizer.save_pretrained("./final_model")




  0%|          | 0/180 [00:00<?, ?it/s]

{'train_runtime': 593.3607, 'train_samples_per_second': 2.41, 'train_steps_per_second': 0.303, 'train_loss': 2.259277852376302, 'epoch': 5.0}


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/spiece.model',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [6]:
pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [8]:
import torch
print(torch.cuda.device_count())  # Number of GPUs available


1


In [5]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
else:
    print("GPU is not available!")


GPU is not available!


In [9]:
import torch
print(torch.cuda.device_count())  # Number of GPUs available


1


In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load your fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./final_model")
tokenizer = AutoTokenizer.from_pretrained("./final_model")

# Input text to summarize (as "Prompt + Article")
input_text = """
Write highlights for this article for a high school student:

(CNN) -- Monday's 2009 Major League Baseball home opener for the New York Mets at their new ballpark promises to reignite controversy -- and not just over the team's suspect pitching. Some Mets fans feel the $400 million stadium-name deal is a bad one for all parties in these economic times. Though the Mets played exhibition games there earlier this month, they will officially debut their new home, Citi Field, against the visiting San Diego Padres. The $800 million-plus Queens facility is undoubtedly a state-of-the-art baseball complex with many modern upgrades from the Mets' home for the past 44 years, Shea Stadium, which is still in the process of being demolished a few hundreds yards away. One lifelong Mets fan summed up his emotions about saying goodbye to Shea and hello to Citi by saying: "It was time for a new stadium, but I'm going to miss Shea very much." The controversy arises not from the disappearance of the venerated old stadium but from the fact that rather than attach the iconic Shea name to the new ballpark, in November 2006 the Mets entered into an agreement with Citigroup for naming rights for the new stadium. Reports have put the value of the deal at $400 million spread out in payments of $20 million per year over the course of the next 20 years. That would make it one of the most lucrative stadium naming arrangements in history. According to the Mets, besides the naming rights for Citi Field, "The fully integrated partnership includes Citi brand and business unit presence throughout the new ballpark." While other corporations have invested in similar deals in the past, the financial crisis has focused particular attention on Citigroup, the Mets, and the practice of paying vast sums of money for what is essentially a long-term advertisement. Citigroup, which now does business as Citi, has been the recipient of billions of dollars in taxpayer-funded bailout money over the past year, causing many to question the prudence of $400 million going toward branding Citi Field, especially when Citigroup cut nearly 75,000 jobs in 2008, capped by 50,000 announced in November. Rep. Dennis J. Kucinich, D-Ohio, has been an outspoken critic of the deal. He has called attention to the issue of corporate spending discretion, or lack thereof, particularly when jobs are at risk. "At the same time they're defending this $400 million stadium-naming deal, they lay off 50,000 people. Now, how many people could you employ for $400 million?" The Mets claim that the construction of Citi Field created more than 6,000 temporary full-time equivalent jobs, with approximately 1,000 new positions resulting from ongoing operations at the ballpark. One person whose job isn't at risk is Mets all-star third baseman David Wright. One of the franchise's most popular and marketable players, Wright is careful not to make any "errors" when confronted with the controversy surrounding his new Citi Field home. "I don't comment on things that I don't know about. I'm a baseball player, so I go out there and worry about my swing," he said.  Watch fans give views on Citi Field name game » . Kucinich conceded that paying to name a stadium is indeed "great advertising except for one thing, the American taxpayers have invested heavily in these banks and the bailout fund should not be used for this purpose." He advocates that the government, now a direct investor in Citigroup, has the obligation to monitor all aspects of how federal bailout funds are used, which in Kucinich's opinion includes any marketing or promotional endeavors. Not all Mets fans agree. One ardent fan said he's happy his hard-earned tax dollars are going to help fund the Mets' new stadium. "A lot of people pay for advertisements every day. We shouldn't be ashamed of it. Taxpayer money goes to waste on a lot of other things. Let's go Mets, I say!" But another fan said, "If it is indeed our tax money that is actually running the stadium, I wish that I wouldn't have to pay as much money to go see a game." A group of fellow fans, many liking the sound of a "Mets rebate" in these troubling economic times, warmly greeted his idea before an exhibition game against Boston on April 3. David Wright and the rest of the New York Mets, along with 42,000 boisterous fans, will formally open the new ballpark when the first regular-season game there begins at 7 p.m. Monday.
"""

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs["input_ids"], max_length=150, num_beams=4, early_stopping=True)

# Decode and format the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary in the requested format
print("Requested Output:")
print(summary)


Requested Output:
Mets will officially debut their new home, Citi Field, against the San Diego Padres. The $800 million-plus Queens facility is undoubtedly a state-of-the-art baseball complex with many modern upgrades from the Mets' home for the past 44 years. Reports have put the value of the deal at $400 million spread out in payments of $20 million per year over the course of the next 20 years.


In [9]:
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example tensor operation on GPU
tensor = torch.randn(1000, 1000).to(device)
result = tensor @ tensor
print(result)


tensor([[ 16.7388,  21.6648, -48.9924,  ..., -32.6129, -14.0957, -53.3370],
        [ 29.6425,  23.5408,   1.3587,  ..., -21.5251, -24.3179, -33.2913],
        [  5.2434, -22.0096, -11.9205,  ...,  -0.9432,  20.6076,  -1.3561],
        ...,
        [  7.8429,  20.9002,  58.5818,  ..., -66.3809,  26.0704,  23.6928],
        [  4.7462,  35.3645,  45.0954,  ..., -16.6990,  -2.6634,   5.6799],
        [ 42.1958,  -9.2703,  35.9708,  ...,   8.2285,  38.3363,  20.0297]])


In [10]:
print(torch.cuda.memory_summary())



KeyError: 'allocated_bytes.all.current'

In [29]:
import torch
print(torch.__version__)
print(torch.version.cuda)


1.13.1+cu117
11.7


In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path = "/home/surenoobster/Documents/controllable-readability-summarization/Finetuning_flant5/final_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
input_text = "Summarize: write it for  Artificial intelligence is transforming industries worldwide."

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(device)

# Generate predictions
outputs = model.generate(
    inputs["input_ids"],
    max_length=50,  # Adjust based on your task
    num_beams=4,    # Optional: Use beam search for better results
    early_stopping=True
)

# Decode the generated tokens
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", predicted_text)


Generated Text: Artificial intelligence is transforming industries worldwide.


In [15]:
# Check the CUDA_VISIBLE_DEVICES environment variable
!echo $CUDA_VISIBLE_DEVICES

# Alternatively, set it within the notebook
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to GPU 0
!echo $CUDA_VISIBLE_DEVICES


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0


In [16]:
!echo $CUDA_VISIBLE_DEVICES


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0


In [17]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
