In [1]:
import os

In [2]:
%pwd

'/Users/manraj/Documents/GitHub/text-summariser-E2E-project/research'

In [3]:
# Step up one directory level so later relative paths resolve against the
# parent of the notebook's starting directory. This cell is not idempotent:
# every re-run climbs one more level, so run it once per kernel session.
os.chdir(os.pardir)

In [4]:
%pwd

'/Users/manraj/Documents/GitHub/text-summariser-E2E-project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    ``ConfigurationManager.get_model_training_config`` fills the first three
    fields from the ``model_training`` section of the config YAML and the
    remaining fields from the ``TrainingArguments`` section of the params
    YAML. ``ModelTraining.train`` forwards most of them to
    ``transformers.TrainingArguments``.
    """
    # Used as ``output_dir`` for training and as the parent directory of the
    # saved model and tokenizer.
    root_dir: Path
    # Location handed to ``datasets.load_from_disk``.
    data_path: Path
    # Handed to ``from_pretrained`` for both the tokenizer and the model.
    # NOTE(review): the run log shows a hub ID ("google/pegasus-cnn_dailymail"),
    # so this is probably a str rather than a filesystem Path - confirm.
    model_ckpt: Path
    # The fields below are forwarded to ``TrainingArguments`` under the same
    # name unless noted otherwise.
    num_train_epochs: int
    warmup_steps: int
    # Also reused as ``per_device_eval_batch_size`` by ``ModelTraining.train``.
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    # Forwarded as the ``eval_strategy`` keyword (note the different name).
    evaluation_strategy: str
    eval_steps: int
    # NOTE(review): annotated ``float`` while ``eval_steps`` is ``int`` -
    # confirm this asymmetry is intended.
    save_steps: float
    gradient_accumulation_steps: int

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    Constructing an instance reads both YAML files and creates the
    ``artifacts_root`` directory named in the config file.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the config YAML file.
            params_filepath: Path of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Build the ``ModelTrainingConfig`` for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``ModelTrainingConfig`` whose location fields come from the
            ``model_training`` config section and whose hyperparameters come
            from the ``TrainingArguments`` params section.
        """
        stage_section = self.config.model_training
        trainer_params = self.params.TrainingArguments

        create_directories([stage_section.root_dir])

        # Hyperparameters are copied across under their own names, in the
        # same order as the dataclass fields.
        hyperparameter_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )
        location_kwargs = {
            "root_dir": stage_section.root_dir,
            "data_path": stage_section.DATA_PATH,
            "model_ckpt": stage_section.MODEL_CKPT,
        }
        hyperparameter_kwargs = {
            name: getattr(trainer_params, name) for name in hyperparameter_names
        }

        return ModelTrainingConfig(**location_kwargs, **hyperparameter_kwargs)

In [8]:
from transformers import  TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset,load_from_disk
import torch    
import wandb

  from .autonotebook import tqdm as notebook_tqdm


[2024-12-20 18:59:15,393]: INFO: config: PyTorch version 2.2.2 available.


In [9]:
# class ModelTraining:
#     def __init__(self, config: ModelTrainingConfig):
#         self.config = config
        
#     def train(self):
#         device = "cuda" if torch.cuda.is_available() else "cpu"
#         tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
#         model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
#         seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        
#         dataset_samsum_pt = load_from_disk(self.config.data_path)
#         trainer_args = TrainingArguments(
#             output_dir=self.config.root_dir,
#             num_train_epochs=self.config.num_train_epochs,
#             warmup_steps=self.config.warmup_steps,
#             per_device_train_batch_size=self.config.per_device_train_batch_size,
#             per_device_eval_batch_size=self.config.per_device_train_batch_size,
#             weight_decay=self.config.weight_decay,
#             logging_steps=self.config.logging_steps,
#             eval_strategy=self.config.evaluation_strategy,
#             eval_steps=self.config.eval_steps,
#             save_steps=self.config.save_steps,
#             gradient_accumulation_steps=self.config.gradient_accumulation_steps,
#         )

#         trainer = Trainer(model=model_pegasus, args=trainer_args,
#                     processing_class=tokenizer, data_collator=seq2seq_data_collator,
#                     train_dataset=dataset_samsum_pt["test"], 
#                     eval_dataset=dataset_samsum_pt["validation"])
        
#         os.environ["WANDB_MODE"] = "disabled"
#         # wandb.login(key=os.environ["WANDB_API_KEY"])  # key redacted: never commit API keys; revoke the previously committed one
#         trainer.train()

#         model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model"))
#         tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))

In [10]:
class ModelTraining:
    """Fine-tunes the configured seq2seq checkpoint on a small dataset subset.

    The subset keeps smoke-test runs quick; pass larger sizes to ``train`` to
    use more data.
    """

    # Fixed seed so the shuffled subsets are identical on every run.
    _SHUFFLE_SEED = 42

    def __init__(self, config: ModelTrainingConfig):
        """Store the training configuration.

        Args:
            config: Paths, checkpoint name and trainer hyperparameters.
        """
        self.config = config

    @staticmethod
    def _take_subset(split, size):
        """Return up to ``size`` rows of ``split`` after a seeded shuffle."""
        # Clamp so a split smaller than the requested size does not make
        # ``select`` fail on out-of-range indices.
        row_count = min(size, len(split))
        return split.shuffle(seed=ModelTraining._SHUFFLE_SEED).select(range(row_count))

    def train(self, train_subset_size: int = 5, eval_subset_size: int = 1):
        """Fine-tune the model and save it together with its tokenizer.

        Args:
            train_subset_size: Maximum number of shuffled rows taken from the
                "train" split (default 5, matching the previous hard-coded value).
            eval_subset_size: Maximum number of shuffled rows taken from the
                "validation" split (default 1, matching the previous value).

        Side effects:
            Sets ``WANDB_MODE=disabled`` in the process environment and writes
            the model to ``<root_dir>/pegasus-samsum-model`` and the tokenizer
            to ``<root_dir>/tokenizer``.
        """
        # Disable Weights & Biases before any trainer object is constructed,
        # so the setting is in place for everything created below.
        os.environ["WANDB_MODE"] = "disabled"

        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Load the preprocessed dataset from disk.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        # Use only a small, reproducibly shuffled subset of each split.
        small_train_dataset = self._take_subset(dataset_samsum_pt["train"], train_subset_size)
        small_eval_dataset = self._take_subset(dataset_samsum_pt["validation"], eval_subset_size)

        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            # Evaluation deliberately reuses the training batch size.
            per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            eval_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            processing_class=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=small_train_dataset,
            eval_dataset=small_eval_dataset,
        )

        trainer.train()

        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))


In [11]:
# Run the model-training stage end to end: load the configuration, build the
# stage config, then train.
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTraining(config=model_training_config)
    model_training.train()
except Exception:
    # Bare ``raise`` re-raises the active exception unchanged.
    raise

[2024-12-20 18:59:17,402]: INFO: common: yaml file: config/config.yaml loaded successfully
[2024-12-20 18:59:17,405]: INFO: common: yaml file: params.yaml loaded successfully
[2024-12-20 18:59:17,406]: INFO: common: Created directory: artifacts
[2024-12-20 18:59:17,407]: INFO: common: Created directory: artifacts/model_training


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2024-12-20 18:59:32,392]: INFO: wandb_config: config set model/num_parameters = 570797056 - None


  0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 