In [1]:
import os

In [2]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting\\research'

In [3]:
# Step up one directory (here: from research/ to the project root) so that
# relative paths used later resolve from there.
# NOTE: not idempotent - re-running this cell moves up one more level.
os.chdir(os.pardir)

In [4]:
%pwd

'd:\\A_Category\\iNeuron\\End-To-End-NLP-Project-News-Article-Sorting'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModeTrainerConfig:
    """Read-only settings for the model-training stage.

    Instances are frozen: assigning to a field after construction raises
    an error. Field order is significant for positional construction.
    """

    # --- locations / checkpoint ---
    root_dir: Path          # directory where this stage writes its artifacts
    train_data_path: Path   # training dataset location
    val_data_path: Path     # validation dataset location
    model_ckpt: Path        # checkpoint handed to `from_pretrained`

    # --- training hyperparameters ---
    output_dir: Path
    learning_rate: float
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    num_train_epochs: int
    weight_decay: float
    eval_steps: int
    evaluation_strategy: str
    save_strategy: str
    load_best_model_at_end: bool

In [11]:
from ArticleSorting.constants import *
from ArticleSorting.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """Loads the project's YAML config and params files and builds stage configs."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: location of the main configuration YAML.
            params_filepath: location of the hyperparameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModeTrainerConfig:
        """Assemble the `ModeTrainerConfig` for the training stage.

        Path-like settings are taken from the `model_trainer` section of the
        config file; hyperparameters are taken from the `TrainingArguments`
        section of the params file. The stage's root directory is created
        as a side effect.

        Returns:
            ModeTrainerConfig: populated, immutable configuration object.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Fields copied verbatim from the config file's `model_trainer` section.
        path_fields = (
            "root_dir",
            "train_data_path",
            "val_data_path",
            "model_ckpt",
        )
        # Fields copied verbatim from the params file's `TrainingArguments` section.
        hyperparameter_fields = (
            "output_dir",
            "learning_rate",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "num_train_epochs",
            "weight_decay",
            "eval_steps",
            "evaluation_strategy",
            "save_strategy",
            "load_best_model_at_end",
        )

        settings = {name: getattr(trainer_section, name) for name in path_fields}
        settings.update(
            (name, getattr(training_params, name)) for name in hyperparameter_fields
        )
        return ModeTrainerConfig(**settings)


In [8]:
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk

import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
class ModeTrainer:
    """Fine-tunes a sequence-classification model for news-article sorting.

    The checkpoint, dataset locations, output locations and training
    hyperparameters are all read from the supplied `ModeTrainerConfig`.
    """

    def __init__(self, config: ModeTrainerConfig) :
        """Store the training configuration.

        Args:
            config: settings for this training run.
        """
        self.config = config

    def train(self):
        """Run fine-tuning, evaluate, and save the model and tokenizer.

        Steps:
            1. Load the train/validation datasets from disk.
            2. Load the tokenizer and a 5-class classification model from
               `config.model_ckpt`.
            3. Train with `transformers.Trainer`, reporting accuracy on the
               validation set.
            4. Save the model under `<root_dir>/bert-base-uncased-model` and
               the tokenizer under `<root_dir>/tokenizer`.

        Returns:
            None
        """
        cfg = self.config

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(device)
        # Free cached GPU memory left over from earlier runs in this kernel.
        torch.cuda.empty_cache()

        # Loading data
        train_dataset = load_from_disk(cfg.train_data_path)
        val_dataset = load_from_disk(cfg.val_data_path)

        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)

        # Single source of truth for the label set; the reverse mapping and
        # the label count are derived from it so they cannot drift apart.
        id2label = {0: "business", 1: "entertainment", 2: "politics", 3: "sport", 4: "tech"}
        label2id = {label: idx for idx, label in id2label.items()}
        model = AutoModelForSequenceClassification.from_pretrained(
            cfg.model_ckpt,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
        ).to(device)

        accuracy = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            """Turn (logits, labels) into an accuracy score."""
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return accuracy.compute(predictions=predictions, references=labels)

        # Hyperparameters come from the config object so that the values in
        # the params file actually control the run.
        training_args = TrainingArguments(
            output_dir=cfg.output_dir,
            learning_rate=cfg.learning_rate,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            num_train_epochs=cfg.num_train_epochs,
            weight_decay=cfg.weight_decay,
            eval_steps=cfg.eval_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            save_strategy=cfg.save_strategy,
            load_best_model_at_end=cfg.load_best_model_at_end,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        trainer.evaluate()

        ## Save model
        model.save_pretrained(os.path.join(cfg.root_dir, "bert-base-uncased-model"))
        ## Save tokenizer
        tokenizer.save_pretrained(os.path.join(cfg.root_dir, "tokenizer"))



In [14]:
# Build the configuration, then run the training stage end to end.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModeTrainer(config=model_trainer_config)
    model_trainer.train()

except Exception:
    # Re-raise the active exception unchanged.
    raise

[2023-11-21 12:24:07,130:  INFO: common: yaml file:config\config.yaml loaded successfully]
[2023-11-21 12:24:09,412:  INFO: common: yaml file:params.yaml loaded successfully]
[2023-11-21 12:24:09,414:  INFO: common: created directory at : artifacts]
[2023-11-21 12:24:09,416:  INFO: common: created directory at : artifacts/model_trainer]
cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/225 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                
 33%|███▎      | 75/225 [11:07<17:36,  7.04s/it]

{'eval_loss': 0.7591565847396851, 'eval_accuracy': 0.8322147651006712, 'eval_runtime': 33.3838, 'eval_samples_per_second': 4.463, 'eval_steps_per_second': 1.138, 'epoch': 1.0}


                                                 
 67%|██████▋   | 150/225 [22:23<08:38,  6.91s/it]

{'eval_loss': 0.22947506606578827, 'eval_accuracy': 0.9731543624161074, 'eval_runtime': 32.9361, 'eval_samples_per_second': 4.524, 'eval_steps_per_second': 1.154, 'epoch': 2.0}


                                                 
100%|██████████| 225/225 [32:44<00:00,  6.28s/it]

{'eval_loss': 0.1277976632118225, 'eval_accuracy': 0.9865771812080537, 'eval_runtime': 33.5438, 'eval_samples_per_second': 4.442, 'eval_steps_per_second': 1.133, 'epoch': 3.0}


100%|██████████| 225/225 [33:01<00:00,  8.80s/it]


{'train_runtime': 1981.0467, 'train_samples_per_second': 0.451, 'train_steps_per_second': 0.114, 'train_loss': 0.6336573621961805, 'epoch': 3.0}


100%|██████████| 38/38 [00:30<00:00,  1.26it/s]


In [26]:
# Housekeeping: ask PyTorch to release GPU memory it has cached but is no longer using.
torch.cuda.empty_cache()
