In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\text-classification-Trials\\notebook'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\text-classification-Trials'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_dataset_path: Path
    val_dataset_path: Path
    test_dataset_path: Path
    model_data_path: Path


In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## Update the configuration manager in src config

class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH):
        
        self.config = read_yaml(config_filepath)

        create_directories([self.config.output_root])
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_dataset_path=config.train_dataset_path,
            val_dataset_path=config.val_dataset_path,
            test_dataset_path=config.test_dataset_path,
            model_data_path=config.model_data_path
        )

        return model_trainer_config

In [8]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
from keras.utils import to_categorical
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
import os
import sys
from dataclasses import dataclass
import pickle
import warnings
warnings.filterwarnings("ignore")
from src.exception import CustomException
from src import logger

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
## 5. Update the components

class ModelTrainer:
    def __init__(self, config:ModelTrainerConfig):
        self.config=config

    def initiate_model_trainer(self):
        '''
        This function is responsible for model training
        
        '''
        try:
            logger.info(f"Loading the train, validation and test datasets")
            train_dataset = tf.data.Dataset.load(self.config.train_dataset_path)
            val_dataset = tf.data.Dataset.load(self.config.val_dataset_path)
            test_dataset = tf.data.Dataset.load(self.config.test_dataset_path)
            

            logger.info(f"Finetuning model starts")
            model = TFAutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",num_labels=4)
            
            num_epochs = 1

            # The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
            # by the total number of epochs. Since our dataset is already batched, we can simply take the len.
            num_train_steps = len(train_dataset) * num_epochs

            lr_scheduler = keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
            )
            
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            
            model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])
            
            callbacks = [keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)]

            model.fit(train_dataset, validation_data=val_dataset, epochs=num_epochs, callbacks=callbacks)

            logger.info(f"Evaluating finetuned model")
            model.evaluate(test_dataset)
            
            logger.info(f"Saving finetuned model")
            model.save_pretrained(os.path.join(self.config.model_data_path))
                    
            return model
    
        except Exception as e:
            raise CustomException(e,sys)

In [10]:
## 6. Update the pipeline

try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.initiate_model_trainer()
except Exception as e:
  raise e

[2024-07-15 11:11:52,094: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-15 11:11:52,095: INFO: common: created directory at: output]
[2024-07-15 11:11:52,097: INFO: common: created directory at: output/model_trainer]
[2024-07-15 11:11:52,098: INFO: 2724184690: Loading the train, validation and test datasets]
[2024-07-15 11:11:52,185: INFO: 2724184690: Finetuning model starts]


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 


Epoch 1: LearningRateScheduler setting learning rate to 4.999999873689376e-05.
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
  4/900 [..............................] - ETA: 3:22:33 - loss: 1.3923 - accuracy: 0.2344

KeyboardInterrupt: 