In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\text-classification-Trials\\notebook'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\text-classification-Trials'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of filesystem locations used by the data-transformation step.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration. The
    values are passed through without conversion, so at runtime they may be
    plain strings rather than ``Path`` objects.
    NOTE(review): confirm what ``read_yaml`` actually returns for these entries.
    """

    # Working directory of this step; created when the config object is built.
    root_dir: Path
    # Destination handed to ``save_tokenizer`` together with the tokenizer.
    tokenizer_data_path: Path
    # CSV inputs. The X_* files are read for their "TITLE" column and the
    # y_* files for their "TARGET" column.
    X_train_data_path: Path
    X_test_data_path: Path
    y_train_data_path: Path
    y_test_data_path: Path
    X_val_data_path: Path
    y_val_data_path: Path
    # Not referenced by the transformation code visible in this notebook.
    raw_data_path: Path
    # Output locations passed to ``Dataset.save`` for the batched splits.
    train_dataset_path: Path
    val_dataset_path: Path
    test_dataset_path: Path

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## Update the configuration manager in src config

class ConfigurationManager:
    """Loads the project configuration and hands out per-stage config objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH):
        """Load the configuration through ``read_yaml`` and create the output root.

        Args:
            config_filepath: Location of the configuration file; defaults to
                the project-wide ``CONFIG_FILE_PATH`` constant.
        """
        self.config = read_yaml(config_filepath)

        output_dirs = [self.config.output_root]
        create_directories(output_dirs)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the ``DataTransformationConfig`` for the transformation stage.

        Creates the stage's ``root_dir`` as a side effect, then copies every
        entry of the ``data_transformation`` section into the dataclass
        unchanged.

        Returns:
            A populated, frozen ``DataTransformationConfig``.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # Same entries, in the same order, as the dataclass fields.
        entry_names = (
            "root_dir",
            "tokenizer_data_path",
            "X_train_data_path",
            "X_test_data_path",
            "y_train_data_path",
            "y_test_data_path",
            "X_val_data_path",
            "y_val_data_path",
            "raw_data_path",
            "train_dataset_path",
            "val_dataset_path",
            "test_dataset_path",
        )
        return DataTransformationConfig(
            **{entry: getattr(section, entry) for entry in entry_names}
        )

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from transformers import AutoTokenizer
from src.utils.common import save_tokenizer
from src.exception import CustomException
from src import logger
import os
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
## 5. Update the components

class DataTransformation:
    """Tokenizes the train/val/test text splits and saves them as batched datasets."""

    # Column names read from the X_* and y_* CSV files respectively.
    TEXT_COLUMN = "TITLE"
    LABEL_COLUMN = "TARGET"

    def __init__(self, config: "DataTransformationConfig"):
        """Store the configuration that supplies every input and output path."""
        self.config = config

    @staticmethod
    def _read_column(csv_path, column: str) -> list:
        """Return a single column of the CSV at ``csv_path`` as a Python list."""
        return pd.read_csv(csv_path)[column].to_list()

    def tokenizing(self, model_name: str = 'distilbert-base-uncased', batch_size: int = 16):
        '''
        Tokenize the three text splits and persist them as batched datasets.

        Steps, in order: load the pretrained tokenizer, save it to
        ``config.tokenizer_data_path``, read the text/label CSVs, tokenize
        each split, wrap each split in a batched ``tf.data.Dataset`` of
        ``(encodings dict, label)`` elements, and save every dataset to its
        configured path.

        Args:
            model_name: Checkpoint name passed to
                ``AutoTokenizer.from_pretrained``.
            batch_size: Number of examples per batch; must be at least 1.

        Returns:
            Tuple ``(train_dataset, val_dataset, test_dataset)``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        '''
        # Fail before the tokenizer is loaded rather than at the batching step.
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

        cfg = self.config

        logger.info("Loading tokenizer from pretrained model")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        logger.info("Saving tokenizer from pretrained model")
        save_tokenizer(cfg.tokenizer_data_path, tokenizer)

        # split name -> (texts CSV, labels CSV, dataset output path)
        splits = {
            "train": (cfg.X_train_data_path, cfg.y_train_data_path, cfg.train_dataset_path),
            "val": (cfg.X_val_data_path, cfg.y_val_data_path, cfg.val_dataset_path),
            "test": (cfg.X_test_data_path, cfg.y_test_data_path, cfg.test_dataset_path),
        }

        logger.info("Reading in train, val and test text data")
        texts, labels = {}, {}
        for split, (x_path, y_path, _) in splits.items():
            texts[split] = self._read_column(x_path, self.TEXT_COLUMN)
            labels[split] = self._read_column(y_path, self.LABEL_COLUMN)

        logger.info("Tokenzing the train, val and test text data")
        # Each split is tokenized in its own call, so padding is decided per split.
        encodings = {
            split: tokenizer(texts[split], padding=True, truncation=True)
            for split in splits
        }

        logger.info("Generating the train, val and test datasets.")
        datasets = {
            split: tf.data.Dataset.from_tensor_slices(
                (dict(encodings[split]), labels[split])
            ).batch(batch_size)
            for split in splits
        }

        logger.info("Saving the train, val and test datasets.")
        for split, (_, _, dataset_path) in splits.items():
            datasets[split].save(dataset_path)

        return (
            datasets["train"],
            datasets["val"],
            datasets["test"],
        )

In [10]:
## 6. Update the pipeline

# Run the data-transformation stage end to end: load the configuration,
# build the stage config, then tokenize and save the three datasets.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.tokenizing()
except Exception:
    # Record the failure with its traceback in the project log, then re-raise
    # the original exception unchanged so the cell still fails visibly.
    logger.exception("Data transformation stage failed")
    raise

[2024-07-15 10:14:29,025: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-15 10:14:29,027: INFO: common: created directory at: output]
[2024-07-15 10:14:29,028: INFO: common: created directory at: output/data_transformation]
[2024-07-15 10:14:29,028: INFO: 1408085289: Loading tokenizer from pretrained model]
[2024-07-15 10:14:29,396: INFO: 1408085289: Saving tokenizer from pretrained model]
[2024-07-15 10:14:29,396: INFO: 1408085289: Reading in train, val and test text data]
[2024-07-15 10:14:29,502: INFO: 1408085289: Tokenzing the train, val and test text data]
[2024-07-15 10:14:30,260: INFO: 1408085289: Generating the train, val and test datasets.]
[2024-07-15 10:14:57,980: INFO: 1408085289: Saving the train, val and test datasets.]
