In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Documents\\GitHub\\Text-Data-Translation\\notebook'

In [3]:
# Move from the notebook folder up to the repository root so that the `src`
# package imports and the config-relative paths used below resolve.
# Guarded so that re-running this cell does not climb one more level each time.
# Assumes the repository root is the directory that contains `src` - TODO confirm.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Documents\\GitHub\\Text-Data-Translation'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the paths used by the data-transformation stage.

    The annotations are not enforced at runtime; ConfigurationManager forwards
    the values exactly as they come out of the loaded configuration.
    """
    # Working directory of the stage; ConfigurationManager creates it.
    root_dir: Path
    # Input array loaded with np.load; column 0 is handled as English text and
    # column 1 as French text by DataTransformation.tokenizing.
    word_array_path: Path
    # Destinations passed to save_tokenizer for the English / French tokenizers.
    eng_tokenizer_data_path: Path
    fra_tokenizer_data_path: Path
    # Destination of the raw (not encoded) test split.
    test_array_path: Path
    # Destinations of the encoded splits: X_* come from the French column,
    # y_* from the English column.
    X_train_array_path: Path
    y_train_array_path: Path
    X_test_array_path: Path
    y_test_array_path: Path


In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## Update the configuration manager in src config

class ConfigurationManager:
    """Loads the project configuration and hands out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the configuration file and create the top-level output directory.

        Args:
            config_filepath: Location of the YAML configuration; defaults to
                CONFIG_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)

        output_root = self.config.output_root
        create_directories([output_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig filled from the `data_transformation`
            section of the loaded configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # NOTE(review): values are forwarded exactly as read_yaml returns them;
        # nothing here converts them to Path even though the dataclass
        # annotates its fields as Path - confirm whether callers rely on that.
        return DataTransformationConfig(
            root_dir=section.root_dir,
            word_array_path=section.word_array_path,
            eng_tokenizer_data_path=section.eng_tokenizer_data_path,
            fra_tokenizer_data_path=section.fra_tokenizer_data_path,
            test_array_path=section.test_array_path,
            X_train_array_path=section.X_train_array_path,
            y_train_array_path=section.y_train_array_path,
            X_test_array_path=section.X_test_array_path,
            y_test_array_path=section.y_test_array_path,
        )

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from transformers import AutoTokenizer
from src.utils.common import save_tokenizer, tokenization, encode_sequences
from src.exception import CustomException
from src import logger
import os
import sys
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
## 5. Update the components

class DataTransformation:
    '''
    Tokenizes the paired sentence array and writes the encoded train/test
    splits and the tokenizers to the paths given in the stage config.
    '''

    def __init__(self, config: DataTransformationConfig):
        '''
        Store the stage configuration.

        Args:
            config: Paths of the input array and of every artifact written by
                this stage.
        '''
        self.config = config

    def tokenizing(self, eng_length: int = 8, fra_length: int = 8):
        '''
        This function is responsible for tokenizing the data.

        Loads the sentence-pair array, builds one tokenizer per language,
        saves both tokenizers, splits the pairs 80/20 (random_state=12) and
        saves the raw test split together with the encoded train/test arrays.

        Args:
            eng_length: Length passed to encode_sequences for the English
                column (presumably the padded sequence length - confirm
                against src.utils.common). Defaults to 8.
            fra_length: Same for the French column. Defaults to 8.

        Returns:
            Tuple of the configured paths, in this order: test array,
            X_train, y_train, X_test, y_test.
        '''
        logger.info("Loading the array")

        fra_eng = np.load(self.config.word_array_path)

        logger.info("Tokenzing the text data")

        # Column 0 is handled as English text, column 1 as French text.
        # Both tokenizers are built from the full array, before the split.
        eng_tokenizer = tokenization(fra_eng[:, 0])
        fra_tokenizer = tokenization(fra_eng[:, 1])

        save_tokenizer(self.config.eng_tokenizer_data_path, eng_tokenizer)
        save_tokenizer(self.config.fra_tokenizer_data_path, fra_tokenizer)

        # split data into train and test set
        train, test = train_test_split(fra_eng, test_size=0.2, random_state=12)

        # Inputs (X) come from the French column, targets (y) from the English one.
        X_train = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
        y_train = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

        X_test = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
        y_test = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

        logger.info("Saving the train and test datasets.")

        # np.save appends ".npy" when the target name lacks that suffix, so the
        # configured paths should already end in ".npy" for the returned values
        # to point at the files actually written.
        np.save(self.config.test_array_path, test)
        np.save(self.config.X_train_array_path, X_train)
        np.save(self.config.y_train_array_path, y_train)
        np.save(self.config.X_test_array_path, X_test)
        np.save(self.config.y_test_array_path, y_test)

        return (
            self.config.test_array_path,
            self.config.X_train_array_path,
            self.config.y_train_array_path,
            self.config.X_test_array_path,
            self.config.y_test_array_path,
        )

In [10]:
## 6. Update the pipeline

# Run the data-transformation stage end to end: load the configuration, build
# the component, then tokenize and save the artifacts.
# No try/except wrapper: a handler that only re-raises adds nothing, so errors
# are left to propagate with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
# Trailing semicolon keeps the notebook from echoing the returned tuple of paths.
data_transformation.tokenizing();

[2024-07-17 06:15:12,755: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-17 06:15:12,755: INFO: common: created directory at: output]
[2024-07-17 06:15:12,755: INFO: common: created directory at: output/data_transformation]
[2024-07-17 06:15:12,755: INFO: 4200427557: Loading the array]
[2024-07-17 06:15:12,830: INFO: 4200427557: Tokenzing the text data]
[2024-07-17 06:15:14,936: INFO: 4200427557: Saving the train and test datasets.]
