In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Text-Data-Translation-Trials\\notebook'

In [3]:
# Step up to the project root only when we are not already there, so that
# re-running this cell does not keep climbing the directory tree.
# NOTE(review): assumes the project root is the folder that contains `src/`
# (the package imported further down) — confirm for other layouts.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Text-Data-Translation-Trials'

In [5]:
## 3. Update the entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable (frozen) bundle of the paths used by the data-ingestion step.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the loaded config and consumed by
    ``DataIngestion``.
    """
    # Directory for this step; the configuration manager creates it.
    root_dir: Path
    # Raw text file that DataIngestion passes to read_text().
    input_data_path: Path
    # Destination that DataIngestion passes to np.save() for the cleaned array.
    word_array_path: Path

In [6]:
## 3 (continued). Import the project constants and the YAML/directory helpers

from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## 4. Update the configuration manager in src config

class ConfigurationManager:
    """Loads the project config file and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the config file and create the top-level output directory.

        Args:
            config_filepath: Location of the YAML config file; defaults to
                ``CONFIG_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.output_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Returns:
            DataIngestionConfig: ``root_dir``, ``input_data_path`` and
            ``word_array_path`` copied from the ``data_ingestion`` section
            of the loaded config.
        """
        ingestion_section = self.config.data_ingestion

        # The stage's working directory must exist before anything is written.
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            input_data_path=ingestion_section.input_data_path,
            word_array_path=ingestion_section.word_array_path,
        )

In [8]:
## 5. Update the components

import os
import sys
from src.exception import CustomException
from src import logger
import pandas as pd
from dataclasses import dataclass
from src.utils.common import read_text, to_lines 
import numpy as np
from numpy import array
import string

In [9]:
## 5. Update the components

class DataIngestion:
    """Reads the raw text file, normalises its first two columns and saves the
    result as a NumPy array.

    NOTE(review): the variable name ``fra_eng`` suggests the rows are
    French/English sentence pairs — confirm against the input data.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Supplies ``input_data_path`` (file to read) and
                ``word_array_path`` (where the cleaned array is saved).
        """
        self.config = config

    def initiate_data_ingestion(self, max_pairs=50000):
        """Load, clean and persist the text rows.

        Steps: read the input file, split it into rows with ``to_lines``,
        keep at most ``max_pairs`` rows, strip ASCII punctuation from and
        lowercase columns 0 and 1, then save the array with ``np.save``.

        Args:
            max_pairs: Maximum number of leading rows to keep. ``None`` keeps
                every row. Defaults to 50000, the previously hard-coded cap.

        Returns:
            The cleaned 2-D NumPy array that was written to
            ``config.word_array_path``.

        Raises:
            CustomException: Wraps any exception raised while reading,
                cleaning or saving; the original is chained as ``__cause__``.
        """
        logger.info("Data Ingestion")

        try:
            logger.info("Reading the data")

            data = read_text(self.config.input_data_path)

            fra_eng = array(to_lines(data))

            fra_eng = fra_eng[:max_pairs, :]

            # Build the translation table once instead of once per string.
            strip_punctuation = str.maketrans('', '', string.punctuation)

            # Remove punctuation and lowercase in a single pass per column.
            for col in (0, 1):
                fra_eng[:, col] = [
                    s.translate(strip_punctuation).lower() for s in fra_eng[:, col]
                ]

            # dirname() is '' for a bare filename and os.makedirs('') raises,
            # so only create the directory when there is one to create.
            out_dir = os.path.dirname(self.config.word_array_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

            logger.info("Saving the data")

            np.save(self.config.word_array_path, fra_eng)

            logger.info("Ingestion of the data is completed")

            return fra_eng

        except Exception as e:
            # Chain explicitly so the root cause stays attached to the wrapper.
            raise CustomException(e, sys) from e

In [10]:
## 6. Update the pipeline

# Run the ingestion stage end to end. Any exception propagates unchanged, so
# no try/except wrapper is needed just to re-raise it.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
# Bind the result so the notebook does not echo the whole array as cell output.
fra_eng = data_ingestion.initiate_data_ingestion()

[2024-07-16 19:28:41,318: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-16 19:28:41,323: INFO: common: created directory at: output]
[2024-07-16 19:28:41,324: INFO: common: created directory at: output/data_ingestion]
[2024-07-16 19:28:41,325: INFO: 1398589606: Data Ingestion]
[2024-07-16 19:28:41,326: INFO: 1398589606: Reading the data]
[2024-07-16 19:28:42,554: INFO: 1398589606: Saving the data]
[2024-07-16 19:28:45,990: INFO: 1398589606: Ingestion of the data is completed]
