In [3]:
# Show the kernel's current working directory before it is changed further down.
%pwd

'/Users/suyash/Desktop/Intent-classification-/research'

In [4]:
import os

In [5]:
# Step up from the notebook folder to its parent directory (presumably the
# project root, so that relative paths such as the config files and the
# `artifacts/` folder resolve -- confirm against the repository layout).
# The guard makes the cell idempotent: an unconditional chdir('../') would
# climb one directory higher every time the cell is re-run.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [6]:
import os 
import yaml
from pathlib import Path
from box import ConfigBox
from box.exceptions import BoxValueError
from ensure import ensure_annotations
from src.ic.logging import logging
from src.ic.constants import *
from src.ic.utils.common import read_yaml,create_directories
# Define DataIngestionConfig class
class DataIngestionConfig:
    """Plain holder for the locations used by the data-ingestion step.

    The four values are stored exactly as received; nothing is validated,
    normalised or created on disk here.

    Attributes:
        root_dir: Root directory of the ingestion step.
        train_dir: Directory for the training split.
        test_dir: Directory for the test split.
        data_path: Location of the source data file.
    """

    def __init__(
        self,
        root_dir: str,
        train_dir: str,
        test_dir: str,
        data_path: str,
    ):
        # Pairwise assignment keeps the attribute creation order unchanged.
        self.root_dir, self.train_dir = root_dir, train_dir
        self.test_dir, self.data_path = test_dir, data_path

# Read YAML function with error handling
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Load a YAML file and wrap its contents in a ``ConfigBox``.

    Note: this definition replaces the ``read_yaml`` name imported from
    ``src.ic.utils.common`` earlier in the same cell.

    Args:
        path_to_yaml: Path of the YAML file to read.

    Returns:
        The parsed document wrapped in a ``ConfigBox``.

    Raises:
        ValueError: If the file cannot be opened or parsed, if it contains
            no document, or if its contents cannot be turned into a
            ``ConfigBox``. The underlying exception, when there is one, is
            attached as ``__cause__``.
    """
    try:
        with open(path_to_yaml) as yaml_file:
            contents = yaml.safe_load(yaml_file)
    except Exception as e:
        # Same message as before, but chained so the original traceback
        # (missing file, YAML syntax error, ...) is not lost.
        raise ValueError(f"Error reading YAML file: {e}") from e

    # safe_load yields None for an empty document; report that explicitly
    # instead of letting the ConfigBox constructor fail with an unrelated
    # message.
    if contents is None:
        raise ValueError(f"Error reading YAML file: {path_to_yaml} is empty")

    try:
        return ConfigBox(contents)
    except Exception as e:
        raise ValueError(f"Error reading YAML file: {e}") from e

# Function to create directories
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Make sure every directory in ``path_to_directories`` exists.

    Missing parent directories are created as well, and directories that
    already exist are left alone.

    Args:
        path_to_directories: List of directory paths to create.
        verbose: When truthy, a single info message listing all requested
            directories is logged after the loop has finished.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)

    if not verbose:
        return
    logging.info(f"Created directories: {path_to_directories}")

# ConfigurationManager to load configurations and directories
class ConfigurationManager:
    """Reads the config and params YAML files and hands out step configs."""

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_path: Location of the config YAML file.
            params_filepath: Location of the params YAML file.
        """
        # read_yaml is annotated to take a Path, so convert before each call.
        config_yaml = Path(config_path)
        self.config = read_yaml(config_yaml)
        params_yaml = Path(params_filepath)
        self.params = read_yaml(params_yaml)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Side effects: prints the section that was read and creates the
        root, train and test directories it names.
        """
        section = self.config.data_ingestion

        print(f"Config read from YAML: {section}")

        output_dirs = [section.root_dir, section.train_dir, section.test_dir]
        create_directories(output_dirs)

        return DataIngestionConfig(
            root_dir=section.root_dir,
            train_dir=section.train_dir,
            test_dir=section.test_dir,
            data_path=section.data_path,
        )

# Example usage: build the manager from the default config/params paths and
# fetch the data-ingestion settings. Both calls create directories as a side
# effect (artifacts root, then the ingestion root/train/test directories).
config_manager = ConfigurationManager()
data_ingestion_config = config_manager.get_data_ingestion_config()


[2025-01-19 09:02:31,332: INFO: 2954924526: Created directories: ['artifacts']]
Config read from YAML: {'root_dir': 'artifacts/dataingestion', 'data_path': 'data/Bitext_Sample_Customer_Service_Training_Dataset/Training/Bitext_Sample_Customer_Service_Training_Dataset.csv', 'train_dir': 'artifacts/data_ingestion/train', 'test_dir': 'artifacts/data_ingestion/test'}
[2025-01-19 09:02:31,333: INFO: 2954924526: Created directories: ['artifacts/dataingestion', 'artifacts/data_ingestion/train', 'artifacts/data_ingestion/test']]


In [7]:
import os
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from pathlib import Path

class DataIngestion:
    """Loads a CSV dataset and writes it out as train/test CSV files.

    The ``config`` object must expose ``data_path``, ``train_dir`` and
    ``test_dir`` attributes.
    """

    def __init__(self, config):
        """
        Initializes the DataIngestion class with a configuration object.
        """
        self.config = config

    def dataset_read(self, test_size=0.2, random_state=42):
        """
        Reads dataset from the given CSV path and saves it as train/test CSV files.

        If the loaded dataset already has both a 'train' and a 'test' split
        they are saved as they are; otherwise the 'train' split is divided
        with ``train_test_split``.

        Args:
            test_size: Forwarded to ``train_test_split`` when a manual split
                is needed (default 0.2, the previously hard-coded value).
            random_state: Forwarded to ``train_test_split`` (default 42, the
                previously hard-coded value).

        Returns:
            None. Files are written to ``<train_dir>/train.csv`` and
            ``<test_dir>/test.csv``.
        """
        try:
            # Load the dataset from the CSV file path
            dataset_dict = load_dataset("csv", data_files=self.config.data_path)

            # Ensure the directory for saving train and test datasets exists
            os.makedirs(self.config.train_dir, exist_ok=True)
            os.makedirs(self.config.test_dir, exist_ok=True)

            if 'train' in dataset_dict and 'test' in dataset_dict:
                # Both splits exist: save them unchanged
                train_df = dataset_dict['train'].to_pandas()
                test_df = dataset_dict['test'].to_pandas()
            else:
                # The manual split is taken from the 'train' split, so fail
                # with a clear message if that split is absent.
                if 'train' not in dataset_dict:
                    raise ValueError(
                        "dataset has no 'train' split to divide; "
                        f"available splits: {list(dataset_dict)}"
                    )

                print("Splitting dataset into train and test splits...")
                full_df = dataset_dict['train'].to_pandas()
                train_df, test_df = train_test_split(
                    full_df, test_size=test_size, random_state=random_state
                )

            # Saving is identical for both cases, so it is done once here.
            train_output_path = os.path.join(self.config.train_dir, 'train.csv')
            test_output_path = os.path.join(self.config.test_dir, 'test.csv')

            train_df.to_csv(train_output_path, index=False)
            test_df.to_csv(test_output_path, index=False)

            print(f"Train dataset saved to {train_output_path}")
            print(f"Test dataset saved to {test_output_path}")
        except Exception as e:
            # NOTE(review): failures are only printed, not re-raised, so a
            # caller cannot detect a failed ingestion. Kept as-is for
            # backward compatibility -- consider re-raising.
            print(f"Error reading or saving dataset: {str(e)}")


[2025-01-19 09:02:32,312: INFO: config: PyTorch version 2.5.1 available.]
[2025-01-19 09:02:32,313: INFO: config: TensorFlow version 2.18.0 available.]


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Rebuild the configuration manager: this re-reads both YAML files and
# re-creates the artifacts root (harmless, directories are made with exist_ok=True)
config_manager = ConfigurationManager()

# Fetch the paths used by the ingestion step (also prints the YAML section)
data_ingestion_config = config_manager.get_data_ingestion_config()

# Wire the ingestion step to those paths
data_ingestion = DataIngestion(config=data_ingestion_config)

# Load the CSV and write train.csv / test.csv. Errors are printed rather than
# raised, so check the cell output to confirm both files were saved.
data_ingestion.dataset_read()


[2025-01-19 09:02:33,583: INFO: 2954924526: Created directories: ['artifacts']]
Config read from YAML: {'root_dir': 'artifacts/dataingestion', 'data_path': 'data/Bitext_Sample_Customer_Service_Training_Dataset/Training/Bitext_Sample_Customer_Service_Training_Dataset.csv', 'train_dir': 'artifacts/data_ingestion/train', 'test_dir': 'artifacts/data_ingestion/test'}
[2025-01-19 09:02:33,584: INFO: 2954924526: Created directories: ['artifacts/dataingestion', 'artifacts/data_ingestion/train', 'artifacts/data_ingestion/test']]
Splitting dataset into train and test splits...
Train dataset saved to artifacts/data_ingestion/train/train.csv
Test dataset saved to artifacts/data_ingestion/test/test.csv
