In [18]:
import os

In [19]:
%pwd

'/Users/suyash/Desktop/projects/Intent-classification-'

In [3]:
# Step up to the project root only when we are not already there.
# An unconditional chdir moves one level higher every time this cell is re-run,
# which breaks the later `from src.ic...` imports. The presence of the `src`
# package directory is used as the "already at the root" marker.
if not os.path.isdir('src'):
    os.chdir('../')

In [4]:
%pwd

'/Users/suyash/Desktop/projects/Intent-classification-'

In [58]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation stage.

    Instances are built by ConfigurationManager.get_data_transformation_config
    and consumed by DataTransformation.
    """

    # Directory for this stage; created by ConfigurationManager before use.
    root_dir: Path
    # CSV file that DataTransformation.convert reads with pandas.
    train_path: Path
    # Value handed to AutoTokenizer.from_pretrained.
    # NOTE(review): annotated as Path, but this may be a model identifier string
    # rather than a filesystem path - confirm against the YAML config.
    tokenizer_name: Path
    # Directory where transformed_data.csv / transformed_data.json are written.
    transformed_data_path: Path


In [59]:
from src.ic.constants import *
from src.ic.utils.common import read_yaml,create_directories

In [60]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects.

    Args:
        config_path: Location of the main configuration YAML.
        params_filepath: Location of the parameters YAML.

    Attributes:
        config: Parsed content of ``config_path`` (as returned by ``read_yaml``).
        params: Parsed content of ``params_filepath``.
        paramss: Alias of ``params``, kept only because the attribute was
            originally published under this misspelled name.
    """

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_filepath)
        # Backward-compatible alias for the original (misspelled) attribute name.
        self.paramss = self.params

        # The artifacts root must exist before any stage writes into it.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the DataTransformationConfig from the ``data_transformation`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataTransformationConfig whose filesystem fields are ``Path`` objects,
            matching the dataclass annotations.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            train_path=Path(section.train_path),
            # Passed through untouched: this is given to
            # AutoTokenizer.from_pretrained and may be a model identifier
            # rather than a filesystem path.
            tokenizer_name=section.tokenizer_name,
            transformed_data_path=Path(section.transformed_data_path),
        )



In [61]:
import pandas as pd

In [64]:
import os
import pandas as pd
import json
from transformers import AutoTokenizer


class DataTransformation:
    """Tokenizes an intent/utterance CSV row by row and saves the result as CSV and JSON.

    Args:
        config: Stage settings. ``tokenizer_name`` is loaded with
            ``AutoTokenizer.from_pretrained``; ``train_path`` is the default input
            CSV; ``transformed_data_path`` is the output directory.
    """

    # Columns every input row must provide.
    _REQUIRED_COLUMNS = ('intent', 'utterance')

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "DataTransformationConfig"):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

        # Output files are written directly into this directory, so it must exist.
        os.makedirs(self.config.transformed_data_path, exist_ok=True)

    def convert_examples_to_features(self, example_batch):
        """Tokenize one example and return its model features.

        Args:
            example_batch: Mapping-like object with ``'intent'`` and
                ``'utterance'`` entries (``convert`` passes one DataFrame row).

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            tokenized ``intent`` and ``'labels'`` taken from the ``input_ids`` of
            the tokenized ``utterance``.
        """
        # NOTE(review): the intent is used as the model input and the utterance as
        # the label. For intent classification the opposite direction would be
        # expected - confirm this is intentional before training on the output.
        # NOTE(review): each call receives a single example, so padding=True
        # presumably adds no padding (nothing longer to pad to) - confirm.
        input_encodings = self.tokenizer(example_batch['intent'], padding=True, truncation=True)
        target_encodings = self.tokenizer(example_batch['utterance'], padding=True, truncation=True)

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self, dataset_path=None):
        """Transform a CSV dataset and write it to the configured output directory.

        Args:
            dataset_path: CSV file to transform. Defaults to
                ``self.config.train_path`` when omitted.

        Raises:
            KeyError: If the CSV lacks the ``intent`` or ``utterance`` column.

        Side effects:
            Writes ``transformed_data.csv`` and ``transformed_data.json`` into
            ``self.config.transformed_data_path`` and prints their locations.
        """
        # Only fall back to the configured path when the caller gave none.
        if dataset_path is None:
            dataset_path = self.config.train_path
        df = pd.read_csv(dataset_path)

        # Fail early with a clear message instead of midway through tokenization.
        missing = [col for col in self._REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(f"{dataset_path} is missing required column(s): {missing}")

        # An explicit row loop (rather than DataFrame.apply) also works for an
        # empty frame, where apply() would not return a Series of dicts.
        features = [self.convert_examples_to_features(row) for _, row in df.iterrows()]
        transformed_df = pd.DataFrame(features)

        # Save as CSV and JSON
        transformed_csv_path = os.path.join(self.config.transformed_data_path, "transformed_data.csv")
        transformed_json_path = os.path.join(self.config.transformed_data_path, "transformed_data.json")

        transformed_df.to_csv(transformed_csv_path, index=False)

        with open(transformed_json_path, "w") as f:
            json.dump(transformed_df.to_dict(orient="records"), f, indent=4)

        print(f"Transformed data saved to:\n- {transformed_csv_path}\n- {transformed_json_path}")


In [65]:
# Build the stage settings from the YAML configuration, then run the transformation.
# NOTE: `config` here holds the ConfigurationManager instance, not a config mapping.
config=ConfigurationManager()
data_transformation_config=config.get_data_transformation_config()
data_transformation=DataTransformation(config=data_transformation_config)
# Reads the CSV at train_path and writes transformed_data.csv / transformed_data.json.
data_transformation.convert()


[2025-02-18 01:51:26,489: INFO: common: created directory at artifacts]
[2025-02-18 01:51:26,490: INFO: common: created directory at artifacts/data_transformation]
Transformed data saved to:
- artifacts/data_transformation/transformed_data.csv
- artifacts/data_transformation/transformed_data.json
