In [1]:
import os

In [2]:
%pwd

'D:\\Machine_Learning\\Portfolio_Project_one\\Experimentation'

In [3]:
# Step up from the Experimentation folder to the project root so the relative
# config/artifact paths used by later cells resolve.
# Guarded on the current folder name so that re-running this cell does not
# climb one more directory each time.
if os.path.basename(os.getcwd()) == 'Experimentation':
    os.chdir('../')

In [4]:
%pwd

'D:\\Machine_Learning\\Portfolio_Project_one'

In [30]:
# ENTITY
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    test_size: float
    random_state: None|float


In [31]:
# Configuration Imports

from portfolio_project_one.constants import *
from portfolio_project_one.utils.common import read_yaml, create_directories

In [32]:
#  Updating Configuration Manager
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    The three YAML files (config, params, schema) are read once in
    ``__init__`` and kept on the instance; the ``get_*_config`` method then
    picks the section for a stage and packs it into its dataclass.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,                     # These were all defined in constants
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Read the YAML files and create the top-level artifact directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # read_yaml is a project helper; the code below relies on its result
        # supporting attribute access (e.g. ``self.config.artifact_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifact_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose paths come from the
            ``data_transformation`` section of the config YAML and whose
            split parameters come from the same section of the params YAML.
        """
        stage_config = self.config.data_transformation
        stage_params = self.params.data_transformation

        create_directories([stage_config.root_dir])

        # The dataclass declares both path fields as Path, so hand it real
        # Path objects rather than the raw values read from YAML.
        return DataTransformationConfig(
            root_dir=Path(stage_config.root_dir),
            data_path=Path(stage_config.data_path),
            test_size=stage_params.test_size,
            random_state=stage_params.random_state,
        )

In [33]:
#  Components

import os
from portfolio_project_one import logger
from sklearn.model_selection import train_test_split
import pandas as pd

class DataTransformtion:
    """Split the configured dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Input/output paths and split parameters for this stage.
        """
        self.config = config

    def train_test_split(self):
        """Read the dataset, split it, and write ``train.csv`` / ``test.csv``.

        The CSV at ``config.data_path`` is loaded with pandas, split using
        ``config.test_size`` and ``config.random_state``, and both parts are
        written (without the index) into ``config.root_dir``. The resulting
        shapes are logged and printed. Nothing is returned.
        """
        df = pd.read_csv(self.config.data_path)

        # Inside a method body the bare name `train_test_split` resolves to
        # the module-level scikit-learn function, not to this method.
        train_set, test_set = train_test_split(
            df,
            test_size=self.config.test_size,
            random_state=self.config.random_state,
        )

        # Make sure the output directory exists even if the caller has not
        # created it beforehand; otherwise to_csv fails on a missing folder.
        os.makedirs(self.config.root_dir, exist_ok=True)
        train_set.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index = False)
        test_set.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index = False)

        logger.info("Splitted data into training and test datasets")
        logger.info(f' Training dataset shape:  {train_set.shape}')
        logger.info(f' Testing dataset shape:  {test_set.shape}')

        print(f' Training dataset shape:  {train_set.shape}. \n Testing dataset shape:  {test_set.shape}')
        

In [34]:
#  Pipeline

# Run the data-transformation stage end to end.
# No try/except here: a handler that only re-raises adds nothing, and any
# failure should surface with its original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformtion(config= data_transformation_config)
# test_size and random_state are read from params.yaml; for this run the
# output below corresponds to a 20 percent test split (per the original note,
# random_state is None).
data_transformation.train_test_split()

[2023-10-08 15:47:22,410: INFO: common: yaml file: config\config.yaml loaded succesfully]
[2023-10-08 15:47:22,410: INFO: common: yaml file: params.yaml loaded succesfully]
[2023-10-08 15:47:22,422: INFO: common: yaml file: schema.yaml loaded succesfully]
[2023-10-08 15:47:22,422: INFO: common: Created directory at : artifacts]
[2023-10-08 15:47:22,422: INFO: common: Created directory at : artifacts/data_transformation]
[2023-10-08 15:47:22,454: INFO: 2971003245: Splitted data into training and test datasets]
[2023-10-08 15:47:22,470: INFO: 2971003245:  Training dataset shape:  (1279, 12)]
[2023-10-08 15:47:22,472: INFO: 2971003245:  Testing dataset shape:  (320, 12)]
 Training dataset shape:  (1279, 12). 
 Testing dataset shape:  (320, 12)


In [36]:
# Same run after changing test_size in params.yaml to 0.5.
# A fresh ConfigurationManager is created so the edited params.yaml is read
# again. No try/except: a handler that only re-raises adds nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformtion(config= data_transformation_config)
# Split parameters come from params.yaml; test_size is now 0.5, so the two
# output files end up roughly equal in size.
data_transformation.train_test_split()

[2023-10-08 15:47:36,215: INFO: common: yaml file: config\config.yaml loaded succesfully]
[2023-10-08 15:47:36,226: INFO: common: yaml file: params.yaml loaded succesfully]
[2023-10-08 15:47:36,232: INFO: common: yaml file: schema.yaml loaded succesfully]
[2023-10-08 15:47:36,234: INFO: common: Created directory at : artifacts]
[2023-10-08 15:47:36,236: INFO: common: Created directory at : artifacts/data_transformation]
[2023-10-08 15:47:36,278: INFO: 2971003245: Splitted data into training and test datasets]
[2023-10-08 15:47:36,278: INFO: 2971003245:  Training dataset shape:  (799, 12)]
[2023-10-08 15:47:36,278: INFO: 2971003245:  Testing dataset shape:  (800, 12)]
 Training dataset shape:  (799, 12). 
 Testing dataset shape:  (800, 12)
