In [1]:
import os

In [2]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Cancer-Prediction-Trials\\notebook'

In [3]:
# Move from the notebook folder up to the project root, but only while we are
# still inside that folder, so re-running this cell does not climb another level.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Vincent\\Desktop\\Cancer-Prediction-Trials'

In [5]:
## 3. Update the entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the filesystem locations used by data ingestion.

    Every field is normalised to a ``pathlib.Path`` on construction, so the
    annotations hold even when the values are supplied as plain strings.
    """

    root_dir: Path             # directory for the data-ingestion stage
    processed_data_path: Path  # CSV file holding the processed dataset
    train_data_path: Path      # CSV file holding the training split
    test_data_path: Path       # CSV file holding the test split

    def __post_init__(self):
        # The dataclass is frozen, so plain attribute assignment is blocked;
        # object.__setattr__ is the supported way to normalise fields here.
        for field_name in (
            "root_dir",
            "processed_data_path",
            "train_data_path",
            "test_data_path",
        ):
            object.__setattr__(self, field_name, Path(getattr(self, field_name)))



In [6]:
## 4. Imports for the configuration manager

from src.constants import *
from src.utils.common import read_yaml, create_directories

In [7]:
## 4. Update the configuration manager in src config

class ConfigurationManager:
    """Reads the project configuration file and builds config entities from it."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Load the configuration and register the top-level output directory.

        Args:
            config_filepath: path handed to ``read_yaml``; defaults to
                ``CONFIG_FILE_PATH``.
        """
        loaded_config = read_yaml(config_filepath)
        self.config = loaded_config

        # The ``output_root`` entry is passed to create_directories up front.
        create_directories([loaded_config.output_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build a ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is passed to ``create_directories`` before
        the entity is built.

        Returns:
            DataIngestionConfig populated with the section's four path entries.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            processed_data_path=section.processed_data_path,
            train_data_path=section.train_data_path,
            test_data_path=section.test_data_path,
        )

In [8]:
## 5. Imports for the components

import sys
from src.exception import CustomException
from src import logger
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit


In [9]:
## 5. Update the components

class DataIngestion:
    """Builds the processed breast-cancer dataset and its train/test split.

    The work is done in two stages that communicate through files on disk:
    ``initiate_data_ingestion`` writes the processed CSV, and
    ``complete_data_ingestion`` reads it back and writes the two split CSVs.
    """

    # Feature columns removed from the raw dataset before it is saved.
    _DROPPED_FEATURES = [
        'mean perimeter', 'mean area', 'worst radius', 'worst perimeter',
        'worst area', 'fractal dimension error', 'mean fractal dimension',
        'radius error', 'texture error', 'smoothness error', 'symmetry error',
        'worst texture', 'worst smoothness', 'worst compactness',
        'worst concavity', 'worst concave points', 'worst symmetry',
        'worst fractal dimension', 'perimeter error', 'area error',
        'concavity error', 'concave points error', 'compactness error',
    ]

    def __init__(self, config: DataIngestionConfig):
        """Store the configuration that supplies all input/output paths."""
        self.config = config

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the directory that will contain ``file_path``, if it has one."""
        parent = os.path.dirname(file_path)
        # A bare file name has an empty dirname, and os.makedirs('') raises,
        # so only create the directory when there is one to create.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def initiate_data_ingestion(self):
        """Load the raw dataset, reduce and relabel it, and save it as CSV.

        Returns:
            The ``processed_data_path`` the CSV was written to.

        Raises:
            CustomException: wraps any exception raised during this stage.
        """
        logger.info("Initiate data ingestion method or component")

        try:
            bunch = load_breast_cancer()

            # Features and raw target side by side; the target is the last
            # column and is named 'malignant'.
            data = np.c_[bunch.data, bunch.target]
            column_names = np.append(bunch.feature_names, ['malignant'])
            df = pd.DataFrame(data, columns=column_names)

            df = df.drop(columns=self._DROPPED_FEATURES)

            df.columns = df.columns.str.replace(' ', '_')

            # Flip the raw target: 1.0 becomes 0 and every other value becomes 1.
            # NOTE(review): scikit-learn documents this dataset's classes as
            # 0 = malignant, 1 = benign, so after the flip 1 should mean
            # malignant -- confirm against the dataset docs.
            df['malignant'] = (df['malignant'] != 1.0).astype('int64')

            self._ensure_parent_dir(self.config.processed_data_path)

            df.to_csv(self.config.processed_data_path, index=False, header=True)

            return self.config.processed_data_path

        except Exception as e:
            raise CustomException(e,sys)

    def complete_data_ingestion(self, test_size=0.3, random_state=42):
        """Split the processed CSV into stratified train and test CSV files.

        Args:
            test_size: value forwarded to ``StratifiedShuffleSplit`` as
                ``test_size`` (default 0.3).
            random_state: seed forwarded to ``StratifiedShuffleSplit`` so the
                split is repeatable (default 42).

        Returns:
            Tuple ``(train_data_path, test_data_path)`` from the configuration.

        Raises:
            CustomException: wraps any exception raised during this stage.
        """
        logger.info("Resume data ingestion method or component")

        try:
            df = pd.read_csv(self.config.processed_data_path)

            logger.info('Read the dataset as dataframe')

            splitter = StratifiedShuffleSplit(
                n_splits=1, test_size=test_size, random_state=random_state
            )

            X = df.drop(columns=['malignant'])
            y = df['malignant']

            train_idx, test_idx = next(splitter.split(X, y))

            # split() yields positional row indices, so select rows by
            # position rather than by index label.
            train_set = df.iloc[train_idx]
            test_set = df.iloc[test_idx]

            logger.info("Train test split initiated")

            # The two outputs may live in different folders; prepare both.
            self._ensure_parent_dir(self.config.train_data_path)
            self._ensure_parent_dir(self.config.test_data_path)

            train_set.to_csv(self.config.train_data_path, index=False, header=True)

            test_set.to_csv(self.config.test_data_path, index=False, header=True)

            logger.info("Ingestion of the data is completed")

            return (
                self.config.train_data_path,
                self.config.test_data_path
            )
        except Exception as e:
            raise CustomException(e,sys)



In [10]:
## 6. Update the pipeline

# Run both ingestion stages in order. Any failure propagates unchanged, so no
# try/except wrapper that merely re-raises is needed around these calls.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.initiate_data_ingestion()
data_ingestion.complete_data_ingestion()

[2024-07-12 12:39:09,869: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-12 12:39:09,869: INFO: common: created directory at: output]
[2024-07-12 12:39:09,869: INFO: common: created directory at: output/data_ingestion]
[2024-07-12 12:39:09,869: INFO: 2862756393: Initiate data ingestion method or component]
[2024-07-12 12:39:09,901: INFO: 2862756393: Resume data ingestion method or component]
[2024-07-12 12:39:09,910: INFO: 2862756393: Read the dataset as dataframe]
[2024-07-12 12:39:09,910: INFO: 2862756393: Train test split initiated]
[2024-07-12 12:39:09,926: INFO: 2862756393: Ingestion of the data is completed]
