In [4]:
import os

In [5]:
%pwd

'd:\\Python\\MLProjectsPW\\Thyroid-Disease-Prediction\\research'

In [6]:
# Step up one directory so the relative paths used below resolve from the parent folder.
# NOTE(review): not idempotent — re-running this cell climbs another level each time.
os.chdir('../')

In [29]:
%pwd

'd:\\Python\\MLProjectsPW\\Thyroid-Disease-Prediction'

In [30]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings bundle for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory that receives the stage's output files.
    root_dir: Path
    # Location of the raw input data file.
    data_path: Path
    # Mapping whose keys are the dataset's column names.
    columns: dict
    

In [31]:
from ThyroidProject.constants import *
from ThyroidProject.utils.common import read_yaml, create_directories

In [32]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """
        Load the config, params and schema YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the same order as the arguments: config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config and the
        ``COLUMNS`` section of the loaded schema, and passes the stage's
        ``root_dir`` to ``create_directories`` before returning.

        Args:
            self (ConfigurationManager): The manager holding the loaded YAML content.

        Returns:
            DataTransformationConfig: root_dir and data_path from the config
            section, plus the schema's column definitions.

        """
        stage_section = self.config.data_transformation
        column_schema = self.schema.COLUMNS

        create_directories([stage_section.root_dir])

        return DataTransformationConfig(
            root_dir=stage_section.root_dir,
            data_path=stage_section.data_path,
            columns=column_schema,
        )

In [33]:
import os
from ThyroidProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [34]:
# Load the YAML settings and build the data-transformation config used by the cells below.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()

[2024-01-26 21:10:45,634:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-01-26 21:10:45,636:INFO:common:yaml file: params.yaml loaded successfully]
[2024-01-26 21:10:45,640:INFO:common:yaml file: schema.yaml loaded successfully]
[2024-01-26 21:10:45,642:INFO:common:created directory at :artifacts]
[2024-01-26 21:10:45,642:INFO:common:created directory at :artifacts/data_transformation]


In [35]:
# Column names taken from the schema's COLUMNS section; these are later passed to pd.read_csv as `names`.
list(data_transformation_config.columns.keys())

['age',
 'sex',
 'on_thyroxine',
 'query_on_thyroxine',
 'on_antihyroid_meds',
 'sick',
 'pregnant',
 'thyroid_surgery',
 'I131_treatment',
 'query_hypothyroid',
 'query hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH_measured',
 'TSH',
 'T3_measured',
 'T3',
 'TT4_measured',
 'TT4',
 'T4U_measured',
 'T4U',
 'FTI_measured',
 'FTI',
 'TBG_measured',
 'TBG',
 'referral_source',
 'target']

In [36]:
class DataTransformation:
    """Cleans the raw thyroid records and writes train/test CSV splits."""

    # Lab measurements that must be numeric once "?" placeholders are turned into NaN.
    NUMERIC_COLUMNS = ("TSH", "T3", "TT4", "T4U", "FTI", "TBG")

    # Columns removed before modelling.
    REDUNDANT_COLUMNS = (
        "TSH_measured", "T3_measured", "TT4_measured", "T4U_measured",
        "FTI_measured", "TBG_measured", "referral_source", "TBG",
    )

    # Rows with an age above this value are discarded (the data contains e.g. 65526).
    MAX_AGE = 100

    # Diagnosis code -> coarse class. Rows whose code is not listed here are dropped.
    # 'H' was accepted by the original row filter but had no mapping entry, which
    # left NaN labels in the output; it is grouped with 'H|K' here.
    TARGET_GROUPS = {
        "-": "Negative",
        "A": "Hyperthyroid", "AK": "Hyperthyroid", "B": "Hyperthyroid",
        "C": "Hyperthyroid", "C|I": "Hyperthyroid", "D": "Hyperthyroid",
        "D|R": "Hyperthyroid",
        "E": "Hypothyroid", "F": "Hypothyroid", "FK": "Hypothyroid",
        "G": "Hypothyroid", "GK": "Hypothyroid", "GI": "Hypothyroid",
        "GKJ": "Hypothyroid", "H": "Hypothyroid", "H|K": "Hypothyroid",
    }

    # Coarse class -> integer label written to the output files.
    TARGET_CODES = {"Negative": 0, "Hypothyroid": 1, "Hyperthyroid": 2}

    def __init__(self, config: "DataTransformationConfig"):
        """
        Args:
            config (DataTransformationConfig): Supplies ``data_path`` (input CSV),
                ``columns`` (column names) and ``root_dir`` (output directory).
        """
        self.config = config

    def _clean_frame(self, raw: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``raw`` with an integer-encoded ``target`` column."""
        frame = raw.copy()

        # Raw labels look like "<code>[<patient id>]"; keep only the diagnosis code.
        frame["target"] = frame["target"].str.split("[").str[0]

        # "?" marks a missing value throughout the file.
        frame = frame.replace({"?": np.nan})
        frame = frame.astype(dict.fromkeys(self.NUMERIC_COLUMNS, float))

        # Discard implausible ages; rows with a missing age fail the comparison
        # and are removed as well.
        frame = frame[frame["age"] <= self.MAX_AGE]

        frame = frame.drop(columns=list(self.REDUNDANT_COLUMNS))

        # Keep only diagnoses that map to a class; copy so the assignments below
        # operate on an independent frame rather than a filtered view.
        frame = frame[frame["target"].isin(list(self.TARGET_GROUPS))].copy()

        # A pregnant patient with unknown sex is recorded as female.
        unknown_sex_pregnant = frame["sex"].isnull() & (frame["pregnant"] == "t")
        frame.loc[unknown_sex_pregnant, "sex"] = "F"

        # The remaining t/f flags and sex stay as strings; the commented-out
        # lines in the original noted they are one-hot encoded later.
        frame["target"] = (
            frame["target"].map(self.TARGET_GROUPS).map(self.TARGET_CODES)
        )
        return frame

    def data_clenser_splitter(self, test_size: float = 0.25, random_state: int = 0) -> None:
        """
        Read the raw CSV, clean it, and write ``train.csv`` / ``test.csv``.

        The input is read from ``config.data_path`` using the keys of
        ``config.columns`` as column names; both outputs are written into
        ``config.root_dir`` without the index.

        Args:
            self (DataTransformation): An instance of the DataTransformation class.
            test_size (float): Fraction of rows placed in the test set.
            random_state (int): Seed passed to ``train_test_split``.

        Returns:
            None

        """
        raw = pd.read_csv(self.config.data_path, names=list(self.config.columns))
        cleaned = self._clean_frame(raw)

        train, test = train_test_split(
            cleaned, test_size=test_size, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and testing sets")
        logger.info(f"Train data shape is: {train.shape}")
        logger.info(f"Test data shape is: {test.shape}")

In [37]:
# Run the data-transformation stage end to end: load settings, build the stage
# config, then clean the data and write the train/test splits.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.data_clenser_splitter()
except Exception:
    # Record the failure with its traceback in the project log, then re-raise
    # the same exception unchanged.
    logger.exception("Data transformation stage failed")
    raise

[2024-01-26 21:10:48,382:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-01-26 21:10:48,385:INFO:common:yaml file: params.yaml loaded successfully]
[2024-01-26 21:10:48,389:INFO:common:yaml file: schema.yaml loaded successfully]
[2024-01-26 21:10:48,392:INFO:common:created directory at :artifacts]
[2024-01-26 21:10:48,392:INFO:common:created directory at :artifacts/data_transformation]
[2024-01-26 21:10:48,528:INFO:2031194464:Splited data into training and testing sets]
[2024-01-26 21:10:48,528:INFO:2031194464:Train data shape is: (5756, 22)]
[2024-01-26 21:10:48,532:INFO:2031194464:Train data shape is: (1919, 22)]
