In [22]:
import os
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [2]:
# Resolve <cwd>/../../data/healthcare_dataset.csv and read it into a DataFrame.
current_directory = os.path.abspath(os.getcwd())
parent_directory = os.path.join(current_directory, '..')
grandparent_directory = os.path.join(current_directory, '..', '..')
data_directory = os.path.join(current_directory, '..', '..', 'data')
csv_path = os.path.join(data_directory, 'healthcare_dataset.csv')

# The path depends on the kernel's working directory, not on the notebook file location.
df = pd.read_csv(csv_path)

In [3]:
# Remove the columns that are not used downstream.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone is a
# no-op instead of a KeyError.
df = df.drop(
    columns=['Name', 'Doctor', 'Hospital', 'Insurance Provider', 'Room Number', 'Date of Admission', 'Discharge Date'],
    errors='ignore',
)

In [4]:
# Preview the first five rows, transposed so every column is shown as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
Age,81,35,61,49,51
Gender,Female,Male,Male,Male,Male
Blood Type,O-,O+,B-,B-,O-
Medical Condition,Diabetes,Asthma,Obesity,Asthma,Arthritis
Billing Amount,37490.983364,47304.064845,36874.896997,23303.322092,18086.344184
Admission Type,Elective,Emergency,Emergency,Urgent,Urgent
Medication,Aspirin,Lipitor,Lipitor,Penicillin,Paracetamol
Test Results,Inconclusive,Normal,Normal,Abnormal,Normal


## Data Preprocessing

In [43]:
def preprocess_data(data):
    """Impute, encode and scale a raw DataFrame.

    Numeric columns are mean-imputed and then standardized. Categorical
    columns are imputed with their most frequent value and then encoded
    twice: once as one-hot indicator columns named ``"<feature>_<category>"``
    and once as integer labels stored under the original column name.

    Args:
        data (pandas.DataFrame): The input data. It is not modified.
    Returns:
        pandas.DataFrame: The preprocessed data, indexed like ``data``, with
        columns ordered as one-hot indicators, integer labels, then scaled
        numeric features. Columns that are neither numeric nor
        object/category typed (e.g. booleans, datetimes) are left out.
    """
    # Select by dtype family rather than exact dtype so int32/float32 and
    # category columns are not silently dropped. Timedeltas are excluded
    # explicitly because pandas treats them as a numeric subtype.
    numerical_features = data.select_dtypes(include='number', exclude='timedelta').columns
    categorical_features = data.select_dtypes(include=['object', 'category']).columns

    # Each group is processed only when it is non-empty; the sklearn
    # transformers raise when asked to fit a frame with zero columns.
    processed_parts = []

    if len(categorical_features) > 0:
        # Impute missing values for categorical features. The cast to object
        # sends category-typed columns down the same path as string columns.
        imputer_cat = SimpleImputer(strategy='most_frequent')
        data_imputed_cat = pd.DataFrame(
            imputer_cat.fit_transform(data[categorical_features].astype(object)),
            columns=categorical_features,
        )

        # One Hot Encoding categorical features
        encoder = OneHotEncoder(handle_unknown='ignore')
        categorical_encoded = pd.DataFrame(encoder.fit_transform(data_imputed_cat).toarray())
        categorical_encoded.columns = [
            f"{feature}_{category}"
            for feature, categories in zip(categorical_features, encoder.categories_)
            for category in categories
        ]

        # Label encoding categorical features. A fresh encoder per column
        # avoids sharing fitted state between unrelated columns.
        categorical_labels = data_imputed_cat.apply(
            lambda col: LabelEncoder().fit_transform(col)
        )

        processed_parts.extend([categorical_encoded, categorical_labels])

    if len(numerical_features) > 0:
        # Impute missing values for numerical features
        imputer_num = SimpleImputer(strategy='mean')
        data_imputed_num = pd.DataFrame(
            imputer_num.fit_transform(data[numerical_features]),
            columns=numerical_features,
        )

        # Normalizing numerical features
        scaler = StandardScaler()
        numerical_scaled = pd.DataFrame(
            scaler.fit_transform(data_imputed_num),
            columns=numerical_features,
        )

        processed_parts.append(numerical_scaled)

    if not processed_parts:
        # Nothing to encode or scale: return an empty frame with the same rows.
        return pd.DataFrame(index=data.index)

    # Combining processed data. Every part was built on a fresh 0..n-1 index,
    # so they align positionally; the caller's index is restored afterwards so
    # rows stay traceable to the input.
    preprocessed_data = pd.concat(processed_parts, axis=1)
    preprocessed_data.index = data.index
    return preprocessed_data

In [44]:
# Run the full preprocessing step on the loaded frame.
processed_data = preprocess_data(data=df)

In [45]:
# Preview the first five preprocessed rows.
processed_data.head(n=5)

Unnamed: 0,Gender_Female,Gender_Male,Blood Type_A+,Blood Type_A-,Blood Type_AB+,Blood Type_AB-,Blood Type_B+,Blood Type_B-,Blood Type_O+,Blood Type_O-,...,Test Results_Inconclusive,Test Results_Normal,Gender,Blood Type,Medical Condition,Admission Type,Medication,Test Results,Age,Billing Amount
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0,7,3,0,0,1,1.508465,0.851249
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1,6,1,1,2,2,-0.839912,1.548866
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1,5,5,1,2,2,0.487431,0.807452
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1,5,1,2,4,0,-0.125189,-0.157358
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1,7,0,2,3,2,-0.023086,-0.528235


## Internal Data Partitioning
Split the preprocessed data into training and testing sets.

In [47]:
# Necessary imports
from sklearn.model_selection import train_test_split

In [51]:
def split_data(data, column='Gender', test_size=0.2, random_state=42, drop_derived=True):
    """Split the input data into training and testing features and targets.

    Gender is used as the default target column.

    Args:
        data (pandas.DataFrame): The input data, including the target column.
        column (str): Name of the target column. It is removed from the
            features and returned as ``y``.
        test_size (float): The proportion of the dataset to include in the test split.
        random_state (int): Controls the shuffling applied to the data before applying the split.
        drop_derived (bool): When True (the default), feature columns whose
            name starts with ``"<column>_"`` are removed as well. That is the
            naming pattern ``preprocess_data`` gives to one-hot indicators
            (e.g. ``Gender_Female``), which would otherwise leak the target
            into the features. Pass False to keep them.
    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the ``X_*``
        entries are feature DataFrames and the ``y_*`` entries are the
        matching target Series.
    Raises:
        KeyError: If ``column`` is not a column of ``data``.
    """
    if column not in data.columns:
        raise KeyError(f"Target column {column!r} not found in data")

    # Collect one-hot indicators derived from the target so they do not stay
    # in X alongside the label they encode.
    derived_columns = []
    if drop_derived:
        prefix = f"{column}_"
        derived_columns = [
            name for name in data.columns
            if isinstance(name, str) and name.startswith(prefix)
        ]

    X = data.drop(columns=[column, *derived_columns])
    y = data[column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [52]:
# Hold out part of the preprocessed data using split_data's default target and split settings.
X_train, X_val, y_train, y_val = split_data(data=processed_data)