In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [7]:
# Extract
def extract_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Cells containing the literal ``'?'`` are parsed as missing values (NaN),
    in addition to pandas' default NA markers. The shape of the loaded frame
    is printed as a progress message.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    raw_frame = pd.read_csv(file_path, na_values='?')
    print("Data extracted. Shape:", raw_frame.shape)
    return raw_frame

In [8]:
def transform_data(data):
    """Impute, one-hot encode and standardise the heart-disease table.

    Steps:
      1. Fill missing values. Categorical code columns are filled with their
         most frequent value; every other column (including a label column
         such as ``target``, if present) is filled with its median.
      2. One-hot encode the categorical columns with ``pd.get_dummies``.
      3. Standardise the numerical columns with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the categorical and numerical columns listed below;
        a missing column raises ``KeyError``. All columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex. ``data`` itself is not modified.
    """
    categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
    numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    other_cols = [col for col in data.columns if col not in categorical_cols]

    # Handle missing values.
    # Categorical codes must be filled with an existing category: the median
    # of codes such as [0, 0, 0, 2, 2, 3] is 1.0, which is not a category and
    # would surface as a bogus dummy column after encoding.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    imputed_parts = [
        pd.DataFrame(
            categorical_imputer.fit_transform(data[categorical_cols]),
            columns=categorical_cols,
        )
    ]
    if other_cols:
        median_imputer = SimpleImputer(strategy='median')
        imputed_parts.append(
            pd.DataFrame(
                median_imputer.fit_transform(data[other_cols]),
                columns=other_cols,
            )
        )

    # Restore the input column order and force float dtype so the dummy
    # column names keep their existing form (e.g. 'sex_0.0', 'sex_1.0').
    data_imputed = pd.concat(imputed_parts, axis=1)[list(data.columns)].astype(float)

    # Convert categorical variables
    data_encoded = pd.get_dummies(data_imputed, columns=categorical_cols)

    # Scale numerical features
    # NOTE(review): the scaler is fitted on the whole table; if a train/test
    # split happens downstream, confirm this is intended.
    scaler = StandardScaler()
    data_encoded[numerical_cols] = scaler.fit_transform(data_encoded[numerical_cols])

    print("Data transformed. Shape:", data_encoded.shape)
    return data_encoded

In [9]:
# Load
def load_data(data, output_file):
    """Persist ``data`` as a CSV file at ``output_file``.

    The frame's index is not written. A confirmation message naming the
    destination is printed once the write has finished.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to write.
    output_file : path or buffer accepted by ``DataFrame.to_csv``.
    """
    data.to_csv(path_or_buf=output_file, index=False)
    print(f"Data loaded to {output_file}")

In [10]:
# ETL Pipeline
def etl_pipeline(input_file, output_file):
    """Run extract -> transform -> load and return the transformed frame.

    Parameters
    ----------
    input_file : source passed to ``extract_data``.
    output_file : destination passed to ``load_data``.

    Returns
    -------
    The object returned by ``transform_data``, after it has been handed to
    ``load_data``.
    """
    transformed_data = transform_data(extract_data(input_file))
    load_data(transformed_data, output_file)
    return transformed_data

In [13]:
# Run ETL
input_file = 'Heart Disease data.csv'
output_file = 'heart_disease_processed.csv'
processed_data = etl_pipeline(input_file, output_file)

print("\nETL Process Completed")
print("Processed Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
processed_data.info()
print("\nProcessed Data Sample:")
print(processed_data.head())

Data extracted. Shape: (1025, 14)
Data transformed. Shape: (1025, 31)
Data loaded to heart_disease_processed.csv

ETL Process Completed
Processed Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          1025 non-null   float64
 1   trestbps     1025 non-null   float64
 2   chol         1025 non-null   float64
 3   thalach      1025 non-null   float64
 4   oldpeak      1025 non-null   float64
 5   target       1025 non-null   float64
 6   sex_0.0      1025 non-null   bool   
 7   sex_1.0      1025 non-null   bool   
 8   cp_0.0       1025 non-null   bool   
 9   cp_1.0       1025 non-null   bool   
 10  cp_2.0       1025 non-null   bool   
 11  cp_3.0       1025 non-null   bool   
 12  fbs_0.0      1025 non-null   bool   
 13  fbs_1.0      1025 non-null   bool   
 14  restecg_0.0  1025 non-null   bool   
 15  restecg_1.0  1025