# Task 3: Data Preprocessing & Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer

# Load the data from the CSV file
df = pd.read_csv('Global_Health_Statistics.csv')  # Replace 'your_file.csv' with the actual file path

# Define numerical and categorical features
numerical_features = ['Prevalence Rate (%)', 'Incidence Rate (%)', 'Mortality Rate (%)', 'Healthcare Access (%)', 'Doctors per 1000', 'Recovery Rate (%)']
categorical_features = ['Country', 'Disease Name', 'Disease Category', 'Age Group', 'Gender']

# Step 1: Handle Missing Values
numerical_imputer = SimpleImputer(strategy='median')  # Fill numerical features with median
categorical_imputer = SimpleImputer(strategy='most_frequent')  # Fill categorical features with mode

# Step 2: Convert Categorical Features to Numerical
categorical_encoder = OneHotEncoder(handle_unknown='ignore')  # One-Hot Encoding for categorical features

# Step 3: Scale Numerical Features
numerical_scaler = MinMaxScaler()  # MinMax Scaling for numerical features

# Step 4: Create New Features (Example: Feature Interaction)
def create_new_features(X):
    X['Healthcare_Doctors_Interaction'] = X['Healthcare Access (%)'] * X['Doctors per 1000']
    return X

feature_engineering = FunctionTransformer(create_new_features)

# Create a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', numerical_imputer),
            ('scaler', numerical_scaler)
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', categorical_imputer),
            ('encoder', categorical_encoder)
        ]), categorical_features)
    ]
)

# Combine preprocessing and feature engineering into a single pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor)
])

# Apply the pipeline to the data
processed_data = pipeline.fit_transform(df)

# Convert the processed data back to a DataFrame (for readability)
# Note: After OneHotEncoding, the categorical features will be expanded into multiple columns
processed_columns = numerical_features + list(pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features))
processed_df = pd.DataFrame(processed_data.toarray(), columns=processed_columns)

# Save the processed data to a new CSV file (optional)
# processed_df.to_csv('processed_data.csv', index=False)

print(processed_df.head())

   Prevalence Rate (%)  Incidence Rate (%)  Mortality Rate (%)  \
0             0.042714            0.097315            0.840404   
1             0.621106            0.572483            0.873737   
2             0.040704            0.151007            0.618182   
3             0.230151            0.415436            0.392929   
4             0.036683            0.905369            0.697980   

   Healthcare Access (%)  Doctors per 1000  Recovery Rate (%)  \
0                 0.1548          0.631111           0.853469   
1                 0.7842          0.184444           0.543878   
2                 0.1282          0.793333           0.990816   
3                 0.7040          0.595556           0.354082   
4                 0.3400          0.913333           0.001224   

   Country_Argentina  Country_Australia  Country_Brazil  Country_Canada  ...  \
0                0.0                0.0             0.0             0.0  ...   
1                0.0                0.0             