In [294]:
%matplotlib inline

In [355]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Feature manipulation

In [356]:
asthma_data = pd.read_csv("data/processed_asthma_desease_data.csv")

In [357]:
asthma_data.head()

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastroesophageal_reflux,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,0,3.470589,3.067944,1,1,1,0,0,1,0


To perform feature selection and engineering, i will create new interaction features and binning numerical features into categorical groups. Then i will convert categorical variables to numerical format using one-hot encoding. I will standardize numerical features using StandardScaler, and any missing values will be impute with the mean:

In [358]:
def create_new_features(data):
    """
    Creates new features by combining existing numerical features.

    Args:
        data: The input df containing the original features.

    Returns:
        df: The DataFrame with additional interaction features.
    """
    
    try:
        data['bmi_physical_activity'] = data['bmi'] * data['physical_activity']
        data['total_exposure'] = data['pollution_exposure'] + data['pollen_exposure'] + data['dust_exposure']

        bins = [0, 18, 30, 40, 50, 60, 100]
        labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']
        data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)
        
        bins = [0, 18.5, 24.9, 29.9, 40]
        labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']
        data['bmi_category'] = pd.cut(data['bmi'], bins=bins, labels=labels)
        
        return data
        
    except KeyError as e:
        print(f"KeyError: {e}")
        raise
        
    except Exception as e:
        print(f"An unexpected error occurred in create_new_features: {e}")
        raise

In [359]:
asthma_data = create_new_features(asthma_data)

In [360]:
def encode_categorical_features(data):
    """
    Encodes categorical features into numerical features using one-hot encoding.

    Args:
        data: The input DataFrame containing categorical features.

    Returns:
        df: The DataFrame with encoded categorical features.
    """
    
    try:
        categorical_columns = ['ethnicity', 'education_level', 'smoking', 'gender', 'age_group', 'bmi_category']
        
        encoder = OneHotEncoder(drop = 'first', sparse_output = False)

        encoded_features = encoder.fit_transform(data[categorical_columns])
        encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

        data = data.drop(categorical_columns, axis=1)
        data = pd.concat([data, encoded_df], axis=1)

        return data
    
    except KeyError as e:
        print(f"KeyError: {e}")
        raise
    except ValueError as e:
        print(f"ValueError: {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred in encode_categorical_features: {e}")
        raise

In [361]:
asthma_data = encode_categorical_features(asthma_data)

In [362]:
def scale_features(data):
    """
    Scales numerical features to have zero mean and unit variance.

    Args:
        data (pd.DataFrame): The input DataFrame containing numerical features.

    Returns:
        pd.DataFrame: The DataFrame with scaled numerical features.
    """
    
    numerical_features = ['bmi', 'physical_activity', 'diet_quality', 'sleep_quality','pollution_exposure', 'pollen_exposure',
                          'dust_exposure', 'lung_function_fev1', 'lung_function_fvc', 'bmi_physical_activity', 'total_exposure']
    
    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])
    
    return data

In [363]:
asthma_data = scale_features(asthma_data)

In [364]:
asthma_data

Unnamed: 0,age,bmi,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,pet_allergy,family_history_asthma,...,smoking_1,gender_1,age_group_19-30,age_group_31-40,age_group_41-50,age_group_51-60,age_group_60+,bmi_category_Obesity,bmi_category_Overweight,bmi_category_Underweight
0,63,-1.582769,-1.432099,0.160113,0.971063,0.809355,-0.780866,-1.401921,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,26,-0.623300,0.291269,0.453069,-1.076746,-1.036866,0.810184,0.560684,0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57,-1.229074,0.581330,1.434458,-0.102976,-1.210374,-1.267434,0.162295,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,40,1.565307,-1.256398,0.276233,-1.596880,-1.509757,0.849659,-0.355611,0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,61,-1.105686,-0.154081,-0.651625,1.504976,-1.373822,-0.713717,1.146977,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,0.252042,-0.699950,0.376978,0.740107,-0.861740,0.760717,-0.544470,1,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2388,18,-0.903322,0.259526,-0.218561,0.411163,0.927074,-0.980178,0.519779,0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2389,54,1.365905,-0.109067,1.096868,0.268175,-0.755772,-0.711873,1.574952,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2390,46,-0.527792,1.591768,0.804295,-0.174204,1.511361,0.898316,0.024327,0,1,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [365]:
def handle_missing_values(data):
    """
    Handles missing values in numerical features by imputing with the mean.

    Args:
        data : The input DataFrame containing numerical features with potential missing values.

    Returns:
        pd.DataFrame: The DataFrame with missing values imputed.
    """
    
    # numerical_features = ['bmi', 'physical_activity', 'diet_quality', 'sleep_quality', 'pollution_exposure', 
    #                       'pollen_exposure', 'dust_exposure', 'lung_function_fev1', 'lung_function_fvc',
    #                       'bmi_physical_activity', 'total_exposure']
    try:
        imputer = SimpleImputer(strategy='mean')
        numerical_features = data.select_dtypes(include=['float64']).columns
        data[numerical_features] = imputer.fit_transform(data[numerical_features])
        return data
        
    except KeyError as e:
        print(f"KeyError: {e}")
        raise
        
    except Exception as e:
        print(f"An unexpected error occurred in handle_missing_values: {e}")
        raise

In [366]:
handle_missing_values(prepared_data)

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,bmi_physical_activity,total_exposure,age_group,bmi_category
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1,0,0,1,0,14.175883,11.218398,60+,Underweight
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,0,1,1,1,0,134.205776,16.012134,19-30,Normal weight
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,1,1,0,1,1,0,123.973328,8.354581,51-60,Underweight
3,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,1,1,1,0,0,54.094811,12.119066,41-50,Obesity
4,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,1,1,0,0,1,0,88.792124,12.291287,60+,Normal weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,2.483829,...,1,0,0,0,1,1,87.755774,13.223856,41-50,Overweight
2388,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,7.733983,...,0,0,1,1,0,1,120.404371,16.480757,19-30,Normal weight
2389,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,2.794847,...,0,1,1,0,1,1,175.577966,15.333999,51-60,Obesity
2390,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,9.448862,...,1,1,0,1,1,0,226.772189,22.212851,41-50,Normal weight


## Feature selection:

In [367]:
correlation_matrix = asthma_data.corr()
print(correlation_matrix.diagnosis.sort_values(ascending=False))

diagnosis                   1.000000
exercise_induced            0.053956
education_level_3           0.044432
lung_function_fvc           0.029629
wheezing                    0.027197
lung_function_fev1          0.023336
gastroesophageal_reflux     0.022770
ethnicity_3                 0.022309
sleep_quality               0.018022
pollen_exposure             0.015099
bmi_category_Overweight     0.010725
education_level_1           0.010182
age_group_60+               0.006644
physical_activity           0.005066
age_group_31-40             0.003446
gender_1                    0.003128
ethnicity_1                 0.001778
age_group_41-50             0.001659
age_group_19-30             0.001016
family_history_asthma      -0.001334
bmi_physical_activity      -0.001640
history_of_allergies       -0.001951
diet_quality               -0.003149
pollution_exposure         -0.004535
ethnicity_2                -0.005584
eczema                     -0.008592
total_exposure             -0.008746
b

In [372]:
threshold = 0.01

correlation_matrix = asthma_data.corr()

correlation_with_target = correlation_matrix['diagnosis']
important_features = correlation_with_target[correlation_with_target.abs() > threshold].index.tolist()
print(f"Important features: {important_features}")

Important features: ['age', 'bmi', 'sleep_quality', 'pollen_exposure', 'dust_exposure', 'pet_allergy', 'hay_fever', 'gastroesophageal_reflux', 'lung_function_fev1', 'lung_function_fvc', 'wheezing', 'shortness_of_breath', 'chest_tightness', 'coughing', 'nighttime_symptoms', 'exercise_induced', 'diagnosis', 'ethnicity_3', 'education_level_1', 'education_level_2', 'education_level_3', 'smoking_1', 'age_group_51-60', 'bmi_category_Obesity', 'bmi_category_Overweight']
