In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from manipulation_functions import binarize_feature, log_transform_feature, add_polynomial_features, impute_missing_values

In [3]:
asthma_data_tidy = pd.read_pickle('asthma_data_tidy.pkl')

In [4]:
asthma_data_tidy

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastro_esophageal_reflux,lung_function_fev_1,lung_function_fv_c,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,0,3.470589,3.067944,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,2.483829,...,0,3.125249,5.166032,0,1,0,0,0,1,1
2388,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,7.733983,...,0,1.132977,5.509502,0,0,0,1,1,0,1
2389,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,2.794847,...,0,1.685962,3.346877,1,0,1,1,0,1,1
2390,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,9.448862,...,1,3.481549,1.713274,0,1,1,0,1,1,0


Lets standartdize the variables, so they have a mean of 0 and are ready for ML.

In [5]:
from sklearn.preprocessing import StandardScaler

# List of continuous variables to standardize
continuous_vars = ['age', 'bmi', 'physical_activity', 'diet_quality', 
                   'sleep_quality', 'pollution_exposure', 'lung_function_fev_1', 
                   'lung_function_fv_c']

scaler = StandardScaler()

asthma_data_tidy[continuous_vars] = scaler.fit_transform(asthma_data_tidy[continuous_vars])

asthma_data_tidy.head()

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastro_esophageal_reflux,lung_function_fev_1,lung_function_fv_c,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,0.96574,0,1,0,-1.582769,0,-1.432099,0.160113,0.971063,0.809355,...,0,-1.368934,0.920608,0,0,1,0,0,1,0
1,-0.747054,1,2,2,-0.6233,0,0.291269,0.453069,-1.076746,-1.036866,...,0,-0.407132,-1.564256,1,0,0,1,1,1,0
2,0.687989,0,2,1,-1.229074,0,0.58133,1.434458,-0.102976,-1.210374,...,0,-0.987146,0.983019,1,1,1,0,1,1,0
3,-0.09897,1,2,1,1.565307,0,-1.256398,0.276233,-1.59688,-1.509757,...,0,0.561114,-1.105641,1,0,1,1,1,0,0
4,0.873156,0,0,3,-1.105686,0,-0.154081,-0.651625,1.504976,-1.373822,...,0,1.070095,-0.516586,1,1,1,0,0,1,0


In [6]:
asthma_data_tidy.bmi.mean()

2.3541183866633753e-16

In [7]:
asthma_data_tidy

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastro_esophageal_reflux,lung_function_fev_1,lung_function_fv_c,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,0.965740,0,1,0,-1.582769,0,-1.432099,0.160113,0.971063,0.809355,...,0,-1.368934,0.920608,0,0,1,0,0,1,0
1,-0.747054,1,2,2,-0.623300,0,0.291269,0.453069,-1.076746,-1.036866,...,0,-0.407132,-1.564256,1,0,0,1,1,1,0
2,0.687989,0,2,1,-1.229074,0,0.581330,1.434458,-0.102976,-1.210374,...,0,-0.987146,0.983019,1,1,1,0,1,1,0
3,-0.098970,1,2,1,1.565307,0,-1.256398,0.276233,-1.596880,-1.509757,...,0,0.561114,-1.105641,1,0,1,1,1,0,0
4,0.873156,0,0,3,-1.105686,0,-0.154081,-0.651625,1.504976,-1.373822,...,0,1.070095,-0.516586,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,0.039905,1,0,2,0.252042,0,-0.699950,0.376978,0.740107,-0.861740,...,0,0.669296,1.093099,0,1,0,0,0,1,1
2388,-1.117388,1,0,1,-0.903322,0,0.259526,-0.218561,0.411163,0.927074,...,0,-1.642920,1.356614,0,0,0,1,1,0,1
2389,0.549114,0,3,2,1.365905,0,-0.109067,1.096868,0.268175,-0.755772,...,0,-1.001130,-0.302584,1,0,1,1,0,1,1
2390,0.178780,1,0,2,-0.527792,0,1.591768,0.804295,-0.174204,1.511361,...,1,1.082816,-1.555908,0,1,1,0,1,1,0


In [8]:
# Example: Binarizing SleepQuality
asthma_data_tidy = binarize_feature(asthma_data_tidy, 'sleep_quality', threshold=0)

# Example: Log transform of PollutionExposure
asthma_data_tidy = log_transform_feature(asthma_data_tidy, 'pollution_exposure')

# Example: Adding polynomial features to Age
asthma_data_tidy = add_polynomial_features(asthma_data_tidy, 'age', degree=3)

# Example: Impute missing values (if any)
asthma_data_tidy = impute_missing_values(asthma_data_tidy, strategy='mean')

# Display the first few rows of the modified DataFrame

asthma_data_tidy

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,sleep_quality_binarized,pollution_exposure_log,age_poly2,age_poly3
0,0.965740,0,1,0,-1.582769,0,-1.432099,0.160113,0.971063,0.809355,...,0,1,0,0,1,0,1,0.592971,0.932653,0.900700
1,-0.747054,1,2,2,-0.623300,0,0.291269,0.453069,-1.076746,0.866747,...,0,0,1,1,1,0,0,0.586279,0.558089,-0.416923
2,0.687989,0,2,1,-1.229074,0,0.581330,1.434458,-0.102976,0.866747,...,1,1,0,1,1,0,0,0.586279,0.473329,0.325645
3,-0.098970,1,2,1,1.565307,0,-1.256398,0.276233,-1.596880,0.866747,...,0,1,1,1,0,0,0,0.586279,0.009795,-0.000969
4,0.873156,0,0,3,-1.105686,0,-0.154081,-0.651625,1.504976,0.866747,...,1,1,0,0,1,0,1,0.586279,0.762402,0.665696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,0.039905,1,0,2,0.252042,0,-0.699950,0.376978,0.740107,0.866747,...,1,0,0,0,1,1,1,0.586279,0.001592,0.000064
2388,-1.117388,1,0,1,-0.903322,0,0.259526,-0.218561,0.411163,0.927074,...,0,0,1,1,0,1,1,0.656003,1.248555,-1.395120
2389,0.549114,0,3,2,1.365905,0,-0.109067,1.096868,0.268175,0.866747,...,0,1,1,0,1,1,1,0.586279,0.301526,0.165572
2390,0.178780,1,0,2,-0.527792,0,1.591768,0.804295,-0.174204,1.511361,...,1,1,0,1,1,0,0,0.920825,0.031962,0.005714


In [9]:
sleep_quality_asthma = asthma_data_tidy[asthma_data_tidy.diagnosis == 1].sleep_quality_binarized.value_counts().reset_index()

sleep_quality_asthma.value_counts()

sleep_quality_binarized  count
0                        56       1
1                        68       1
Name: count, dtype: int64