# Feature Engineering
- Age Category
- BMI Category - Underweight, Normal, Overweight, Obese
- Pollution Risk Score - Location and Air Pollution Level
- Smoking Status Encoding
- Interaction Features
- Location Encoding

In [35]:
# import the libraries needed
import pandas as pd
import numpy as np

In [36]:
# Read the synthetic COPD dataset from the project's data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="../../data/synthetic_COPD_data.csv")

In [37]:
# Peek at eight randomly drawn rows
df.sample(n=8)

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
857,61,Male,Current,1,1,1,26.48,Hetauda,148,0,1
751,35,Male,Current,1,0,0,27.57,Butwal,71,1,1
651,47,Female,Current,1,1,1,28.09,Chitwan,56,0,1
496,32,Male,Current,0,0,1,26.01,Butwal,126,1,1
913,55,Female,Current,1,1,1,20.09,Butwal,140,0,1
359,57,Male,Former,0,1,0,34.28,Chitwan,138,0,0
453,44,Female,Current,0,1,1,20.25,Bhaktapur,84,0,1
187,69,Male,Former,1,0,0,28.23,Bhaktapur,139,1,0


In [38]:
# Bucket Age into decade-wide bands. Bins are right-closed, so (29, 39] is the
# "30-39" band and so on; ages outside 30-89 fall in no bin and become NaN.
df["Age_Category"] = pd.cut(
    x=df["Age"],
    bins=[29, 39, 49, 59, 69, 79, 89],
    labels=["30-39", "40-49", "50-59", "60-69", "70-79", "80-89"],
)

# BMI categories using the WHO adult cut-offs:
#   Underweight < 18.5, Normal 18.5-24.9, Overweight 25-29.9, Obese >= 30.
# right=False makes every bin include its lower edge, so a BMI of exactly 18.5 is
# Normal, 25 is Overweight and 30 is Obese. The last bin is open-ended so that
# values above 35 are labelled Obese instead of silently becoming NaN.
df["BMI_Categories"] = pd.cut(
    df["BMI"],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    right=False,
)

In [39]:
# Show the first five rows to check the newly added category columns
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30-39,Overweight
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60-69,Obese
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30-39,Overweight
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30-39,Overweight
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50-59,Overweight


In [40]:
# Binary pollution flag: 1 when Air_Pollution_Level is strictly above 150, else 0.
# Only the pollution reading is used here; Location does not enter the score.
df["Pollution_Risk_Score"] = np.where(df["Air_Pollution_Level"].gt(150), 1, 0)

In [41]:
# Turn smoking status into a numeric exposure score (never < former < current).
# Any label that is not a key of the mapping comes out as NaN.
df["Smoking_Status_encoded"] = df["Smoking_Status"].map(
    {
        "Never": 0,
        "Former": 0.5,
        "Current": 1,
    }
)

# Binary-encode gender: Female -> 0, Male -> 1 (any other label becomes NaN)
df["Gender_"] = df["Gender"].map(
    {
        "Female": 0,
        "Male": 1,
    }
)



In [42]:
# Show the first five rows to check the pollution flag and the encoded columns
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories,Pollution_Risk_Score,Smoking_Status_encoded,Gender_
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,30-39,Overweight,0,0.5,1
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,60-69,Obese,0,0.0,1
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,30-39,Overweight,0,0.5,1
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,30-39,Overweight,1,1.0,0
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,50-59,Overweight,0,0.0,1


In [43]:
# Interaction feature: smoking exposure score scaled by the air pollution level,
# so never-smokers (score 0) always get 0 regardless of pollution.
df["Smoking_Pollution_Interaction"] = df["Smoking_Status_encoded"].mul(
    df["Air_Pollution_Level"]
)

In [44]:
# Replace Location with indicator columns; drop_first removes one level so the
# remaining dummies are not perfectly collinear.
# Note: this cell consumes the Location column, so it cannot be re-run on its own.
df = pd.get_dummies(
    data=df,
    columns=["Location"],
    drop_first=True,
)

## Machine Learning Data

In [45]:
# Summarise the engineered frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Age                               1000 non-null   int64   
 1   Gender                            1000 non-null   object  
 2   Smoking_Status                    1000 non-null   object  
 3   Biomass_Fuel_Exposure             1000 non-null   int64   
 4   Occupational_Exposure             1000 non-null   int64   
 5   Family_History_COPD               1000 non-null   int64   
 6   BMI                               1000 non-null   float64 
 7   Air_Pollution_Level               1000 non-null   int64   
 8   Respiratory_Infections_Childhood  1000 non-null   int64   
 9   COPD_Diagnosis                    1000 non-null   int64   
 10  Age_Category                      1000 non-null   category
 11  BMI_Categories                    1000 non-null   categor

In [46]:
# Remove the raw text columns that now have numeric encodings, together with the
# two category columns.
# NOTE(review): Age_Category and BMI_Categories are dropped without ever being
# encoded, so they never reach the saved dataset - confirm this is intended.
df = df.drop(
    ["Smoking_Status", "Age_Category", "BMI_Categories", "Gender"],
    axis="columns",
)

In [47]:
# Persist the engineered features next to the raw data, without the row index
df.to_csv(path_or_buf="../../data/engineered_COPD_data.csv", index=False)