In [113]:
import pandas as pd

In [114]:
# Feature Engineering involves creating new features or transforming existing ones to 
# improve the performance of machine learning models.

# Creating new columns based on existing ones.

In [115]:
data={
    'height (cm)':[160,170,180,475],
    'weight (kg)':[60,35,65,60]
}

df=pd.DataFrame(data)

In [116]:
df

Unnamed: 0,height (cm),weight (kg)
0,160,60
1,170,35
2,180,65
3,475,60


In [117]:
# Create a new column BMI based on height and weight
# 
df['bmi']=df['weight (kg)']*10000/df['height (cm)']**2

In [118]:
df

Unnamed: 0,height (cm),weight (kg),bmi
0,160,60,23.4375
1,170,35,12.110727
2,180,65,20.061728
3,475,60,2.65928


In [119]:
print(len(df['bmi']))

4


In [120]:
for i in range(len(df['bmi'])):
    bmi=df['bmi'].loc[i]
    print(bmi)
    if bmi<18.5:
        df.loc[i,'weight status']='Under Weight'
    elif 18.5<=bmi<=24.5:
        df.loc[i,'weight status']='Healthy Weight'
    elif 24.5<bmi<=29.9:
        df.loc[i,'weight status']='Over Weight'
    elif bmi>29.9:
        df.loc[i,'weight status']='Obese'

23.4375
12.110726643598616
20.061728395061728
2.6592797783933517


In [121]:
df

Unnamed: 0,height (cm),weight (kg),bmi,weight status
0,160,60,23.4375,Healthy Weight
1,170,35,12.110727,Under Weight
2,180,65,20.061728,Healthy Weight
3,475,60,2.65928,Under Weight


# Extracting Features from Text, Dates or Categorical Values

In [122]:
text_data=pd.DataFrame({'text':['Hello World','How are you', 'Good Morning']})


# get_dummies - Converts categorical variable into dummy/indicator variables.
# Each variable is converted in as many 0/1 variables as there are different values.
text_features = pd.get_dummies(text_data['text'])
print(text_features)

   Good Morning  Hello World  How are you
0         False         True        False
1         False        False         True
2          True        False        False


In [123]:
# Example for date data (yr-m-d)
date_data=pd.DataFrame({'date':['2021-01-01','2022-02-01','2023-03-01']})

In [124]:
date_data['day']=pd.to_datetime(date_data['date']).dt.day
date_data['month']=pd.to_datetime(date_data['date']).dt.month
date_data['year']=pd.to_datetime(date_data['date']).dt.year

In [125]:
print(date_data)

         date  day  month  year
0  2021-01-01    1      1  2021
1  2022-02-01    1      2  2022
2  2023-03-01    1      3  2023


In [126]:
# Example for Categorical data

categorical_data=pd.DataFrame({'category':['A','B','B','C','B']})

category_features=pd.get_dummies(categorical_data['category'])

In [127]:
category_features

Unnamed: 0,A,B,C
0,True,False,False
1,False,True,False
2,False,True,False
3,False,False,True
4,False,True,False


# Binning numerical data into categories

In [128]:
# Bining numerical data involves dividing continuous variables into
# discrete bins or categories

In [129]:
df['height_category']=pd.cut(df['height (cm)'],bins=[0,160,170,180,float('inf')],
                             labels=['Short','Average','Tall','Very Tall'])

In [130]:
df

Unnamed: 0,height (cm),weight (kg),bmi,weight status,height_category
0,160,60,23.4375,Healthy Weight,Short
1,170,35,12.110727,Under Weight,Average
2,180,65,20.061728,Healthy Weight,Tall
3,475,60,2.65928,Under Weight,Very Tall
