<a href="https://colab.research.google.com/github/zahid-bracu/feature-engineering/blob/main/Part_5_2_Column_Transformation_With_Column_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder

# Create sample data
# A fixed seed makes the synthetic dataset (and every output derived from it)
# identical on each "Restart & Run All".
SEED = 42
N_SAMPLES = 100
rng = np.random.default_rng(SEED)

data = {
    'age': rng.integers(18, 70, size=N_SAMPLES),  # Random ages from 18 to 69 (upper bound is exclusive)
    'gender': rng.choice(['Male', 'Female'], size=N_SAMPLES),  # Random genders
    'fever': np.round(rng.uniform(97, 104, size=N_SAMPLES), 1),  # Random temperatures between 97 and 104
    'cough': rng.choice(['Mild', 'Strong'], size=N_SAMPLES),  # Random cough severity
    'city': rng.choice(['Dhaka', 'Mymensingh', 'Chittagong'], size=N_SAMPLES),  # Random city from a fixed list
    'has_covid': rng.choice([True, False], size=N_SAMPLES)  # Random covid status (boolean target)
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Preview the first rows
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,18,Female,100.8,Mild,Dhaka,True
1,36,Male,100.7,Strong,Dhaka,False
2,42,Female,99.0,Strong,Mymensingh,False
3,46,Female,98.1,Strong,Dhaka,True
4,62,Female,97.2,Strong,Dhaka,True


In [None]:
# Separate the predictors from the target. If the whole frame were passed to
# the transformer, remainder='passthrough' would copy the target into the
# feature matrix as 'remainder__has_covid', and it would then appear a second
# time when the encoded target is concatenated below.
TARGET_COLUMN = 'has_covid'
feature_df = df.drop(columns=[TARGET_COLUMN])

# Define the column transformer
column_transformer = ColumnTransformer(
    transformers=[
        ('age_scaler', StandardScaler(), ['age']),  # standardise numeric age
        ('gender_ohe', OneHotEncoder(), ['gender']),  # nominal -> one column per category
        ('fever_scaler', StandardScaler(), ['fever']),  # standardise numeric temperature
        ('cough_ordinal', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),  # ordered: Mild=0, Strong=1
        ('city_ohe', OneHotEncoder(), ['city'])  # nominal -> one column per category
    ],
    remainder='passthrough',  # Keep any feature column not listed above as is
    sparse_threshold=0  # Always return a dense array so it can be wrapped in a DataFrame
)

# Apply transformations to the feature columns only
transformed_data = column_transformer.fit_transform(feature_df)

# Encode the target column
# NOTE: df is still updated in place, as before, in case later cells read the
# encoded df['has_covid'] — confirm and switch to a separate variable if not.
label_encoder = LabelEncoder()
df[TARGET_COLUMN] = label_encoder.fit_transform(df[TARGET_COLUMN])

# Convert transformed data to DataFrame and concatenate with the target column
transformed_df = pd.DataFrame(transformed_data, columns=column_transformer.get_feature_names_out())
final_df = pd.concat([transformed_df, df[TARGET_COLUMN].reset_index(drop=True)], axis=1)

# Sanity check: the target must appear exactly once in the final frame
assert [c for c in final_df.columns if c.endswith(TARGET_COLUMN)] == [TARGET_COLUMN]

# Display the final DataFrame
final_df

Unnamed: 0,age_scaler__age,gender_ohe__gender_Female,gender_ohe__gender_Male,fever_scaler__fever,cough_ordinal__cough,city_ohe__city_Chittagong,city_ohe__city_Dhaka,city_ohe__city_Mymensingh,remainder__has_covid,has_covid
0,-1.497882,1.0,0.0,0.478845,0.0,0.0,1.0,0.0,1.0,1
1,-0.362167,0.0,1.0,0.427522,1.0,0.0,1.0,0.0,0.0,0
2,0.016405,1.0,0.0,-0.444972,1.0,0.0,0.0,1.0,0.0,0
3,0.268786,1.0,0.0,-0.906880,1.0,0.0,1.0,0.0,1.0,1
4,1.278310,1.0,0.0,-1.368789,1.0,0.0,1.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...
95,-1.056215,1.0,0.0,0.068260,0.0,0.0,1.0,0.0,1.0,1
96,1.404501,1.0,0.0,0.273552,0.0,1.0,0.0,0.0,0.0,0
97,0.458072,0.0,1.0,-0.855557,1.0,0.0,0.0,1.0,1.0,1
98,-0.046690,0.0,1.0,-1.163496,1.0,1.0,0.0,0.0,0.0,0
