In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

# Sample data
data = [
    {'age': 25, 'salary': 50000, 'gender': 'Male',   'city': 'Mumbai'},
    {'age': 32, 'salary': 72000, 'gender': 'Female', 'city': 'Delhi'},
    {'age': None, 'salary': 61000, 'gender': 'Male', 'city': None},
    {'age': 45, 'salary': None, 'gender': None,      'city': 'Bengaluru'},
    {'age': 29, 'salary': 58000, 'gender': 'Female', 'city': 'Delhi'},
]

df = pd.DataFrame(data)


In [25]:

numeric_features = ['age', 'salary']
categorical_features = ['gender', 'city']


numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Transform
X_transformed = preprocessor.fit_transform(df)

print(X_transformed)


[[-1.15594416 -1.45320647  0.          1.          0.          0.
   0.          1.          0.        ]
 [-0.11186556  1.66587083  1.          0.          0.          0.
   1.          0.          0.        ]
 [ 0.          0.10633218  0.          1.          0.          0.
   0.          0.          1.        ]
 [ 1.82713754  0.          0.          0.          1.          1.
   0.          0.          0.        ]
 [-0.55932782 -0.31899654  1.          0.          0.          0.
   1.          0.          0.        ]]
