In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Step 1: Build a small demo dataset.
# "Age" and "Salary" each contain one missing value (np.nan) so the
# imputation steps further down have something to fill in.
data = dict(
    Age=[25, 30, np.nan, 35, 40],
    Salary=[50000, 60000, 55000, np.nan, 80000],
    City=["New York", "Los Angeles", "New York", "Chicago", "Los Angeles"],
    Purchased=["Yes", "No", "Yes", "No", "Yes"],
)

df = pd.DataFrame(data)
print("Original Data:", df, sep="\n")

# Step 2: Separate the target column from the feature columns.
# `y` is the "Purchased" label; `X` is every other column of `df`.
y = df["Purchased"]
X = df.drop(columns=["Purchased"])

# Step 3: Define preprocessing steps
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_features = ["Age", "Salary"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising at transform time.
categorical_features = ["City"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine preprocessors in a column transformer.
# The one-hot encoder emits sparse output, and by default ColumnTransformer
# returns a sparse matrix whenever the stacked result's density falls below
# sparse_threshold (0.3). The result is later passed to pd.DataFrame, which
# needs a dense 2-D array, so sparse_threshold=0 forces dense output
# regardless of how many categories the data contains.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Step 4: Wrap the preprocessor in a Pipeline.
# The pipeline has a single step, named "preprocessor"; that name is the key
# used to look the fitted step up again via `pipeline.named_steps`.
pipeline = Pipeline([("preprocessor", preprocessor)])

# Step 5: Fit the pipeline on the features and transform them in one call.
X_preprocessed = pipeline.fit_transform(X)

# Turn the transformed array back into a labelled DataFrame.
# The one-hot encoder expands each categorical column into several columns,
# so their names are read from the fitted encoder inside the pipeline.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
onehot_encoder = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
categorical_columns = list(
    onehot_encoder.get_feature_names_out(categorical_features)
)

# Column order follows the transformer order: numeric first, then categorical.
all_columns = [*numeric_features, *categorical_columns]
X_transformed_df = pd.DataFrame(data=X_preprocessed, columns=all_columns)

print("\nTransformed Data:", X_transformed_df, sep="\n")

# Step 6: Attach the target column to the transformed features.
# The target's index is reset (old index dropped) before the column-wise
# concat, which aligns rows on index.
final_df = pd.concat(
    objs=[X_transformed_df, y.reset_index(drop=True)],
    axis="columns",
)
print("\nFinal DataFrame:", final_df, sep="\n")

Original Data:
    Age   Salary         City Purchased
0  25.0  50000.0     New York       Yes
1  30.0  60000.0  Los Angeles        No
2   NaN  55000.0     New York       Yes
3  35.0      NaN      Chicago        No
4  40.0  80000.0  Los Angeles       Yes

Transformed Data:
   Age    Salary  City_Chicago  City_Los Angeles  City_New York
0 -1.5 -1.104482           0.0               0.0            1.0
1 -0.5 -0.122720           0.0               1.0            0.0
2  0.0 -0.613601           0.0               0.0            1.0
3  0.5  0.000000           1.0               0.0            0.0
4  1.5  1.840803           0.0               1.0            0.0

Final DataFrame:
   Age    Salary  City_Chicago  City_Los Angeles  City_New York Purchased
0 -1.5 -1.104482           0.0               0.0            1.0       Yes
1 -0.5 -0.122720           0.0               1.0            0.0        No
2  0.0 -0.613601           0.0               0.0            1.0       Yes
3  0.5  0.000000           1