```{contents}
```

## Column Transformer & Imputer

In [21]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Location of the loan-applications CSV; the `?raw=true` query asks GitHub for
# the raw file instead of the HTML page that wraps it.
url = (
    "https://github.com/svgoudar/datasets/blob/main/"
    "loan_applications.csv?raw=true"
)

# Read the CSV straight from the URL and preview the first rows.
df = pd.read_csv(url)
df.head()


Unnamed: 0,application_id,customer_id,application_date,loan_type,loan_amount_requested,loan_tenure_months,interest_rate_offered,purpose_of_loan,employment_status,monthly_income,...,existing_emis_monthly,debt_to_income_ratio,property_ownership_status,residential_address,applicant_age,gender,number_of_dependents,loan_status,fraud_flag,fraud_type
0,c8bf0bea-70e6-4870-9125-41b8210c527f,CUST109427,2023-04-09,Business Loan,604000.0,12,11.66,Medical Emergency,Retired,34700.0,...,1100.0,3.17,Rented,"94/31, Sehgal Zila, Vadodara-380521, Anantapur...",28,Female,3,Approved,0,
1,91224cec-3544-4bc7-ac15-a9792da54c02,CUST106146,2023-09-23,Car Loan,100000.0,240,13.62,Education,Unemployed,51600.0,...,0.0,0.0,Owned,"H.No. 00, Sheth Chowk, Ichalkaranji 006728, Im...",44,Other,3,Approved,0,
2,4efcd02d-4a03-4ab7-9bd1-0ff430493d0c,CUST100674,2023-05-22,Education Loan,431000.0,60,11.4,Medical Emergency,Self-Employed,14800.0,...,4600.0,31.08,Rented,"H.No. 81, Dutta Path, Kozhikode-340301, Tadepa...",56,Other,4,Approved,0,
3,a61337d4-ba04-4a68-b492-2cb8266e6ed7,CUST106466,2024-07-09,Car Loan,324000.0,120,10.36,Debt Consolidation,Self-Employed,28800.0,...,4000.0,13.89,Rented,"H.No. 022, Rege Road, Tiruvottiyur-927857, Aur...",27,Other,4,Declined,0,
4,a8d1639e-170b-41b2-826a-55c7dae38d16,CUST112319,2023-11-20,Personal Loan,100000.0,36,14.14,Business Expansion,Salaried,43900.0,...,1100.0,2.51,Rented,"85/24, Bali Zila, Sambalpur 922071, Tumkur, Ke...",50,Other,0,Declined,0,


* ### Identify column types

In [22]:
# Split the columns by dtype: numeric ones vs. everything else.
numeric_cols = list(df.select_dtypes(include=np.number).columns)
categorical_cols = list(df.select_dtypes(exclude=np.number).columns)

# A numeric column counts as "skewed" when |skewness| exceeds this threshold.
SKEW_THRESHOLD = 0.5

# Per-column skewness, largest first. The sort order matters: it is the order
# in which the names end up in `skewed_cols`.
skewness = df[numeric_cols].skew().sort_values(ascending=False)
is_skewed = skewness.abs() > SKEW_THRESHOLD
skewed_cols = skewness[is_skewed].index.tolist()

# Everything numeric that is not skewed, kept in the DataFrame's column order.
normal_cols = [name for name in numeric_cols if name not in skewed_cols]

print("Normal numeric columns:", normal_cols)
print("Skewed numeric columns:", skewed_cols)
print("Categorical columns:", categorical_cols)


Normal numeric columns: ['loan_amount_requested', 'interest_rate_offered', 'monthly_income', 'cibil_score', 'existing_emis_monthly', 'applicant_age', 'number_of_dependents']
Skewed numeric columns: ['fraud_flag', 'debt_to_income_ratio', 'loan_tenure_months']
Categorical columns: ['application_id', 'customer_id', 'application_date', 'loan_type', 'purpose_of_loan', 'employment_status', 'property_ownership_status', 'residential_address', 'gender', 'loan_status', 'fraud_type']




---

* ### Define transformers


In [23]:
# Numeric transformers.
# NOTE: despite the "_scaler" suffix, these pipelines only impute missing
# values; no scaling step is attached, so values keep their original units.
# To enable scaling, append ('scaler', StandardScaler()) to the step list.

# Columns with low skew: fill gaps with the column mean.
mean_imputer_scaler = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Skewed columns: fill gaps with the column median.
median_imputer_scaler = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical transformer: fill gaps with the most frequent value.
# NOTE: despite the "_encoder" suffix, no encoding step is attached, and this
# pipeline is not passed to the ColumnTransformer built in the next cell.
cat_imputer_encoder = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# The ColumnTransformer that combines the numeric pipelines is built once, in
# the "ColumnTransformer" cell that follows.




---

* ### ColumnTransformer


In [24]:
# Each entry is (name, transformer, columns): the named pipeline is applied
# only to the listed columns, and the outputs are concatenated in list order.
column_routes = [
    ('normal_num', mean_imputer_scaler, normal_cols),
    ('skewed_num', median_imputer_scaler, skewed_cols),
]

# Columns that appear in no route (here, the categorical ones) are not part
# of the transformed output.
preprocessor = ColumnTransformer(transformers=column_routes)






* ### Fit and transform



In [25]:
# Fit the imputers on `df` and apply them in a single step. Only the columns
# routed through the ColumnTransformer come back, so the result has
# len(normal_cols) + len(skewed_cols) columns.
df_processed_array = preprocessor.fit_transform(df)

# Sanity check: one row per input row, one column per routed numeric column.
print("Processed array shape:", df_processed_array.shape)
df_processed_array


Processed array shape: (50000, 10)


array([[6.040e+05, 1.166e+01, 3.470e+04, ..., 0.000e+00, 3.170e+00,
        1.200e+01],
       [1.000e+05, 1.362e+01, 5.160e+04, ..., 0.000e+00, 0.000e+00,
        2.400e+02],
       [4.310e+05, 1.140e+01, 1.480e+04, ..., 0.000e+00, 3.108e+01,
        6.000e+01],
       ...,
       [4.360e+05, 1.098e+01, 4.200e+04, ..., 0.000e+00, 1.430e+00,
        1.200e+01],
       [8.270e+05, 1.618e+01, 4.670e+04, ..., 0.000e+00, 1.734e+01,
        2.400e+01],
       [1.000e+05, 9.440e+00, 8.360e+04, ..., 0.000e+00, 2.630e+00,
        2.400e+01]])



---

* ### Convert back to DataFrame (optional)



In [26]:
# Rebuild a labelled DataFrame from the transformed array.
# The ColumnTransformer emits its outputs in the order the transformers were
# listed ('normal_num' first, then 'skewed_num'), so concatenating the two
# column lists in that same order reproduces the array's column layout.
all_columns = normal_cols + skewed_cols

# Reuse the original row index so processed rows stay aligned with `df`
# (identical to the default when `df` has a plain RangeIndex).
df_processed = pd.DataFrame(
    df_processed_array,
    columns=all_columns,
    index=df.index,
)
df_processed.head()


Unnamed: 0,loan_amount_requested,interest_rate_offered,monthly_income,cibil_score,existing_emis_monthly,applicant_age,number_of_dependents,fraud_flag,debt_to_income_ratio,loan_tenure_months
0,604000.0,11.66,34700.0,714.0,1100.0,28.0,3.0,0.0,3.17,12.0
1,100000.0,13.62,51600.0,667.0,0.0,44.0,3.0,0.0,0.0,240.0
2,431000.0,11.4,14800.0,808.0,4600.0,56.0,4.0,0.0,31.08,60.0
3,324000.0,10.36,28800.0,647.0,4000.0,27.0,4.0,0.0,13.89,120.0
4,100000.0,14.14,43900.0,624.0,1100.0,50.0,0.0,0.0,2.51,36.0


---

**Explanation**

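1. **Normal numeric columns** (|skew| ≤ 0.5): missing values are imputed with the **mean**. No `StandardScaler` step is active in the pipeline, so the values are **not scaled**.
2. **Skewed numeric columns** (|skew| > 0.5): missing values are imputed with the **median**, which is less sensitive to extreme values than the mean. These are also **not scaled**.
3. **Categorical columns:** a **mode** (most-frequent) imputation pipeline is defined, but it is not passed to the `ColumnTransformer`, so categorical columns are dropped from the output and **no one-hot encoding** is applied. That is why the processed array has only 10 columns.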
4. **ColumnTransformer** allows applying **different preprocessing steps** to different column types in a single pipeline.

