# Preprocessing Lanjutan

In [None]:
!pip -q install category_encoders imbalanced-learn

import numpy as np
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler
import pickle


In [None]:
# Load the train/test split from disk (presumably written by an earlier
# notebook step -- confirm). NOTE: pickle.load can execute arbitrary code,
# so only open pickle files that come from a trusted source.
with open("train_test_data.pkl", "rb") as f:
    data = pickle.load(f)

# Pull the four splits out of the loaded mapping in one go.
X_train, X_test, y_train, y_test = (
    data[key] for key in ("X_train", "X_test", "y_train", "y_test")
)


In [12]:
# Engineered 0/1 flag columns.
binary_features = [
    'high_claim_without_witnesses',
    'no_police_report_but_severe_damage',
    'unknown_police_report_but_severe_damage',
    'unknown_property_damage',
    'unknown_collision_type',
    'multi_vehicles_no_injury',
]

# Columns handled by the numeric pipeline. The binary flags are appended at
# the end, so they are treated as numeric too.
numeric_cols_raw = [
    'months_as_customer',
    'age',
    'policy_annual_premium',
    'capital-gains',
    'capital-loss',
    'incident_hour_of_the_day',
    'number_of_vehicles_involved',
    'bodily_injuries',
    'witnesses',
    'total_claim_amount',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'auto_year',
    'days_since_policy',
    *binary_features,
]

# Columns handled by one of the categorical encoders.
categorical_cols_raw = [
    'policy_state',
    'policy_csl',
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_city',
    'property_damage',
    'police_report_available',
    'zip_prefix',
    'umbrella_limit',
    'policy_deductable',
    'auto_make_model',
]


In [13]:
# =========================
# Preprocessing inside a pipeline (avoids leakage)
# =========================
# Builds a single, unfitted ColumnTransformer (`preprocess`) that routes each
# feature group to its own impute -> encode/scale sub-pipeline.


def _make_ordinal_pipe(category_order):
    """Return an impute-then-ordinal-encode pipeline for one ordered column.

    Missing values are filled with the column's most frequent value. Each
    category is encoded as its position in `category_order`; values that are
    not listed there are encoded as -1.
    """
    return Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(
            categories=[category_order],
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )),
    ])


# ---- Column groups (only columns actually present in X_train) ----

# Columns that get target encoding
te_cols = [c for c in ['zip_prefix', 'auto_make_model', 'insured_hobbies', 'insured_occupation'] if c in X_train.columns]

# Columns that get an explicit ordinal encoding
ordinal_cols = [c for c in ['insured_education_level', 'incident_severity', 'umbrella_limit', 'policy_deductable', 'policy_csl']
                if c in X_train.columns]

# Nominal = every other categorical column (neither ordinal nor target-encoded).
# The exclusion set is built once instead of once per candidate column.
all_cat = [c for c in categorical_cols_raw if c in X_train.columns]
_non_nominal = set(te_cols) | set(ordinal_cols)
nominal_cols = [c for c in all_cat if c not in _non_nominal]

# Numeric columns
num_cols = [c for c in numeric_cols_raw if c in X_train.columns]

# ---- Pipeline per feature type ----
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

te_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_missing="value" means missing values get the global target mean.
    # NOTE(review): the imputer above already fills missing values, so this
    # setting presumably never triggers -- confirm.
    ("te", ce.TargetEncoder(handle_unknown="value", handle_missing="value"))
])

# ---- Ordinal columns: explicit category order per column ----
# NOTE(review): each list is presumably ordered from lowest to highest level;
# confirm the education ordering with the data owner.
edu_order = ['High School', 'College', 'Associate', 'Masters', 'MD', 'JD', 'PhD']
sev_order = ['Trivial Damage', 'Minor Damage', 'Major Damage', 'Total Loss']
umbrella_order = [0, 1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000]
deductible_order = [500, 1000, 2000]
csl_order = ['100/300', '250/500', '500/1000']

# (transformer name, column, category order) -- the sequence here fixes the
# order of the ordinal output columns.
ordinal_specs = [
    ("edu_ord", 'insured_education_level', edu_order),
    ("sev_ord", 'incident_severity', sev_order),
    ("umb_ord", 'umbrella_limit', umbrella_order),
    ("ded_ord", 'policy_deductable', deductible_order),
    ("csl_ord", 'policy_csl', csl_order),
]

# ---- Assemble the transformer list, skipping empty groups ----
transformers = []

if num_cols:
    transformers.append(("num", num_pipe, num_cols))

if te_cols:
    transformers.append(("te", te_pipe, te_cols))

for step_name, column, category_order in ordinal_specs:
    if column in ordinal_cols:
        transformers.append((step_name, _make_ordinal_pipe(category_order), [column]))

# One-hot encoding for the nominal columns
if nominal_cols:
    ohe_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # NOTE(review): with drop="first", a category unseen during fit is
        # encoded as all zeros, which is the same encoding as the dropped
        # first category.
        ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))
    ])
    transformers.append(("nom_ohe", ohe_pipe, nominal_cols))

# Columns not assigned to any group above are dropped.
preprocess = ColumnTransformer(
    transformers=transformers,
    remainder="drop"
)

In [16]:
# Persist the pipeline while it is still UNFITTED.
with open("preprocessing_pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(preprocess, pipeline_file)

In [17]:
# NOTE: this fit exists ONLY so the output feature names can be inspected;
# the training and CV pipelines still re-fit on their own, safely.
preprocess.fit(X=X_train, y=y_train)

def get_feature_names(preprocessor):
    """Collect the output feature names of a fitted column transformer.

    Walks `preprocessor.transformers_` in order. Entries whose transformer is
    the string 'drop' are skipped. A pipeline whose final step is a
    OneHotEncoder contributes that encoder's expanded names; every other
    entry contributes its input column names unchanged.

    Args:
        preprocessor: object exposing `transformers_` as an iterable of
            (name, transformer, columns) triples.

    Returns:
        list: feature names in transformer order.
    """
    collected = []

    for _name, fitted, columns in preprocessor.transformers_:
        if fitted == 'drop':
            continue

        # Default: the transformer keeps the input column names as they are
        # (target encoder / ordinal encoder / scaler, or a non-pipeline entry).
        names = columns

        # Only a pipeline ending in a OneHotEncoder expands its columns.
        if hasattr(fitted, 'named_steps'):
            final_step = list(fitted.named_steps.values())[-1]
            if isinstance(final_step, OneHotEncoder):
                names = final_step.get_feature_names_out(columns)

        collected.extend(names)

    return collected

# Resolve the output column names of the fitted preprocessor and list them,
# one per line.
feature_names = get_feature_names(preprocess)
for column_name in feature_names:
    print(column_name)

months_as_customer
age
policy_annual_premium
capital-gains
capital-loss
incident_hour_of_the_day
number_of_vehicles_involved
bodily_injuries
witnesses
total_claim_amount
injury_claim
property_claim
vehicle_claim
auto_year
days_since_policy
high_claim_without_witnesses
no_police_report_but_severe_damage
unknown_police_report_but_severe_damage
unknown_property_damage
unknown_collision_type
multi_vehicles_no_injury
zip_prefix
auto_make_model
insured_hobbies
insured_occupation
insured_education_level
incident_severity
umbrella_limit
policy_deductable
policy_csl
policy_state_IN
policy_state_OH
insured_sex_MALE
insured_relationship_not-in-family
insured_relationship_other-relative
insured_relationship_own-child
insured_relationship_unmarried
insured_relationship_wife
incident_type_Parked Car
incident_type_Single Vehicle Collision
incident_type_Vehicle Theft
collision_type_Rear Collision
collision_type_Side Collision
authorities_contacted_Fire
authorities_contacted_Other
authorities_contacted

In [18]:
# Simpan feature_names terpisah (opsional, untuk analisis)
with open("feature_names.pkl", "wb") as names_file:
    pickle.dump(feature_names, names_file)