# Health Insurance Cross-Sell — Clean & Preprocess ONLY
Loads train/test CSVs (same folder as notebook), cleans, groups rare categories, winsorizes numerics, and builds a preprocessing pipeline. Saves cleaned outputs, model-ready arrays, and pipeline for later use.

In [2]:

# 0) CONFIG
import os
# --- Input locations -------------------------------------------------------
# Directory that is searched for train.csv / test.csv (the notebook's folder).
DATA_DIR = "./"
# Optional zip archive to unpack into DATA_DIR before loading; None skips it.
ZIP_PATH = None

# --- Preprocessing knobs ---------------------------------------------------
# Categories whose relative frequency is below this share (1%) count as rare.
RARE_THRESH = 0.01
# Lower / upper quantiles used as winsorization clip points.
LOW_QUANT, HIGH_QUANT = 0.01, 0.99
# Random seed (declared here; none of the visible cells consumes it).
RANDOM_STATE = 42

# --- Output location -------------------------------------------------------
OUTPUT_DIR = "./preprocessed_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok keeps the cell re-runnable
print("Outputs will be written to:", OUTPUT_DIR)


Outputs will be written to: ./preprocessed_outputs


In [3]:

# 1) LOAD (auto-locate train/test in DATA_DIR)
import zipfile
import numpy as np, pandas as pd
from pathlib import Path

def find_csv(name, data_dir=None):
    """Return the path of the first file called ``name`` under a directory tree.

    The comparison is case-insensitive. The tree is walked top-down, so a file
    sitting directly in the root wins over one nested deeper. Sub-directories
    and file names are visited in sorted order, which makes the result
    independent of the order in which the file system lists entries.

    Parameters
    ----------
    name : str
        File name to look for, e.g. ``"train.csv"``.
    data_dir : str or path-like, optional
        Root directory to search. When omitted, the notebook-level
        ``DATA_DIR`` is used.

    Returns
    -------
    str or None
        ``os.path.join(root, file)`` of the first match, or ``None`` when no
        file matches.
    """
    search_root = DATA_DIR if data_dir is None else data_dir
    wanted = name.lower()
    for root, dirs, files in os.walk(search_root):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        for f in sorted(files):
            if f.lower() == wanted:
                return os.path.join(root, f)
    return None

# Optional step: unpack the archive first when one is configured and present.
archive_available = bool(ZIP_PATH) and Path(ZIP_PATH).exists()
if archive_available:
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(DATA_DIR)

# Locate both CSVs; find_csv yields None for a file it cannot find.
train_path, test_path = (find_csv(fname) for fname in ("train.csv", "test.csv"))
assert train_path and test_path, "Could not find train.csv/test.csv next to this notebook."

# Read the two splits into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Last expression of the cell: preview the first three training rows.
train.head(3)


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1


In [4]:

# 2) BASIC CLEAN
def basic_clean(df):
    """Return a de-duplicated, dtype-normalised copy of ``df``.

    Steps, in order:

    1. strip surrounding whitespace from the column names;
    2. drop rows that are exact duplicates of an earlier row;
    3. cast ``Gender``, ``Vehicle_Age`` and ``Vehicle_Damage`` to ``category``;
    4. cast ``Region_Code`` and ``Policy_Sales_Channel`` to a ``category`` whose
       labels are the string form of the original values (e.g. ``"28.0"``).

    Columns from steps 3-4 that are absent are skipped. Missing values in the
    code columns stay missing instead of becoming the literal string
    ``"nan"``, so ``isna()`` reports and imputers can still see them.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.

    Returns
    -------
    (pandas.DataFrame, int)
        The cleaned copy and the number of duplicate rows that were removed.
    """
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    before = len(df)
    df = df.drop_duplicates()
    removed = before - len(df)

    for c in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
        if c in df.columns:
            df[c] = df[c].astype('category')

    for c in ['Region_Code', 'Policy_Sales_Channel']:
        if c in df.columns:
            # astype(str) alone would map NaN to the text "nan"; mask those
            # positions back to missing before building the categorical.
            labels = df[c].astype(str).where(df[c].notna())
            df[c] = labels.astype('category')
    return df, removed

# Clean both splits the same way; keep the duplicate counts for the report.
(train, tr_dupes), (test, te_dupes) = basic_clean(train), basic_clean(test)

# One-line summary of shapes and removed duplicates.
clean_report = {
    "train_shape": train.shape,
    "test_shape":  test.shape,
    "train_dupes_removed": tr_dupes,
    "test_dupes_removed":  te_dupes
}
print(clean_report)

# Per-column count of missing values in each split.
print("Missing (train):\n", train.isna().sum())
print("Missing (test):\n",  test.isna().sum())

# Persist the cleaned frames next to the other outputs.
for frame, csv_name in ((train, "train_clean_basic.csv"), (test, "test_clean_basic.csv")):
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)


{'train_shape': (381109, 12), 'test_shape': (127037, 11), 'train_dupes_removed': 0, 'test_dupes_removed': 0}
Missing (train):
 id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64
Missing (test):
 id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
dtype: int64


In [5]:

# 3) RARE-CATEGORY GROUPING
def group_rare_categories(df, cols, min_frac=0.01, other_label="__RARE__"):
    """Collapse infrequent values of the given columns into one label.

    For every column in ``cols`` that exists in ``df`` the values are converted
    to strings, the share of each distinct string among all ``len(df)`` rows is
    computed, and every string whose share is below ``min_frac`` is replaced by
    ``other_label``. The column is returned with ``category`` dtype.

    Frequencies are computed on the stringified values, i.e. on the same
    representation that is later compared, so numeric columns are grouped
    correctly too. Missing values are left missing (they are neither counted
    as a category nor turned into the text ``"nan"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Columns to process; names not present in ``df`` are ignored.
    min_frac : float, default 0.01
        Minimum share of rows a value needs in order to keep its own label.
    other_label : str, default "__RARE__"
        Replacement label for rare values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns regrouped.
    """
    df = df.copy()
    n = len(df)
    for c in cols:
        if c not in df.columns:
            continue
        column = df[c]
        # String labels for present values, NaN where the value is missing.
        labels = column.astype(str).where(column.notna())
        freq = labels.value_counts() / n
        rare_vals = freq.index[freq < min_frac]
        # Missing labels are never "in" rare_vals, so they pass through as-is.
        grouped = labels.where(~labels.isin(rare_vals), other_label)
        df[c] = grouped.astype('category')
    return df

high_card_cols = [c for c in ['Region_Code','Policy_Sales_Channel'] if c in train.columns]

# Decide which categories are rare from TRAIN only.
train = group_rare_categories(train, high_card_cols, RARE_THRESH)

# Apply train's decision to test: a test value keeps its label only if that
# label survived in train; everything else becomes "__RARE__". Grouping test by
# its own frequencies could label the same code differently in the two splits.
test = test.copy()
for c in high_card_cols:
    if c not in test.columns:
        continue
    kept = set(train[c].cat.categories) - {"__RARE__"}
    labels = test[c].astype(str).where(test[c].notna())  # missing stays missing
    test[c] = labels.where(labels.isin(kept) | labels.isna(), "__RARE__").astype('category')

for c in high_card_cols:
    print(c, "nunique(train):", train[c].nunique())


Region_Code nunique(train): 26
Policy_Sales_Channel nunique(train): 10


In [6]:

# 4) WINSORIZE NUMERICS + PIPELINE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer

# Column roles. Every list is limited to columns that train actually contains.
_train_columns = set(train.columns)


def _only_present(candidates):
    """Keep, in their given order, the candidate names found in train."""
    return [name for name in candidates if name in _train_columns]


id_col = 'id'

# The target is optional: None when train has no 'Response' column.
target_col = None
if 'Response' in _train_columns:
    target_col = 'Response'

numeric_cols = _only_present(['Age', 'Annual_Premium', 'Vintage'])
binary_pass = _only_present(['Driving_License', 'Previously_Insured'])
cat_cols = _only_present(
    ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
)

def winsorize_series(s, lower=0.01, upper=0.99):
    """Clip a Series to the band between two of its own quantiles.

    Parameters
    ----------
    s : pandas.Series
        Values to clip.
    lower, upper : float
        Quantile levels whose values become the floor and the ceiling.

    Returns
    -------
    pandas.Series
        ``s`` with values below the floor raised to it and values above the
        ceiling lowered to it.
    """
    floor = s.quantile(lower)
    ceiling = s.quantile(upper)
    return s.clip(lower=floor, upper=ceiling)

def apply_winsor(df, cols, lo=0.01, hi=0.99, ref=None):
    """Winsorize selected columns of a DataFrame.

    Each listed column that exists in ``df`` is clipped to the values found at
    the ``lo`` and ``hi`` quantiles. By default those quantiles are taken from
    ``df`` itself. Pass ``ref`` to take them from another frame instead, for
    example to clip a test split at the thresholds of the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clip; it is not modified.
    cols : iterable of str
        Columns to clip; names not present in ``df`` are skipped.
    lo, hi : float
        Lower and upper quantile levels.
    ref : pandas.DataFrame, optional
        Frame the quantiles are computed on. Defaults to ``df``.

    Returns
    -------
    pandas.DataFrame
        Clipped copy of ``df``.

    Raises
    ------
    KeyError
        If ``ref`` is given but lacks a column that is being clipped.
    """
    df = df.copy()
    source = df if ref is None else ref
    for c in cols:
        if c in df.columns:
            lo_value = source[c].quantile(lo)
            hi_value = source[c].quantile(hi)
            df[c] = df[c].clip(lo_value, hi_value)
    return df

# Train is clipped at its own LOW_QUANT / HIGH_QUANT quantiles.
train_w = apply_winsor(train, numeric_cols, LOW_QUANT, HIGH_QUANT)

# Test is clipped at the TRAIN thresholds. Using test's own quantiles would map
# the same raw value to different clipped values in the two splits.
test_w = test.copy()
for c in numeric_cols:
    if c in test_w.columns:
        test_w[c] = test_w[c].clip(train[c].quantile(LOW_QUANT),
                                   train[c].quantile(HIGH_QUANT))

# Numeric branch: fill gaps with the median, then scale with RobustScaler.
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", RobustScaler())
])

# Version-safe OHE: try the `sparse_output` keyword first and fall back to
# `sparse` when the installed scikit-learn rejects it with a TypeError.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", ohe)
])

# Binary flags are passed through untouched; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, numeric_cols),
        ("cat", categorical_tf, cat_cols),
        ("passthrough", "passthrough", binary_pass),
    ],
    remainder="drop"
)

# Features = winsorized train minus the id and (if present) the target.
X_train = train_w.drop(columns=[c for c in [id_col, target_col] if c in train_w.columns])
y_train = train_w[target_col].values if target_col else None

preprocess.fit(X_train, y_train)
print("Fitted preprocessing pipeline.")


Fitted preprocessing pipeline.


In [7]:

# 5) TRANSFORM & SAVE
import joblib

Xtr = preprocess.transform(X_train)
Xte = preprocess.transform(test_w.drop(columns=[id_col]))

# feature names
try:
    feat_names = preprocess.get_feature_names_out().tolist()
except Exception:
    # Fallback when get_feature_names_out is unavailable: rebuild the names in
    # the ColumnTransformer's output order (numeric, one-hot, passthrough).
    # Note the fallback uses a "<col>__<category>" naming scheme.
    ohe_step = preprocess.named_transformers_['cat'].named_steps['ohe']
    ohe_cols = []
    for i, col in enumerate(cat_cols):
        cats = ohe_step.categories_[i]
        ohe_cols += [f"{col}__{c}" for c in cats]
    feat_names = numeric_cols + ohe_cols + binary_pass

# Store the names as a plain unicode array. An object array would be pickled
# inside the .npz and could only be read back with np.load(allow_pickle=True).
feat_names_arr = np.array(feat_names, dtype=str)

# Only include y when a target exists; y=None would be stored as a pickled object.
train_arrays = {"X": Xtr}
if y_train is not None:
    train_arrays["y"] = y_train
train_arrays.update(feature_names=feat_names_arr, ids=train_w[id_col].values)
np.savez(os.path.join(OUTPUT_DIR, "train_model_ready_arrays.npz"), **train_arrays)

np.savez(os.path.join(OUTPUT_DIR, "test_model_ready_arrays.npz"),
         X=Xte, feature_names=feat_names_arr,
         ids=test_w[id_col].values)

# Labelled DataFrame views of the transformed matrices (id first, target last).
tr_df = pd.DataFrame(Xtr, columns=feat_names)
tr_df.insert(0, 'id', train_w[id_col].values)
if target_col:
    tr_df['Response'] = y_train
te_df = pd.DataFrame(Xte, columns=feat_names)
te_df.insert(0, 'id', test_w[id_col].values)

# CSV samples are capped at the first 20,000 rows; the .npz files hold everything.
tr_df.head(20000).to_csv(os.path.join(OUTPUT_DIR, "train_model_ready_sample.csv"), index=False)
te_df.head(20000).to_csv(os.path.join(OUTPUT_DIR, "test_model_ready_sample.csv"), index=False)

# NOTE: the saved object covers imputation, scaling and one-hot encoding only.
# Rare-category grouping and winsorization happen outside it, so new data must
# go through those steps before this pipeline is applied.
joblib.dump(preprocess, os.path.join(OUTPUT_DIR, "preprocess_pipeline.joblib"))

# Cleaned frames after rare-category grouping (not winsorized).
train.to_csv(os.path.join(OUTPUT_DIR, "train_clean_with_rare.csv"), index=False)
test.to_csv(os.path.join(OUTPUT_DIR, "test_clean_with_rare.csv"), index=False)

print("Saved to:", OUTPUT_DIR)


Saved to: ./preprocessed_outputs


In [8]:

# 6) QUICK CHECKS
# Report the class shares of the target when train carries one.
if 'Response' not in train.columns:
    print("No target column detected in train.")
else:
    print("Target balance (train):")
    class_share = train['Response'].value_counts(normalize=True).rename('proportion')
    print(class_share)

# Last expression of the cell: preview the model-ready training frame.
tr_df.head()


Target balance (train):
Response
0    0.877437
1    0.122563
Name: proportion, dtype: float64


Unnamed: 0,id,num__Age,num__Annual_Premium,num__Vintage,cat__Gender_Female,cat__Gender_Male,cat__Vehicle_Age_1-2 Year,cat__Vehicle_Age_< 1 Year,cat__Vehicle_Age_> 2 Years,cat__Vehicle_Damage_No,...,cat__Policy_Sales_Channel_152.0,cat__Policy_Sales_Channel_154.0,cat__Policy_Sales_Channel_156.0,cat__Policy_Sales_Channel_157.0,cat__Policy_Sales_Channel_160.0,cat__Policy_Sales_Channel_26.0,cat__Policy_Sales_Channel___RARE__,passthrough__Driving_License,passthrough__Previously_Insured,Response
0,1,0.333333,0.585862,0.434483,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,2,1.666667,0.124508,0.2,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
2,3,0.458333,0.441814,-0.875862,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,4,-0.625,-0.203401,0.337931,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
4,5,-0.291667,-0.278293,-0.793103,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
