
# EDA-Only Cleaning for `train.csv` and `test.csv` (keeps `sample_submission.csv` untouched)
This notebook expects **three files in the same folder**:
- `train.csv`
- `test.csv`
- `sample_submission.csv` (not modified; just checked for presence)

It performs **EDA-only cleaning** on `train.csv` and `test.csv`:
- standardizes column names (trimmed, lower-cased, spaces replaced by underscores)
- parses obvious dates (if any)
- coerces numeric-like text to numbers
- derives `age` and `num_vintage` if `dob` / `open_date` exist
- fills missing values (numeric → median, categorical → "Unknown")
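- clips common non-negative columns at 0; lightly winsorizes numeric columns to their 1st–99th percentile range (ID-like and low-cardinality columns are skipped)
- drops duplicate rows and constant columns (those with at most one distinct value)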
- saves:
  - `train_clean_basic.csv`
  - `test_clean_basic.csv`

> No train/test splitting, no encoders, no NPZ. Purely EDA-friendly outputs.


In [2]:

from pathlib import Path
import pandas as pd, numpy as np

# Input files, resolved relative to the notebook's working directory.
TRAIN_PATH = Path("train.csv")
TEST_PATH = Path("test.csv")
SUB_PATH = Path("sample_submission.csv")  # only checked for presence, never read

# Columns parsed as dates when they are present in a frame.
DATE_COLS = [
    "dob",
    "open_date",
    "last_activity",
]

# Columns clipped at zero when they are present in a frame.
NON_NEGATIVE_COLS = [
    "price",
    "amount",
    "quantity",
    "revenue",
    "num_vintage",
]

# Identifier-style columns that are excluded from outlier capping.
ID_LIKE_COLS = [
    "id",
    "customer_id",
    "region_id",
    "order_id",
    "policy_id",
]

# Report where we are running and whether each expected file is there.
print("Working dir:", Path.cwd())
print("Expecting files next to this notebook:")
for expected_file in (TRAIN_PATH, TEST_PATH, SUB_PATH):
    print("-", expected_file, "| exists:", expected_file.exists())


Working dir: C:\Users\User\505
Expecting files next to this notebook:
- train.csv | exists: True
- test.csv | exists: True
- sample_submission.csv | exists: True


In [3]:

# Helpers
def normalize_col(c: str) -> str:
    """Return a column label trimmed, lower-cased, with spaces turned into underscores.

    Labels that are not strings (for example integer column positions) are
    returned unchanged.
    """
    if not isinstance(c, str):
        return c
    trimmed = c.strip()
    return trimmed.lower().replace(" ", "_")

def winsorize_series(s: pd.Series, low=0.01, high=0.99):
    """Clip a numeric Series to the range spanned by two of its own quantiles.

    Parameters
    ----------
    s : pd.Series
        Series to cap. Non-numeric, boolean, empty and all-missing Series are
        returned as-is (the same object, not a copy).
    low, high : float
        Lower and upper quantiles (fractions in [0, 1]) used as clip bounds.

    Returns
    -------
    pd.Series
        ``s`` with values below the ``low`` quantile raised to it and values
        above the ``high`` quantile lowered to it.
    """
    dtype = s.dtype
    # pandas' own dtype check is used instead of np.issubdtype because numpy
    # cannot interpret pandas extension dtypes ("Int64", "Float64", "string",
    # "category") and raises TypeError on them. Booleans are excluded so the
    # behaviour for numpy dtypes stays the same as before.
    is_numeric = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
    if not is_numeric or s.isna().all():
        return s
    lo, hi = s.quantile([low, high])
    return s.clip(lower=lo, upper=hi)

def clean_df(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return an EDA-friendly cleaned copy of ``df_raw``; the input is not modified.

    Steps, in order:
      1. Normalize column names with ``normalize_col``.
      2. Parse the columns listed in ``DATE_COLS`` as datetimes (bad values -> NaT).
      3. Convert object columns to numeric when at least 60% of their values
         parse as numbers (thousands separators "," are removed first).
      4. Derive ``age`` (years, capped to 0..120) from ``dob`` and
         ``num_vintage`` (months, floored at 0) from ``open_date``. Both are
         measured against the date on which the function runs, so they change
         from one run date to another.
      5. Turn +/-inf into missing, fill numeric gaps with the column median and
         categorical gaps with "Unknown".
      6. Clip ``NON_NEGATIVE_COLS`` at 0 and winsorize numeric columns to their
         1st-99th percentile, skipping ``ID_LIKE_COLS`` and columns with five
         or fewer distinct values.
      7. Drop duplicate rows and columns with at most one distinct value, then
         print a one-line summary.

    Medians and quantiles are computed from the frame passed in, so two frames
    cleaned separately are imputed and capped with their own statistics.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw frame as loaded from disk.

    Returns
    -------
    pd.DataFrame
        The cleaned frame. The index is not reset after duplicate removal.
    """
    df = df_raw.copy()

    # 1) Standardize column names
    df.columns = [normalize_col(c) for c in df.columns]

    # 2) Parse date columns if present
    for c in DATE_COLS:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")

    # 3) Coerce object columns that are mostly numeric
    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
    for c in obj_cols:
        coerced = pd.to_numeric(df[c].astype(str).str.replace(",", "", regex=False), errors="coerce")
        if coerced.notna().mean() >= 0.60:
            df[c] = coerced

    # 4) Derived fields (optional)
    today = pd.Timestamp.today().normalize()
    if "dob" in df.columns:
        age_years = ((today - df["dob"]).dt.days / 365.25)
        df["age"] = age_years.round().clip(lower=0, upper=120).astype("Int64")
    if "open_date" in df.columns:
        tenure_m = ((today - df["open_date"]).dt.days / 30.44)
        df["num_vintage"] = tenure_m.round(0).clip(lower=0).astype("Int64")

    # 5) Missing values
    # Infinities are treated as missing *before* imputation and capping;
    # otherwise they would feed into the winsorization quantiles.
    df = df.replace([np.inf, -np.inf], np.nan)

    num_cols = df.select_dtypes(include=[np.number, "Float64", "Int64"]).columns.tolist()
    for c in num_cols:
        fill = df[c].median(skipna=True)
        if pd.isna(fill):
            # Entirely missing column: nothing to impute with. It has no
            # distinct values, so step 7 drops it.
            continue
        if pd.api.types.is_integer_dtype(df[c].dtype):
            # A nullable integer column cannot store a fractional median.
            fill = int(round(fill))
        df[c] = df[c].fillna(fill)

    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    for c in cat_cols:
        df[c] = df[c].astype("string").str.strip().fillna("Unknown")

    # 6) Invalids & outliers
    for c in NON_NEGATIVE_COLS:
        # Only clip real numbers; a listed column that stayed text is left alone
        # instead of raising on the string-vs-number comparison.
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            df[c] = df[c].clip(lower=0)

    skip = set(k for k in ID_LIKE_COLS if k in df.columns)
    for c in num_cols:
        # Low-cardinality columns (flags, small codes) are not capped.
        if c not in skip and df[c].nunique(dropna=True) > 5:
            df[c] = winsorize_series(df[c], 0.01, 0.99)

    # 7) Dedup & near-constants
    before = df.shape[0]
    df = df.drop_duplicates()
    removed = before - df.shape[0]

    to_drop = [c for c in df.columns if df[c].nunique(dropna=True) <= 1]
    if to_drop:
        df = df.drop(columns=to_drop)

    print(f"Removed {removed} duplicate rows. Dropped near-constant columns: {to_drop if to_drop else 'None'}")
    return df


In [4]:

# Read train and test; both are mandatory, the sample submission is optional.
required_inputs = (
    (TRAIN_PATH, "train.csv not found next to the notebook."),
    (TEST_PATH, "test.csv not found next to the notebook."),
)
for required_path, missing_message in required_inputs:
    if not required_path.exists():
        raise FileNotFoundError(missing_message)

if not SUB_PATH.exists():
    print("⚠ sample_submission.csv not found; continuing anyway (not required for cleaning).")

train_raw, test_raw = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

print("Raw shapes | train:", train_raw.shape, " test:", test_raw.shape)


Raw shapes | train: (381109, 12)  test: (127037, 11)


In [5]:

# Run the same cleaning routine on each raw frame (train first, then test).
train_clean, test_clean = [clean_df(raw_frame) for raw_frame in (train_raw, test_raw)]

print("Clean shapes | train:", train_clean.shape, " test:", test_clean.shape)


Removed 0 duplicate rows. Dropped near-constant columns: None
Removed 0 duplicate rows. Dropped near-constant columns: None
Clean shapes | train: (381109, 12)  test: (127037, 11)


In [6]:

# Write the cleaned frames next to the notebook (index column omitted).
train_out, test_out = Path("train_clean_basic.csv"), Path("test_clean_basic.csv")

for cleaned_frame, destination in ((train_clean, train_out), (test_clean, test_out)):
    cleaned_frame.to_csv(destination, index=False)

for destination in (train_out, test_out):
    print(f"✅ Saved: {destination.resolve()}")

# Show the first five rows of each cleaned frame.
from IPython.display import display
for cleaned_frame in (train_clean, test_clean):
    display(cleaned_frame.head(5))


✅ Saved: C:\Users\User\505\train_clean_basic.csv
✅ Saved: C:\Users\User\505\test_clean_basic.csv


Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


Unnamed: 0,id,gender,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,8.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,296
