# In [None]:
import pandas as pd
import numpy as np
from math import log2
from collections import Counter
import pprint

# --- Load & preprocess ---
# NOTE: the dataset path below is machine-specific; adjust it when running elsewhere.
df = pd.read_csv("C:\\Users\\arun2\\Downloads\\car_insurance_claim\\car_insurance_claim.csv")
# Drop the ID and BIRTH columns; they are not used as split attributes.
df.drop(columns=["ID", "BIRTH"], inplace=True)

# Clean currency columns: strip "$" and "," so the values parse as floats.
# Strings left empty after stripping are turned into NaN before the cast.
currency_cols = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']
for col in currency_cols:
    df[col] = df[col].replace(r'[\$,]', '', regex=True).replace('', np.nan).astype(float)

# Impute missing numeric values with the per-column median.
df.fillna(df.median(numeric_only=True), inplace=True)

# Convert numerics to categorical via binning (at most 3 quantile bins).
# The label column must NOT be binned: for a 0/1 column the deduplicated
# quantile edges collapse to [0, 1], so qcut would map every row to bin 0
# and the target would become constant.
label_col = 'CLAIM_FLAG'
numeric_cols = [
    name for name in df.select_dtypes(include=[np.number]).columns
    if name != label_col
]
for col in numeric_cols:
    df[col] = pd.qcut(df[col], q=3, duplicates='drop', labels=False)

# Encode categoricals as integer codes (pandas assigns -1 to missing values).
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category').cat.codes

# --- ID3 functions ---

def entropy(data, target):
    """Return the Shannon entropy, in bits, of column ``target`` in ``data``.

    ``data`` is a pandas DataFrame; ``target`` names the column whose value
    distribution is measured. A column with a single distinct value yields 0.
    """
    class_probs = data[target].value_counts(normalize=True)
    # Skip zero probabilities so log2 is never evaluated at 0.
    terms = [prob * log2(prob) for prob in class_probs if prob > 0]
    return -sum(terms)

def info_gain(data, feature, target):
    """Return the information gain from splitting ``data`` on ``feature``.

    The gain is the entropy of ``target`` over all rows minus the
    size-weighted entropy of ``target`` within each distinct value of
    ``feature``.
    """
    n_rows = len(data)
    branch_terms = []
    for level in data[feature].unique():
        # Rows that take this particular value of the feature.
        branch = data[data[feature] == level]
        branch_terms.append((len(branch) / n_rows) * entropy(branch, target))
    return entropy(data, target) - sum(branch_terms)

def id3(data, features, target):
    """Recursively build an ID3 decision tree.

    ``data`` is a pandas DataFrame, ``features`` a sequence of candidate
    column names and ``target`` the label column. Returns either an ``int``
    class label (leaf) or a one-key dict ``{feature: {value: subtree}}``.
    """
    labels = data[target]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return int(labels.iloc[0])

    # Nothing left to split on: fall back to the most frequent label.
    if len(features) == 0:
        return int(labels.mode()[0])

    scores = [info_gain(data, name, target) for name in features]
    split_on = features[np.argmax(scores)]
    # The chosen attribute is not reused further down this path.
    remaining = [name for name in features if name != split_on]

    branches = {}
    for level in data[split_on].unique():
        part = data[data[split_on] == level]
        if part.empty:
            # No rows matched this value; use the majority label of this node.
            branches[level] = int(labels.mode()[0])
            continue
        branches[level] = id3(part, remaining, target)
    return {split_on: branches}

# --- Predict function ---

def predict(tree, sample):
    """Classify ``sample`` by walking the decision ``tree``.

    ``tree`` is either a leaf label (any non-dict value) or a one-key dict
    ``{feature: {value: subtree}}``. ``sample`` maps feature names to values.

    Returns the leaf label that is reached. If the sample's value for a split
    feature has no branch in the tree, the first branch of that node is used
    as a fallback and the walk continues from there, so the result is always
    a leaf label and never a nested subtree.

    Raises ``KeyError`` if ``sample`` lacks a feature the tree splits on.
    """
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    branches = tree[feature]
    subtree = branches.get(sample[feature])
    if subtree is None:
        # Unseen value: fall back to the first branch, but keep descending.
        # Returning the branch as-is could hand back a dict instead of a label.
        subtree = list(branches.values())[0]
    return predict(subtree, sample)

# --- Run ID3 ---

target = 'CLAIM_FLAG'
# Every column other than the label is a candidate split attribute.
features = [name for name in df.columns if name != target]

tree = id3(df, features, target)

print("🌳 Decision Tree:")
pprint.pprint(tree)

# --- Predict on sample ---
# Classify the first row of the preprocessed frame with the tree just built.
first_row = df.iloc[0]
sample = first_row.to_dict()
prediction = predict(tree, sample)
print("\n🔮 Prediction for first row (CLAIM_FLAG):", prediction)
