# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

# Load the dataset and drop the ID and BIRTH columns so they are not carried
# into the feature matrices built further down.
df = pd.read_csv("car_insurance_claim.csv")
data = df.drop(columns=['ID', 'BIRTH'])

# Clean currency fields: strip "$" and "," so the values can be cast to float.
# The pattern is written as a raw string because "\$" is not a valid string
# escape; a non-raw literal triggers an invalid-escape warning on newer Pythons.
currency_cols = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']
for col in currency_cols:
    data[col] = (
        data[col]
        .replace(r'[\$,]', '', regex=True)
        .replace('', np.nan)  # a cell left empty after stripping counts as missing
        .astype(float)
    )

# Fill missing numeric values with each column's median.
# NOTE(review): the medians are computed over the full dataset (before any
# train/test split), and CLM_AMT -- later used as a regression target -- is
# imputed here too. Confirm this is intended.
data.fillna(data.median(numeric_only=True), inplace=True)

# Encode categoricals: every object-dtype column is replaced by integer codes.
# A fresh encoder is fitted per column; values are cast to str first so the
# encoder always receives uniform string input.
cat_cols = data.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# -------------------------
# 1. Claim Amount Prediction (XGBoost Regression)
# -------------------------
# Features exclude both targets. CLAIM_FLAG is the label predicted in the
# classification section; leaving it in would leak outcome information into the
# regressor (presumably it marks whether a claim, and hence a non-zero CLM_AMT,
# exists -- verify against the data dictionary).
X_reg = data.drop(columns=['CLM_AMT', 'CLAIM_FLAG'])
y_reg = data['CLM_AMT']

# 80/20 hold-out split with a fixed seed so the reported metric is reproducible.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=0
)

# Explicit seed for consistency with the split and the random forest below.
xgb_model = XGBRegressor(n_estimators=50, max_depth=5, verbosity=0, random_state=0)
xgb_model.fit(X_train_reg, y_train_reg)
y_pred_reg = xgb_model.predict(X_test_reg)

# RMSE on the held-out set, in the same units as CLM_AMT.
reg_rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

# -------------------------
# 2. Fraud Detection / Claim Approval (Random Forest Classification)
# -------------------------
# Features exclude both targets. CLM_AMT is the claim amount predicted in the
# regression section; leaving it in would leak outcome information into the
# classifier (presumably a non-zero amount implies CLAIM_FLAG is set -- verify
# against the data dictionary), inflating the reported accuracy.
X_clf = data.drop(columns=['CLAIM_FLAG', 'CLM_AMT'])
y_clf = data['CLAIM_FLAG']

# 80/20 hold-out split with a fixed seed so the reported metrics are reproducible.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=0
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train_clf, y_train_clf)
y_pred_clf = rf_model.predict(X_test_clf)

# Overall accuracy plus the per-class precision/recall/F1 text report.
clf_acc = accuracy_score(y_test_clf, y_pred_clf)
clf_report = classification_report(y_test_clf, y_pred_clf)

# Report the hold-out metrics: RMSE rounded to two decimals, accuracy as a
# percentage rounded to two decimals, then the full classification report.
rmse_rounded = round(reg_rmse, 2)
accuracy_pct = round(clf_acc * 100, 2)

print("🔢 Claim Amount Prediction RMSE:", rmse_rounded)
print("✅ Fraud Detection Accuracy:", accuracy_pct, "%")
print("\n📊 Classification Report:\n", clf_report)
