In [3]:
import pandas as pd
# Load the health records into a DataFrame. The relative path is resolved
# against the kernel's current working directory.
health_data = pd.read_csv(filepath_or_buffer='health_data.csv')

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# ---------------------------------------------------------------------------
# Premium regression: fit an XGBoost regressor that predicts `monthly_premium`
# from the vital-sign columns plus a hypertension flag, then report hold-out
# RMSE / R2, per-feature importances and a few sample predictions.
# Relies on `health_data` having been loaded by an earlier cell.
# ---------------------------------------------------------------------------

# Derive a binary indicator (1/0) for whether the primary diagnosis text
# mentions "Hypertension". This is a single flag, not a full one-hot encoding
# of the diagnosis column. Values are stringified first so that missing or
# non-string entries simply yield 0; the match is case-sensitive and literal.
health_data['has_hypertension'] = (
    health_data['medicalVisit_diagnosis_primary']
    .astype(str)
    .str.contains('Hypertension', regex=False, na=False)
    .astype(int)
)

# Model inputs: five vital-sign measurements followed by the derived flag.
# The order here is the column order of X and therefore the order of
# `model.feature_importances_` used further down.
features = [
    'healthCheckup_vitalSigns_bmi',
    'healthCheckup_vitalSigns_bloodPressure_systolic',
    'healthCheckup_vitalSigns_bloodPressure_diastolic',
    'healthCheckup_vitalSigns_pulseRate',
    'healthCheckup_vitalSigns_temperature',
    'has_hypertension',
]

# Design matrix and regression target.
X = health_data[features]
y = health_data['monthly_premium']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient-boosted trees with a squared-error objective.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back in the units of the target
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: ${rmse:.2f}")
print(f"R2 Score: {r2:.4f}")

# Pair each input column with the importance reported by the fitted model,
# most important first.
importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
})
importance = importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(importance)

# Side-by-side view of the first five test rows. `y_test.head()` supplies the
# original row index; `y_pred[:5]` lines up with it positionally.
print("\nSample Predictions vs Actual:")
sample_comparison = pd.DataFrame({
    'Actual': y_test.head(),
    'Predicted': y_pred[:5]
})
print(sample_comparison)

Model Performance:
RMSE: $76.32
R2 Score: 0.5127

Feature Importance:
                                            feature  importance
5                                  has_hypertension    0.912061
0                      healthCheckup_vitalSigns_bmi    0.033982
3                healthCheckup_vitalSigns_pulseRate    0.014863
4              healthCheckup_vitalSigns_temperature    0.014345
2  healthCheckup_vitalSigns_bloodPressure_diastolic    0.013261
1   healthCheckup_vitalSigns_bloodPressure_systolic    0.011487

Sample Predictions vs Actual:
     Actual   Predicted
521  508.79  575.696472
737  546.98  581.778931
740  631.90  645.810730
660  502.51  510.175781
411  467.96  562.968506
