# In [None]:
import pandas as pd
import pickle
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score

# =========================
# 1. Load dataset
# =========================
df = pd.read_csv("insurance.csv")  # change to your file path

target_column = "charges"  # set your target
X = df.drop(columns=[target_column])  # feature matrix: every column except the target
y = df[target_column]                 # regression target

# Detect numeric and categorical columns automatically.
# "number" matches every numeric dtype (int32, float32, uint8, ...), not only
# int64/float64. Every remaining column is treated as categorical, so no
# feature is silently discarded by the ColumnTransformer built below (its
# default is remainder='drop', which drops any column not listed).
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric columns:", numeric_features)
print("Categorical columns:", categorical_features)

# =========================
# 2. Preprocessing
# =========================
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

# (name, transformer, columns) triples consumed by the ColumnTransformer.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# =========================
# 3. Split data
# =========================
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# =========================
# 4. XGBoost with Random Search
# =========================
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Preprocessing and model live in one pipeline, so each CV fold fits the
# scaler/encoder on its own training portion only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('xgb', xgb)])

# Candidate hyperparameter values. Keys carry the 'xgb__' prefix so they are
# routed to the pipeline step named 'xgb'.
param_dist = dict(
    xgb__n_estimators=[100, 200, 500, 800],
    xgb__learning_rate=[0.01, 0.05, 0.1, 0.2],
    xgb__max_depth=[3, 5, 7, 10],
    xgb__subsample=[0.6, 0.8, 1.0],
    xgb__colsample_bytree=[0.6, 0.8, 1.0],
    xgb__gamma=[0, 0.1, 0.2, 0.3],
    xgb__min_child_weight=[1, 3, 5],
)

# 30 sampled configurations x 5-fold CV, scored by R², using all CPU cores.
search_settings = {
    'n_iter': 30,
    'cv': 5,
    'verbose': 1,
    'random_state': 42,
    'n_jobs': -1,
    'scoring': 'r2',
}
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    **search_settings,
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best CV R²:", random_search.best_score_)

# Pipeline from the best-scoring configuration found by the search.
best_pipeline = random_search.best_estimator_

# =========================
# 5. Evaluate
# =========================
# Score the tuned pipeline on the held-out test rows.
y_pred = best_pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("Test R²:", test_r2)

# =========================
# 6. Save the model
# =========================
filename = "insurance_xgb_auto.sav"
# Use a context manager so the file is flushed and closed even if pickling
# fails; a bare open() would leave the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)
print(f"Model saved as {filename}")

# =========================
# 7. Load and Predict
# =========================
# NOTE: unpickling executes arbitrary code from the file; only load model
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Example raw input for prediction. Column names must match the training
# features; the pipeline's ColumnTransformer selects columns by name.
sample_data = pd.DataFrame([{
    'age': 55,
    'sex': 'male',
    'bmi': 32.78,
    'children': 4,
    'smoker': 'yes',
    'region': 'southwest'
}])

# predict() returns one value per input row; the sample has a single row.
prediction = loaded_model.predict(sample_data)
print("Predicted charge:", prediction[0])
# Notes:
# - No manual column listing: numeric and categorical features are detected
#   automatically.
# - Fully self-contained pipeline: the .sav file includes preprocessing + model.
# - Raw inputs can be fed for prediction without manual encoding or scaling.