In [1]:
# Mount Google Drive into the Colab filesystem; the dataset is read from
# beneath this mount point further down the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd


In [20]:
# Load the insurance dataset from the mounted Drive folder into a DataFrame.
INSURANCE_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/Data_set/insurance.csv"
df = pd.read_csv(INSURANCE_CSV_PATH)



In [21]:
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Prepare Data and Preprocessing
# 'charges' is the regression target; every other column is used as a feature.
X = df.drop('charges', axis=1)
y = df['charges']

# Train/test split: hold out 20% for the final evaluation, with a fixed seed
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing
numeric_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    # handle_unknown='ignore': a category value that was not present when the
    # encoder was fit is encoded as all zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Step 2: Define the XGBoost Model
model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid for tuning (names are XGBRegressor parameters)
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 3, 5]
}

# Step 3: Build the Pipeline
# The search must wrap the pipeline, not the other way round. With the search
# as the last pipeline step, the preprocessor is fit once on all of x_train and
# every CV validation fold is transformed with statistics that include itself.
# Searching over the pipeline re-fits the preprocessor on each training fold.
base_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
MODEL_PARAM_PREFIX = 'model__'
search_space = {MODEL_PARAM_PREFIX + name: values for name, values in param_grid.items()}

# Perform hyperparameter tuning with RandomizedSearchCV (you can switch to GridSearchCV if needed)
search = RandomizedSearchCV(base_pipeline, param_distributions=search_space, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Step 4: Train the Model
search.fit(x_train, y_train)

# With the default refit=True the search re-trains the best candidate on all of
# x_train; that fitted preprocessor + model pipeline is what gets used and saved.
pipeline = search.best_estimator_

# Step 5: Save the Pipeline
# Only the fitted pipeline is persisted, not the whole search object.
with open('xgboost_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Print the best hyperparameters (step prefix stripped so the names match param_grid)
best_params = {name[len(MODEL_PARAM_PREFIX):]: value for name, value in search.best_params_.items()}
print("Best hyperparameters found: ", best_params)

# Step 6: Evaluate the Model
# Predict on the test set
y_pred = pipeline.predict(x_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set: {mse:.4f}")

# Calculate R² score
r2 = r2_score(y_test, y_pred)
print(f"R² score on test set: {r2:.4f}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters found:  {'subsample': 1.0, 'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 1.0}
Mean Squared Error on test set: 17924632.8950
R² score on test set: 0.8845
