## Exercise 2: Use Gradient Boost for Regression

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e12/overview



In [18]:
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils.fixes import parse_version

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Dataset
Train, test and sample submission file can be found in this link
https://www.kaggle.com/competitions/playground-series-s4e12/data

## 1. Load the Data

In [19]:
dftrain = pd.read_csv("train.csv")
dftrain.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [20]:
dftest = pd.read_csv("test.csv")

In [21]:
sf=pd.read_csv("sample_submission.csv")

## 2. Perform Data preprocessing

In [22]:
# Define features (X) and target (y)
X = dftrain.drop(columns=['id','Premium Amount']) # Assuming 'id' and 'target' are not features
y = dftrain['Premium Amount']

# Identify categorical and numerical features
categorical_features = ['Marital Status', 'Education Level', 'Occupation','Location', 'Customer Feedback', 'Smoking Status','Exercise Frequency', 'Property Type']  # Add all your categorical features here
numerical_features = X.select_dtypes(include=['number']).columns.tolist()

# Create a ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
    ])

# Fit and transform the data
X_scaled = preprocessor.fit_transform(X)
X_test_scaled = preprocessor.transform(dftest.drop(columns=['id'])) # Scale test data using the same preprocessor

# Get feature names after OneHotEncoding
feature_names = numerical_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

# Convert scaled data back to DataFrame (optional but recommended)
X_scaled = pd.DataFrame(X_scaled, columns=feature_names)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)

## 3. Create a Pipeline

In [None]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the imputer to fill missing values with the mean
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')  # or strategy='median', etc.

# Fit the imputer on the training data and transform both training and validation data
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Initialize and train the Gradient Boosting Regressor with specified hyperparameters
params = {
    'n_estimators': 500,  # Number of boosting stages
    'max_depth': 4,       # Maximum depth of individual trees
    'min_samples_split': 5, # Minimum number of samples required to split an internal node
    'learning_rate': 0.01, # Step size shrinkage used in update to prevent overfitting
    'loss': 'squared_error' # Loss function to be optimized
}

gb_model = GradientBoostingRegressor(**params)
gb_model.fit(X_train, y_train)

In [23]:
# Create a pipeline with an imputer for numerical and categorical features
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Use the preprocessor defined earlier
    ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)), # Add SimpleImputer after preprocessing
    ('regressor', GradientBoostingRegressor()) # Add your regressor
])

# Split data into training and validation sets before preprocessing
# This ensures the preprocessor is applied correctly within the pipeline during training
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in y_train and y_val before fitting the model
# Use SimpleImputer with the same strategy as in the pipeline, but without add_indicator=True for y
imputer_y = SimpleImputer(strategy='most_frequent')  # Create a separate imputer for y without add_indicator
y_train = imputer_y.fit_transform(y_train.values.reshape(-1, 1)) # Fit and transform y_train
y_val = imputer_y.transform(y_val.values.reshape(-1, 1)) # Transform y_val

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train.ravel()) # Use ravel() to flatten y_train if necessary (it might not be needed here)


# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Evaluate the model (example: Mean Squared Error)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")

# Predict on the test set using the fitted pipeline
test_pred = pipeline.predict(dftest.drop(columns=['id'])) # Predict on original test data

Mean Squared Error: 725363.0472368452


## 4. Train the Model

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in y_train and y_val before fitting the model
# Use SimpleImputer with the same strategy as in the pipeline, but without add_indicator=True for y
imputer_y = SimpleImputer(strategy='most_frequent')  # Create a separate imputer for y without add_indicator
y_train = imputer_y.fit_transform(y_train.values.reshape(-1, 1)) # Fit and transform y_train
y_val = imputer_y.transform(y_val.values.reshape(-1, 1)) # Transform y_val

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train.ravel()) # Use ravel() to flatten y_train if necessary (it might not be needed here)


# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Evaluate the model (example: Mean Squared Error)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")

# Predict on the test set using the fitted pipeline
test_pred = pipeline.predict(dftest.drop(columns=['id'])) # Predict on original test data

Mean Squared Error: 725363.0472368451


## 5. Evaluate the Model

In [25]:
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 725363.0472368451


## Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [27]:
# Create the submission DataFrame
submission_df = pd.DataFrame({'id': dftest['id'], 'Premium Amount': test_pred})

# Save the predictions to a CSV file
submission_df.to_csv('submission_file.csv', index=False)