In [4]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from kagglex_cohort4 import *

from xgboost import XGBRegressor

In [5]:
# Read the training CSV using the pyarrow parser and pyarrow-backed dtypes.
url = "train.csv"
raw = pd.read_csv(url, engine="pyarrow", dtype_backend="pyarrow")
# clean_housing is not defined in this notebook; presumably it arrives via the
# star import of kagglex_cohort4. Its cleaning steps are not visible here.
# NOTE(review): a "housing" cleaner producing a frame named `cars` — confirm
# this is the intended helper.
cars = clean_housing(raw)

In [6]:
# Separate the prediction target from the feature columns.
target = "price"
# Compare names for equality. `col not in target` would be a *substring* test
# against the string "price" and would silently drop any column whose name
# happens to be a substring of it.
features = [col for col in cars.columns if col != target]
# NOTE(review): `id` stays in the feature set (see the X_train.info() listing);
# confirm a row identifier is really meant to be a predictor.

X = cars[features]
y = cars[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (54273, 12)
y shape: (54273,)


In [7]:
# Hold out a quarter of the rows for validation; the seed keeps the split
# repeatable across runs. test_size=0.25 is scikit-learn's default, spelled
# out here so the ratio is visible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=43,
)

# Sanity-check the resulting partition sizes.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (40704, 12)
y_train shape: (40704,)
X_val shape: (13569, 12)
y_val shape: (13569,)


In [8]:
# Show column dtypes and non-null counts for the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 11257 to 14148
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   id            40704 non-null  uint16[pyarrow]
 1   brand         40704 non-null  category       
 2   model         40704 non-null  category       
 3   model_year    40704 non-null  uint16[pyarrow]
 4   milage        40704 non-null  uint32[pyarrow]
 5   fuel_type     40704 non-null  category       
 6   engine        40704 non-null  category       
 7   transmission  40704 non-null  category       
 8   ext_col       40704 non-null  category       
 9   int_col       40704 non-null  category       
 10  accident      40704 non-null  category       
 11  clean_title   40704 non-null  category       
dtypes: category(9), uint16[pyarrow](2), uint32[pyarrow](1)
memory usage: 1.3 MB


In [9]:
# Partition the feature columns by dtype so each group can be routed to its
# own transformer in the ColumnTransformer.
cat = X.select_dtypes(include=['category']).columns.tolist()
num = X.select_dtypes(include=['number']).columns.tolist()

In [10]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardize the numeric ones.
#
# OneHotEncoder here is the category_encoders class (see the imports), not
# scikit-learn's. Its documented handle_unknown options are 'error',
# 'return_nan', 'value' and 'indicator'; 'ignore' is the scikit-learn spelling
# and is not one of them. 'value' is the documented equivalent: categories
# unseen during fit are encoded as 0 in every dummy column.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='value'), cat),
        ('num', StandardScaler(), num)
    ]
)

In [11]:
# End-to-end model: the column preprocessor feeding an XGBoost regressor
# trained with squared-error loss and a fixed seed.
# NOTE(review): tree_method='gpu_hist' and predictor='gpu_predictor' require a
# CUDA-capable environment and are, as far as I know, deprecated in
# XGBoost >= 2.0 in favour of tree_method='hist', device='cuda' — confirm
# against the installed version.
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (
        'regressor',
        XGBRegressor(
            objective='reg:squarederror',
            random_state=43,
            tree_method='gpu_hist',
            predictor='gpu_predictor',
        ),
    ),
])

In [9]:
# Sweep 1: vary only the number of boosting rounds; every other XGBoost
# setting stays at the value configured on the pipeline.
param_grid = {'regressor__n_estimators': [2, 25, 50, 75, 100, 200, 300]}

In [10]:
# Run the project's hyperparameter_search helper in grid mode (GridSearchCV,
# per the original note) over the current param_grid on the training split.
best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
Best parameters found: {'regressor__n_estimators': 2}
Best score (negative RMSE): 80691.0645083222


In [11]:
# Sweep 2: tree depth, with n_estimators pinned to the winner of the
# previous sweep.
param_grid = {
    'regressor__n_estimators': [2],            # pinned
    'regressor__max_depth': [1, 3, 5, 7],      # swept: maximum depth of a tree
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters found: {'regressor__max_depth': 1, 'regressor__n_estimators': 2}
Best score (negative RMSE): 76789.43671946565


In [12]:
# Sweep 3: learning rate, with n_estimators and max_depth pinned to the
# winners of the earlier sweeps.
param_grid = {
    'regressor__n_estimators': [2],    # pinned
    'regressor__max_depth': [1],       # pinned
    # swept: step size applied at each boosting iteration
    'regressor__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
}
best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters found: {'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__n_estimators': 2}
Best score (negative RMSE): 74483.83900231989


In [13]:
# Sweep 4: min_child_weight, everything tuned so far held fixed.
param_grid = {
    'regressor__n_estimators': [2],       # pinned
    'regressor__max_depth': [1],          # pinned
    'regressor__learning_rate': [0.5],    # pinned
    # swept: minimum sum of instance weight required in a child
    'regressor__min_child_weight': [1, 2, 3, 4, 5],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters found: {'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2}
Best score (negative RMSE): 74483.83900231989


In [14]:
# Sweep 5: row subsampling, everything tuned so far held fixed.
param_grid = {
    'regressor__n_estimators': [2],        # pinned
    'regressor__max_depth': [1],           # pinned
    'regressor__learning_rate': [0.5],     # pinned
    'regressor__min_child_weight': [1],    # pinned
    # swept: fraction of training rows sampled for each tree
    'regressor__subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters found: {'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74483.83900231989


In [15]:
# Sweep 6: per-tree column subsampling, everything tuned so far held fixed.
param_grid = {
    'regressor__n_estimators': [2],        # pinned
    'regressor__max_depth': [1],           # pinned
    'regressor__learning_rate': [0.5],     # pinned
    'regressor__min_child_weight': [1],    # pinned
    'regressor__subsample': [1.0],         # pinned
    # swept: fraction of features sampled for each tree
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters found: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74483.83900231989


In [16]:
# Sweep 7: the three column-subsampling knobs searched jointly
# (3 x 6 x 6 = 108 candidates); colsample_bytree is re-swept here rather than
# pinned.
param_grid = {
    'regressor__n_estimators': [2],        # pinned
    'regressor__max_depth': [1],           # pinned
    'regressor__learning_rate': [0.5],     # pinned
    'regressor__min_child_weight': [1],    # pinned
    'regressor__subsample': [1.0],         # pinned
    # swept: fraction of features sampled per tree / per level / per node
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
    'regressor__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'regressor__colsample_bynode': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found: {'regressor__colsample_bylevel': 0.8, 'regressor__colsample_bynode': 0.8, 'regressor__colsample_bytree': 1.0, 'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74471.13020770096


In [18]:
# Sweep 8: gamma, with the column-subsampling fractions pinned to the winners
# of the joint sweep.
param_grid = {
    'regressor__n_estimators': [2],          # pinned
    'regressor__max_depth': [1],             # pinned
    'regressor__learning_rate': [0.5],       # pinned
    'regressor__min_child_weight': [1],      # pinned
    'regressor__subsample': [1.0],           # pinned
    'regressor__colsample_bytree': [1.0],    # pinned
    'regressor__colsample_bylevel': [0.8],   # pinned
    'regressor__colsample_bynode': [0.8],    # pinned
    # swept: minimum loss reduction required for a further partition
    'regressor__gamma': [0, 0.1, 0.2],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters found: {'regressor__colsample_bylevel': 0.8, 'regressor__colsample_bynode': 0.8, 'regressor__colsample_bytree': 1.0, 'regressor__gamma': 0, 'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74471.13020770096


In [19]:
# Sweep 9: L2 regularization strength, everything tuned so far held fixed.
param_grid = {
    'regressor__n_estimators': [2],          # pinned
    'regressor__max_depth': [1],             # pinned
    'regressor__learning_rate': [0.5],       # pinned
    'regressor__min_child_weight': [1],      # pinned
    'regressor__subsample': [1.0],           # pinned
    'regressor__colsample_bytree': [1.0],    # pinned
    'regressor__colsample_bylevel': [0.8],   # pinned
    'regressor__colsample_bynode': [0.8],    # pinned
    'regressor__gamma': [0],                 # pinned
    # swept: L2 regularization term on weights
    'regressor__reg_lambda': [1, 1.5, 2],
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters found: {'regressor__colsample_bylevel': 0.8, 'regressor__colsample_bynode': 0.8, 'regressor__colsample_bytree': 1.0, 'regressor__gamma': 0, 'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__reg_lambda': 1, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74471.13020770096


In [22]:
# Final configuration: every parameter is a single value, so the "search"
# evaluates exactly one candidate. reg_alpha is set to 0 here without having
# been swept. Defaults quoted below are the XGBoost defaults.
param_grid = {
    # Boosting schedule and tree shape
    'regressor__n_estimators': [2],          # boosting rounds (default 100)
    'regressor__max_depth': [1],             # maximum tree depth (default 6)
    'regressor__learning_rate': [0.5],       # step size per iteration (default 0.3)
    'regressor__min_child_weight': [1],      # min sum of instance weight (hessian) in a child (default 1)
    # Row / column subsampling
    'regressor__subsample': [1.0],           # fraction of rows per tree (default 1.0)
    'regressor__colsample_bytree': [1.0],    # fraction of features per tree (default 1.0)
    'regressor__colsample_bylevel': [0.8],   # fraction of features per level (default 1.0)
    'regressor__colsample_bynode': [0.8],    # fraction of features per node (default 1.0)
    # Regularization
    'regressor__gamma': [0],                 # min loss reduction to split a leaf (default 0)
    'regressor__reg_lambda': [1],            # L2 term on weights (default 1)
    'regressor__reg_alpha': [0],             # L1 term on weights (default 0)
}

best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='grid',
)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters found: {'regressor__colsample_bylevel': 0.8, 'regressor__colsample_bynode': 0.8, 'regressor__colsample_bytree': 1.0, 'regressor__gamma': 0, 'regressor__learning_rate': 0.5, 'regressor__max_depth': 1, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 2, 'regressor__reg_alpha': 0, 'regressor__reg_lambda': 1, 'regressor__subsample': 1.0}
Best score (negative RMSE): 74471.13020770096


The parameters selected by the step-by-step grid search above, with comments describing what each one does and its XGBoost default value:

```python
param_grid = {
    'regressor__n_estimators': [2],  # Number of boosting rounds (Default: 100)
    'regressor__max_depth': [1],  # Maximum depth of a tree (Default: 6)
    'regressor__learning_rate': [0.5],  # Controls the step size at each iteration (Default: 0.3)
    'regressor__min_child_weight': [1],  # Minimum sum of instance weight (hessian) in a child (Default: 1)
    'regressor__subsample': [1.0],  # Fraction of training data sampled for each tree (Default: 1.0)
    'regressor__colsample_bytree': [1.0],  # Fraction of features sampled for each tree (Default: 1.0)
    'regressor__colsample_bylevel': [0.8],  # Fraction of features sampled for each level (Default: 1.0)
    'regressor__colsample_bynode': [0.8],  # Fraction of features sampled for each node (Default: 1.0)
    'regressor__gamma': [0],  # Minimum loss reduction required to make a further partition on a leaf node (Default: 0)
    'regressor__reg_lambda': [1],  # L2 regularization term on weights (Default: 1)
    'regressor__reg_alpha': [0]  # L1 regularization term on weights (Default: 0)
}
```

Here's a breakdown of each parameter with its default value and description:

- **`n_estimators`**: Specifies the number of boosting rounds. Default value is 100.
- **`max_depth`**: Determines the maximum depth of the trees. Default value is 6.
- **`learning_rate`**: Also known as eta, it controls the step size at each iteration. Default value is 0.3.
- **`min_child_weight`**: Minimum sum of instance weight (hessian) needed in a child. Default value is 1.
- **`subsample`**: Fraction of the training data to be randomly sampled for each tree. Default value is 1.0.
- **`colsample_bytree`**: Fraction of features to be randomly sampled for each tree. Default value is 1.0.
- **`colsample_bylevel`**: Fraction of features to be randomly sampled for each level. Default value is 1.0.
- **`colsample_bynode`**: Fraction of features to be randomly sampled for each node. Default value is 1.0.
- **`gamma`**: Minimum loss reduction required to make a further partition on a leaf node. Default value is 0.
- **`reg_lambda`**: L2 regularization term on leaf weights (analogous to the penalty in ridge regression). Default value is 1.
- **`reg_alpha`**: L1 regularization term on leaf weights (analogous to the penalty in Lasso regression). Default value is 0.

## RandomizedSearchCV

In [12]:
# Candidate values for the randomized search: 20 combinations are drawn from
# this space (n_iter=20) instead of enumerating the full grid.
# NOTE(review): no seed is passed to hyperparameter_search here; whether the
# sampled candidates are reproducible depends on that helper — confirm.
# NOTE(review): the result is stored in `best_model_grid` even though this is
# the random strategy; the name is kept as-is.
param_grid = {
    # Boosting schedule and tree shape
    'regressor__n_estimators': list(range(50, 501, 50)),    # 50, 100, ..., 500 (default 100)
    'regressor__max_depth': list(range(1, 11)),              # 1..10 (default 6)
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],  # default 0.3
    'regressor__min_child_weight': list(range(1, 11)),       # 1..10 (default 1)
    # Row / column subsampling (all default 1.0)
    'regressor__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bylevel': [0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bynode': [0.6, 0.7, 0.8, 0.9, 1.0],
    # Regularization
    'regressor__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],        # default 0
    'regressor__reg_lambda': [0.1, 0.5, 1, 1.5, 2],          # L2 term (default 1)
    'regressor__reg_alpha': [0, 0.1, 0.5, 1, 1.5, 2],        # L1 term (default 0)
}
best_model_grid = hyperparameter_search(
    X_train=X_train,
    y_train=y_train,
    estimator=xgb_pipe,
    param_grid=param_grid,
    search_strategy='random',
    n_iter=20,
)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters found: {'regressor__subsample': 0.8, 'regressor__reg_lambda': 0.1, 'regressor__reg_alpha': 1, 'regressor__n_estimators': 500, 'regressor__min_child_weight': 9, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.05, 'regressor__gamma': 0.1, 'regressor__colsample_bytree': 0.7, 'regressor__colsample_bynode': 0.7, 'regressor__colsample_bylevel': 0.8}
Best score (negative RMSE): 76143.56330864866


In [14]:
# Best combination reported by the randomized search above, transcribed by
# hand as scalar values (not candidate lists) keyed by pipeline-step-prefixed
# parameter names.
rscv_params = {
    # Boosting schedule and tree shape
    'regressor__n_estimators': 500,         # boosting rounds
    'regressor__max_depth': 6,              # maximum tree depth
    'regressor__learning_rate': 0.05,       # step size per iteration
    'regressor__min_child_weight': 9,       # min sum of instance weight in a child
    # Row / column subsampling
    'regressor__subsample': 0.8,            # fraction of rows per tree
    'regressor__colsample_bytree': 0.7,     # fraction of features per tree
    'regressor__colsample_bylevel': 0.8,    # fraction of features per level
    'regressor__colsample_bynode': 0.7,     # fraction of features per node
    # Regularization
    'regressor__gamma': 0.1,                # min loss reduction to split a leaf
    'regressor__reg_lambda': 0.1,           # L2 term on weights
    'regressor__reg_alpha': 1,              # L1 term on weights
}


In [None]:
# Alternative XGBoost settings using bare parameter names (no 'regressor__'
# pipeline prefix).
# NOTE(review): this cell has no execution count and these values do not match
# any search result shown above — confirm where they come from and how they
# are meant to be used.
params = {
    # Boosting schedule and tree shape
    'n_estimators': 200,
    'max_depth': 3,
    'learning_rate': 0.01,
    'min_child_weight': 9,
    # Row / column subsampling
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 0.9,
    'colsample_bynode': 0.7,
    # Regularization
    'gamma': 0.4,
    'reg_lambda': 2,
    'reg_alpha': 2,
}