In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from kagglex_cohort4 import *

from xgboost import XGBRegressor

In [2]:
# Read the training CSV with the Arrow-backed parser and dtypes, then pass the
# raw frame through clean_housing (not defined in this notebook; presumably
# provided by the kagglex_cohort4 star import).
url = "train.csv"
raw = pd.read_csv(
    url,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
cars = raw.pipe(clean_housing)

In [3]:
target = "price"
# Compare column names for equality. `col not in target` was a substring test
# against the string "price", so it would also have dropped any column whose
# name happens to be a substring of it (for example "ice").
# NOTE(review): the `id` column is kept as a feature; it looks like a row
# identifier, so confirm that feeding it to the model is intended.
features = [col for col in cars.columns if col != target]

X = cars[features]
y = cars[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (54273, 12)
y shape: (54273,)


In [4]:
# Hold out a validation set. No test_size is given, so train_test_split falls
# back to its default hold-out fraction; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=43,
)

# Report the four shapes with a single print call, one entry per line.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"y_val shape: {y_val.shape}",
    sep="\n",
)

X_train shape: (40704, 12)
y_train shape: (40704,)
X_val shape: (13569, 12)
y_val shape: (13569,)


In [5]:
# Summarise the training features: per-column non-null counts, dtypes and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 11257 to 14148
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   id            40704 non-null  uint16[pyarrow]
 1   brand         40704 non-null  category       
 2   model         40704 non-null  category       
 3   model_year    40704 non-null  uint16[pyarrow]
 4   milage        40704 non-null  uint32[pyarrow]
 5   fuel_type     40704 non-null  category       
 6   engine        40704 non-null  category       
 7   transmission  40704 non-null  category       
 8   ext_col       40704 non-null  category       
 9   int_col       40704 non-null  category       
 10  accident      40704 non-null  category       
 11  clean_title   40704 non-null  category       
dtypes: category(9), uint16[pyarrow](2), uint32[pyarrow](1)
memory usage: 1.3 MB


In [6]:
# Split the feature columns by dtype: pandas `category` columns go to the
# categorical list, numeric columns to the numerical list.
cat = X.select_dtypes(include=['category']).columns.tolist()
num = X.select_dtypes(include=['number']).columns.tolist()

In [7]:
# Create the column transformer: one-hot encode the categorical columns and
# standardise the numeric ones.
# The OneHotEncoder in scope is the category_encoders one (see the imports),
# whose documented `handle_unknown` options are 'error', 'return_nan', 'value'
# and 'indicator'. scikit-learn's 'ignore' is not among them; 'value' is the
# documented equivalent (a category unseen during fit is encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='value'), cat),
        ('num', StandardScaler(), num)
    ]
)

In [8]:
# Define the pipeline with XGBRegressor.
# GPU training is requested with `device='cuda'` plus `tree_method='hist'`.
# XGBoost 2.0 deprecated `tree_method='gpu_hist'` and the `predictor`
# parameter in favour of the single `device` parameter.
# NOTE(review): requires XGBoost >= 2.0 and a CUDA-capable GPU; confirm the
# target environment.
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        objective='reg:squarederror',
        random_state=43,
        tree_method='hist',
        device='cuda',
    ))
])

In [9]:
# Step 1: Define the Hyperparameter Universe
# The space is written with bare XGBRegressor parameter names; the
# comprehension adds the 'regressor__' prefix that addresses the pipeline step
# named 'regressor'. Every parameter has three candidate values.
param_grid = {
    f'regressor__{name}': candidates
    for name, candidates in {
        'learning_rate': [0.01, 0.1, 0.2],    # step size applied at each boosting iteration
        'n_estimators': [100, 200, 300],      # number of boosting rounds
        'max_depth': [3, 5, 7],               # maximum depth of a tree
        'min_child_weight': [1, 3, 5],        # minimum sum of instance weight in a child
        'subsample': [0.6, 0.8, 1.0],         # fraction of training rows sampled per tree
        'colsample_bytree': [0.6, 0.8, 1.0],  # fraction of features sampled per tree
        'gamma': [0, 0.1, 0.2],               # minimum loss reduction for a further partition
        'reg_lambda': [1, 1.5, 2],            # L2 regularization term on weights
        'reg_alpha': [0, 0.1, 0.5],           # L1 regularization term on weights
    }.items()
}

In [10]:
# Step 2: Choose the Search Strategy
def hyperparameter_search(X_train, y_train, search_strategy='grid', n_iter=10,
                          estimator=None, param_space=None, cv=3, n_jobs=-1):
    """Tune an estimator with a cross-validated grid or randomized search.

    Parameters
    ----------
    X_train, y_train
        Training features and target handed to the search's ``fit``.
    search_strategy : {'grid', 'random'}, default 'grid'
        'grid' runs ``GridSearchCV`` over every combination in the search
        space; 'random' runs ``RandomizedSearchCV`` over ``n_iter`` sampled
        combinations (sampling seeded with ``random_state=42``).
    n_iter : int, default 10
        Number of sampled candidates; only used for the 'random' strategy.
    estimator : estimator, optional
        Estimator to tune. Defaults to the notebook-level ``xgb_pipe``.
    param_space : dict, optional
        Search space. Defaults to the notebook-level ``param_grid``.
    cv : int, default 3
        Cross-validation setting forwarded to the search.
    n_jobs : int, default -1
        Parallelism forwarded to the search.
        NOTE(review): the notebook's ``xgb_pipe`` requests GPU training, and
        running many fits in parallel against one GPU may contend for it.
        Confirm the default is appropriate.

    Returns
    -------
    The search's ``best_estimator_``.

    Raises
    ------
    ValueError
        If ``search_strategy`` is neither 'grid' nor 'random'.
    """
    # Validate first so a typo fails before any search object is built.
    if search_strategy not in ('grid', 'random'):
        raise ValueError("search_strategy must be either 'grid' or 'random'")

    # Fall back to the notebook-level objects the original version was hard-wired to.
    if estimator is None:
        estimator = xgb_pipe
    if param_space is None:
        param_space = param_grid

    # Settings shared by both strategies.
    shared = dict(
        estimator=estimator,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=1,
        n_jobs=n_jobs,
    )

    # Step 2: Choose the Search Strategy
    if search_strategy == 'grid':
        search = GridSearchCV(param_grid=param_space, **shared)
    else:
        search = RandomizedSearchCV(
            param_distributions=param_space,
            n_iter=n_iter,
            random_state=42,
            **shared,
        )

    # Step 3: Execute the Search
    search.fit(X_train, y_train)

    # Step 4: Evaluate and Iterate
    print(f"Best parameters found: {search.best_params_}")
    # best_score_ holds the negative RMSE (scoring is 'neg_root_mean_squared_error');
    # it is negated here, so the printed number is the plain, positive RMSE.
    print(f"Best cross-validated RMSE: {-search.best_score_}")

    return search.best_estimator_

In [None]:
# Exhaustive hyperparameter search with GridSearchCV.
# NOTE: the search space has 3**9 = 19,683 combinations, so with 3-fold CV this
# launches 59,049 fits (see the log line below). Expect a very long run.
best_model_grid = hyperparameter_search(
    X_train,
    y_train,
    search_strategy='grid',
)

Fitting 3 folds for each of 19683 candidates, totalling 59049 fits


In [None]:
# Randomized hyperparameter search with RandomizedSearchCV: evaluate 20 sampled
# combinations from the same search space instead of enumerating all of them.
best_model_random = hyperparameter_search(
    X_train,
    y_train,
    search_strategy='random',
    n_iter=20,
)

In [None]:
# Evaluate the best models found on the held-out validation split
y_pred_grid = best_model_grid.predict(X_val)
y_pred_random = best_model_random.predict(X_val)

rmse_grid = np.sqrt(mean_squared_error(y_val, y_pred_grid))
rmse_random = np.sqrt(mean_squared_error(y_val, y_pred_random))

# These scores are computed on X_val / y_val, so they are labelled as
# validation RMSE (the messages previously said "Test RMSE").
print(f"Validation RMSE (GridSearchCV): {rmse_grid}")
print(f"Validation RMSE (RandomizedSearchCV): {rmse_random}")

In [None]:
# Load the competition test set.
# NOTE(review): the training data was read with the pyarrow engine/dtypes and
# passed through clean_housing, while this frame is used as read. Confirm the
# fitted pipeline accepts it in this form.
X_test = pd.read_csv("test.csv")  # REMOVERHS
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
X_test.info()
X_test.head()

In [None]:
# `xgb_pipe` itself is never fitted: the searches fit clones of it and hand
# back the fitted winner as best_estimator_. Predict with whichever tuned
# model scored the lower RMSE on the validation split.
final_model = best_model_grid if rmse_grid <= rmse_random else best_model_random
y_test_pred = pd.Series(final_model.predict(X_test))  # REMOVERHS
y_test_pred.head()

In [None]:
# `xgb_model` and `feature_names` are not defined anywhere in this notebook, so
# both are taken from a fitted pipeline returned by the searches: the
# importances from its 'regressor' step and the matching column names from its
# 'preprocessor' step.
# NOTE(review): get_feature_names_out() needs every transformer inside the
# ColumnTransformer to implement it. Confirm the installed category_encoders
# version does.
importance_model = best_model_grid if rmse_grid <= rmse_random else best_model_random
feat_imp = (pd
            .Series(
                importance_model.named_steps['regressor'].feature_importances_,
                index=importance_model.named_steps['preprocessor'].get_feature_names_out(),
            )
            .sort_values(key=abs, ascending=False)
           )
feat_imp

In [None]:
# Prepare the submission file
submission_example = pd.read_csv('sample_submission.csv')

# Fail loudly on a row-count mismatch. The DataFrame constructor aligns Series
# on their index, so a mismatch would otherwise be written out as silent NaNs.
if len(submission_example) != len(y_test_pred):
    raise ValueError(
        f"sample_submission.csv has {len(submission_example)} rows but "
        f"{len(y_test_pred)} predictions were produced"
    )

# Take the prediction column's header from the sample file rather than
# hard-coding it, falling back to the previous 'target' name if the sample has
# no column besides 'id'.
# NOTE(review): the modelled target is "price", so the sample file presumably
# uses that header. Verify against the competition's sample_submission.csv.
prediction_col = next(
    (col for col in submission_example.columns if col != 'id'),
    'target',
)

# Combine positionally (plain arrays) so no index alignment takes place.
submission = pd.DataFrame({
    'id': submission_example['id'].to_numpy(),
    prediction_col: y_test_pred.to_numpy(),
})
submission.to_csv('submission.csv', index=False)