In [1]:
import numpy as np
import pandas as pd
import optuna

from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from kagglex_cohort4 import *

from xgboost import XGBRegressor

In [2]:
# Location of the training data (a local CSV despite the variable name).
url = "train.csv"
# Parse with the pyarrow engine and keep pyarrow-backed dtypes.
raw = pd.read_csv(
    url,
    dtype_backend="pyarrow",
    engine="pyarrow",
)
# Project cleaning helper (pulled in by the star import at the top of the notebook).
cars = clean_housing(raw)

In [3]:
# Name of the column the model is trained to predict.
target = "price"
# Every other column is a feature. Compare names with `!=`: because `target` is
# a string, `col not in target` would be a *substring* test and would silently
# drop any column whose name happens to be contained in "price" (e.g. "ice").
# NOTE(review): the row identifier `id` remains in the feature set — confirm
# that is intended.
features = [col for col in cars.columns if col != target]

X = cars[features]
y = cars[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (54273, 12)
y shape: (54273,)


In [4]:
# Hold out a validation subset; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=43)

# Report the size of each piece, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"y_val shape: {y_val.shape}",
    sep="\n",
)

X_train shape: (40704, 12)
y_train shape: (40704,)
X_val shape: (13569, 12)
y_val shape: (13569,)


In [5]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 11257 to 14148
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   id            40704 non-null  uint16[pyarrow]
 1   brand         40704 non-null  category       
 2   model         40704 non-null  category       
 3   model_year    40704 non-null  uint16[pyarrow]
 4   milage        40704 non-null  uint32[pyarrow]
 5   fuel_type     40704 non-null  category       
 6   engine        40704 non-null  category       
 7   transmission  40704 non-null  category       
 8   ext_col       40704 non-null  category       
 9   int_col       40704 non-null  category       
 10  accident      40704 non-null  category       
 11  clean_title   40704 non-null  category       
dtypes: category(9), uint16[pyarrow](2), uint32[pyarrow](1)
memory usage: 1.3 MB


In [6]:
# Group the feature columns by dtype so each group can get its own transformer
cat = X.select_dtypes(include=['category']).columns.tolist()
num = X.select_dtypes(include=['number']).columns.tolist()

In [7]:
# Build the column transformer: categorical columns are one-hot encoded,
# numerical columns are standardised.
# NOTE(review): `OneHotEncoder` is imported from category_encoders at the top of
# the notebook (unless the later star import rebinds it), while 'ignore' is
# scikit-learn's spelling for `handle_unknown` — confirm the encoder actually in
# use treats unseen categories the way you expect.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat),
    ('num', StandardScaler(), num),
])

In [8]:
# Define the objective function
def objective(trial):
    """Train one XGBoost candidate and score it on the validation split.

    Optuna calls this once per trial. Hyperparameters are sampled from the
    search space below, a preprocessing + ``XGBRegressor`` pipeline is fit on
    the notebook-level ``X_train`` / ``y_train``, and the fitted pipeline is
    scored on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handle used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        # Use GPU acceleration.
        # NOTE(review): XGBoost >= 2.0 deprecates 'gpu_hist' in favour of
        # tree_method='hist' with device='cuda' — confirm against the installed version.
        'tree_method': 'gpu_hist',
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bynode': trial.suggest_categorical('colsample_bynode', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'gamma': trial.suggest_categorical('gamma', [0, 0.1, 0.2, 0.3, 0.4, 0.5]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 1.5, 2]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 0.1, 0.5, 1, 1.5, 2]),
        # NOTE(review): the categorical columns are one-hot encoded by the
        # preprocessor before they reach XGBoost, so this flag probably has no
        # effect in this pipeline — confirm.
        'enable_categorical': True
    }

    # Define the pipeline with XGBRegressor
    xgb_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**param))
    ])

    # Fit the model (this also refits the shared `preprocessor` on X_train)
    xgb_pipe.fit(X_train, y_train)

    # Score on the held-out validation split
    preds = xgb_pipe.predict(X_val)
    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`: that keyword is deprecated in newer scikit-learn
    # releases and removed afterwards, whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))

    return rmse


In [9]:
# Run optimization
# Seed the sampler so that a Restart & Run All proposes the same sequence of
# trials (given deterministic objective values). TPESampler is what Optuna uses
# by default for a single-objective study, so passing it explicitly only adds
# the seed.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=43),
)
study.optimize(objective, n_trials=100)

# Print best parameters
print(f"Best parameters found: {study.best_params}")
print(f"Best RMSE: {study.best_value}")

[I 2024-06-12 21:15:18,442] A new study created in memory with name: no-name-c2f6fd00-721d-4433-958c-7c4532c7a8ef
[I 2024-06-12 21:16:03,731] Trial 0 finished with value: 79378.31444545627 and parameters: {'n_estimators': 450, 'max_depth': 8, 'learning_rate': 0.05, 'min_child_weight': 4, 'subsample': 0.9, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.6, 'colsample_bynode': 0.9, 'gamma': 0, 'reg_lambda': 2, 'reg_alpha': 0.1}. Best is trial 0 with value: 79378.31444545627.
[I 2024-06-12 21:16:45,996] Trial 1 finished with value: 100436.79071119521 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.1, 'min_child_weight': 2, 'subsample': 0.6, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.6, 'colsample_bynode': 0.6, 'gamma': 0.5, 'reg_lambda': 0.5, 'reg_alpha': 1.5}. Best is trial 0 with value: 79378.31444545627.
[I 2024-06-12 21:17:20,515] Trial 2 finished with value: 113399.67887586009 and parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.2, '

Best parameters found: {'n_estimators': 300, 'max_depth': 2, 'learning_rate': 0.01, 'min_child_weight': 10, 'subsample': 0.9, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.8, 'colsample_bynode': 1.0, 'gamma': 0.2, 'reg_lambda': 1, 'reg_alpha': 0}
Best RMSE: 57254.284205333126


In [10]:
# Combine training and validation sets
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

# Define the best parameters
# The tuned values come straight from the study rather than a hand-copied dict,
# so they cannot drift from the search result. The fixed settings are the same
# ones `objective` uses, so the final model is configured exactly like the
# winning trial (no extra legacy `predictor` override).
best_params = {
    **study.best_params,
    'verbosity': 0,
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',  # Use GPU acceleration
    # NOTE(review): the preprocessor one-hot encodes the categorical columns
    # before XGBoost sees them, so this flag probably has no effect — confirm.
    'enable_categorical': True
}

# Re-create the pipeline with the best parameters
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_params))
])

# Fit the model on the combined training + validation data
xgb_pipe.fit(X_train_full, y_train_full)


In [11]:
# Load the test set
test_url = "test.csv"
test_raw = pd.read_csv(
    test_url,
    dtype_backend="pyarrow",
    engine="pyarrow",
)
# Run the test data through the same cleaning helper as the training data
test_cars = clean_housing(test_raw)

# Select the same feature columns, in the same order, as the training matrix
X_test = test_cars[features]

# Make predictions with the fitted pipeline
test_preds = xgb_pipe.predict(X_test)


In [13]:
# Prepare the submission file
submission_example = pd.read_csv('sample_submission.csv')
# Name the prediction column after the sample file's own non-id column so the
# header always matches the expected submission format, instead of hard-coding
# a name that may not match (the model predicts `target`, i.e. "price").
# Falls back to `target` if the sample file has no second column.
pred_col = next((col for col in submission_example.columns if col != 'id'), target)
# NOTE(review): ids are taken from the sample file and paired positionally with
# the predictions — this assumes the sample rows are in the same order as the
# test set; confirm.
submission = pd.DataFrame({'id': submission_example['id'], pred_col: test_preds})
submission.to_csv('submission.csv', index=False)