In [1]:
import numpy as np
import pandas as pd
import optuna

from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from cohort4_helper import *

from xgboost import XGBRegressor

In [2]:
# Read the training CSV with the pyarrow parser and pyarrow-backed dtypes,
# then hand the raw frame to prep_data (presumably provided by the
# cohort4_helper star import -- confirm) to obtain the modelling frame.
url = "train.csv"
raw = pd.read_csv(
    url,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
cars = prep_data(raw)

In [3]:
# Name of the column to predict; every other column of `cars` is a feature.
target = "price"
# Compare with `!=` rather than `not in`: `target` is a string, so
# `col not in target` is a substring test and would also silently drop any
# column whose name happens to be a substring of "price".
features = [col for col in cars.columns if col != target]

X = cars[features]
y = cars[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (54273, 12)
y shape: (54273,)


In [4]:
# Hold out a validation set using train_test_split's default split size;
# the fixed random_state keeps the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=43,
)

# Report the shape of each partition, in the order train then validation.
for label, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(label, part.shape)

X_train shape: (40704, 12)
y_train shape: (40704,)
X_val shape: (13569, 12)
y_val shape: (13569,)


In [5]:
# Show per-column non-null counts and dtypes of the training features; the
# dtypes matter because later cells pick transformer inputs by dtype.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 11257 to 14148
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   brand         40704 non-null  category       
 1   model         40704 non-null  category       
 2   milage        40704 non-null  uint32[pyarrow]
 3   fuel_type     40704 non-null  category       
 4   transmission  40704 non-null  string[pyarrow]
 5   ext_col       40704 non-null  category       
 6   int_col       40704 non-null  category       
 7   accident      40704 non-null  category       
 8   clean_title   40704 non-null  category       
 9   age           40704 non-null  int64[pyarrow] 
 10  horsepower    40704 non-null  float64        
 11  cylinders     40704 non-null  float64        
dtypes: category(7), float64(2), int64[pyarrow](1), string[pyarrow](1), uint32[pyarrow](1)
memory usage: 2.3 MB


In [6]:
# Group the feature columns by dtype: `cat` holds the pandas `category`
# columns, `num` holds the numeric ones.
# NOTE(review): X_train.info() lists `transmission` as string[pyarrow], so it
# appears to match neither selector and would not reach any transformer --
# confirm whether leaving it out of the model is intended.
cat = X.select_dtypes(include=['category']).columns.tolist()
num = X.select_dtypes(include=['number']).columns.tolist()

In [7]:
# Create the column transformer: one-hot encode the `cat` columns and
# standardise the `num` columns.
# OneHotEncoder is imported from category_encoders in this notebook, whose
# documented handle_unknown options are 'error', 'return_nan', 'value' and
# 'indicator'. 'ignore' is the scikit-learn spelling and is not one of them;
# 'value' is the category_encoders option that encodes an unseen category as
# zeros in every dummy column, which is what 'ignore' was meant to achieve.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='value'), cat),
        ('num', StandardScaler(), num)
    ]
)

In [8]:
# Define the objective function
def objective(trial):
    """Fit one XGBoost pipeline for an Optuna trial and return its validation RMSE.

    Hyperparameters are sampled from `trial`; the pipeline (shared
    `preprocessor` + XGBRegressor) is fitted on the notebook-level
    `X_train` / `y_train` and scored on `X_val` / `y_val`.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the `suggest_int` / `suggest_categorical` samples.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    param = {
        # Fixed settings (not tuned).
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        # NOTE(review): 'gpu_hist' is deprecated in XGBoost 2.x in favour of
        # tree_method='hist' with device='cuda' -- confirm the installed
        # version before switching.
        'tree_method': 'gpu_hist',  # Use GPU acceleration
        # Tuned settings.
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'colsample_bynode': trial.suggest_categorical('colsample_bynode', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'gamma': trial.suggest_categorical('gamma', [0, 0.1, 0.2, 0.3, 0.4, 0.5]),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 1.5, 2]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 0.1, 0.5, 1, 1.5, 2]),
        'enable_categorical': True  # Enable categorical handling
    }

    # Define the pipeline with XGBRegressor
    xgb_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**param))
    ])

    # Fit the model
    xgb_pipe.fit(X_train, y_train)

    # Make predictions
    preds = xgb_pipe.predict(X_val)

    # Take the square root of the MSE explicitly instead of passing
    # squared=False: that keyword is deprecated and removed in newer
    # scikit-learn releases, whereas this form works on any version.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))

    return rmse


In [9]:
# Run optimization
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across kernel restarts. TPESampler is the sampler create_study uses by
# default for a single-objective study, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=43)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=100)

# Print best parameters
print(f"Best parameters found: {study.best_params}")
print(f"Best RMSE: {study.best_value}")

[I 2024-06-13 11:24:50,426] A new study created in memory with name: no-name-4dff165a-57e9-4cf2-b4e5-c83f3e714707
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
[I 2024-06-13 11:25:09,026] Trial 0 finished with value: 60521.16878664955 and parameters: {'n_estimators': 250, 'max_depth': 8, 'learning_rate': 0.1, 'min_child_weight': 8, 'subsample': 0.9, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.8, 'colsample_bynode': 0.7, 'gamma': 0.4, 'reg_lambda': 0.1, '

Best parameters found: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.01, 'min_child_weight': 10, 'subsample': 0.9, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.6, 'colsample_bynode': 0.7, 'gamma': 0.1, 'reg_lambda': 1.5, 'reg_alpha': 0.5}
Best RMSE: 56713.08975732988


In [12]:
# Combine training and validation sets so the final model sees every labelled row
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

# Define the best parameters
best_params = {
    # Tuned values, copied from the "Best parameters found" line printed by
    # the Optuna study.
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.01,
    'min_child_weight': 10,
    'subsample': 0.9,
    'colsample_bytree': 0.6,
    'colsample_bylevel': 0.6,
    'colsample_bynode': 0.7,
    'gamma': 0.1,
    'reg_lambda': 1.5,
    'reg_alpha': 0.5,
    # Fixed settings, matching the ones used inside the tuning objective.
    'verbosity': 0,
    'objective': 'reg:squarederror',
    # GPU histogram tree method (requires a GPU-enabled XGBoost build).
    # The former 'predictor': 'gpu_predictor' entry is dropped: it is
    # redundant with 'gpu_hist' and superseded by `device` in XGBoost 2.x.
    'tree_method': 'gpu_hist',
    'enable_categorical': True  # Ensure this is set if you have categorical features
}


# Re-create the pipeline with the best parameters
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_params))
])

# Fit the model on the entire training set
xgb_pipe.fit(X_train_full, y_train_full)


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [19]:
# Read the test CSV with the same reader options as the training file and
# run it through the same prep_data step.
test_url = "test.csv"
test_raw = pd.read_csv(
    test_url,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
test_cars = prep_data(test_raw)

# Keep exactly the feature columns the model was trained on, in the same order.
X_test = test_cars.loc[:, features]

# Score the test rows with the fitted pipeline.
test_preds = xgb_pipe.predict(X_test)


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [20]:
# Number of test predictions; this should equal the number of ids in the
# sample submission file used when building the submission.
len(test_preds)

36183

In [21]:
# Build the submission by pairing the ids from the sample submission file
# with the test predictions (positionally), then write it without the index.
submission_example = pd.read_csv('sample_submission.csv')
submission = submission_example[['id']].assign(target=test_preds)
submission.to_csv('submission.csv', index=False)

In [16]:
# Check lengths: the ids and the predictions are paired by position, so the
# submission is only written when both have the same number of rows.
n_ids = len(submission_example['id'])
n_preds = len(test_preds)
print(f"Length of submission_example['id']: {n_ids}")
print(f"Length of test_preds: {n_preds}")

# If the lengths match, create the DataFrame
if n_ids == n_preds:
    submission = pd.DataFrame({'id': submission_example['id'], 'target': test_preds})
    submission.to_csv('submission.csv', index=False)
else:
    # A mismatch means test_preds was not produced from the test features
    # that correspond to the submission ids; re-run the prediction cell on
    # the test set before writing the file.
    print("The lengths of 'id' and 'test_preds' do not match. Please check your predictions.")


Length of submission_example['id']: 36183
Length of test_preds: 54273
The lengths of 'id' and 'test_preds' do not match. Please check your predictions.
