# Predictive Modeling for Used Car Prices

Utilizes pandas for data handling, sklearn for preprocessing (including one-hot encoding and scaling with **StandardScaler**), and various models including **HistGradientBoostingRegressor**, **RandomForestRegressor**, **Ridge**, and **StackingRegressor** for predictive modeling.

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np
import zipfile

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Keep the 'id' column
test_ids = test_data['id']

# Convert categorical variables using one-hot encoding
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align the train and test data 
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# Remove the 'price' column from the test data
if 'price' in test_data.columns:
    test_data = test_data.drop(['price'], axis=1)

# Scale numerical features
scaler = StandardScaler()
numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns.drop('price')
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

X = train_data.drop(['price'], axis=1)
y = train_data['price']

assert test_data.shape[0] == 36183, f"Expected 36183 rows in test data, but got {test_data.shape[0]}"


  from pandas.core import (


In [2]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameters for Randomized Search
param_distributions = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [10, 20, 50]
}

# Gradient Boosting model
gb_model = HistGradientBoostingRegressor(random_state=42)

# Randomized Search
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_distributions, n_iter=20, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error', random_state=42)

random_search.fit(X_train, y_train)

# best parameters and best score
best_params = random_search.best_params_
best_score = np.sqrt(-random_search.best_score_)
print(f'Best Parameters (Randomized Search): {best_params}')
print(f'Best RMSE (Randomized Search): {best_score}')


Fitting 3 folds for each of 20 candidates, totalling 60 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=10; total time= 1.5min
[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=20; total time= 1.5min
[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=10; total time= 1.7min
[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=20; total time= 1.7min
[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=10; total time= 1.8min
[CV] END learning_rate=0.01, max_depth=3, max_iter=100, min_samples_leaf=10; total time= 2.1min
[CV] END learning_rate=0.01, max_depth=3, max_iter=100, min_samples_leaf=10; total time= 2.1min
[CV] END learning_rate=0.01, max_depth=3, max_iter=100, min_samples_leaf=10; total time= 2.1min
[CV] END learning_rate=0.05, max_depth=3, max_iter=200, min_samples_leaf=20; total time= 2.2min
[CV] END learning_rate=0.05, max_depth=3, max_iter=100, min_samples_leaf=20; total time= 1.1min
[CV] END learning_rate=0.05, max_depth=3

In [3]:
# Train the Gradient Boosting model with the best parameters
best_gb_model = HistGradientBoostingRegressor(**best_params, random_state=42)
best_gb_model.fit(X_train, y_train)

y_pred_best_gb = best_gb_model.predict(X_val)

# Calculate RMSE 
rmse_best_gb = np.sqrt(mean_squared_error(y_val, y_pred_best_gb))
print(f'RMSE (Best Gradient Boosting): {rmse_best_gb}')


RMSE (Best Gradient Boosting): 48176.967086731645


In [4]:
# Define base models
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', HistGradientBoostingRegressor(**best_params, random_state=42))
]

# Define stacking model
stacking_model = StackingRegressor(estimators=base_models, final_estimator=Ridge())

stacking_model.fit(X_train, y_train)

y_pred_stacking = stacking_model.predict(X_val)


rmse_stacking = np.sqrt(mean_squared_error(y_val, y_pred_stacking))
print(f'RMSE (Stacking Model): {rmse_stacking}')


RMSE (Stacking Model): 48085.15815151347


In [5]:
from sklearn.model_selection import cross_val_score

# Evaluate model using cross-validation
cv_scores = cross_val_score(stacking_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)
print(f'Cross-Validation RMSE Scores: {cv_rmse_scores}')
print(f'Mean CV RMSE: {cv_rmse_scores.mean()}')


Cross-Validation RMSE Scores: [73125.98958589 72561.12099315 50876.61356529 80265.28079763
 62340.30210191]
Mean CV RMSE: 67833.8614087743


In [6]:
# Make predictions on the test set using stacking model
test_predictions = stacking_model.predict(test_data)

submission = pd.DataFrame({
    'id': test_ids,
    'price': test_predictions
})

assert submission.shape[0] == 36183, f"Expected 36183 rows in submission, but got {submission.shape[0]}"

submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Submission file saved as submission.csv
Submission file zipped as submission.zip
