In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from xgboost import XGBRegressor

# Load dataset
df = pd.read_csv(r"C:\Users\chaha\OneDrive\Desktop\Unified Mentor PROJECT\vehicle_price_prediction\Vehicle Price Prediction\dataset.csv")
df.head()
#  Drop rows where target is missing
df = df.dropna(subset=["price"])

#  Feature Engineering
df["car_age"] = 2025 - df["year"]
df["log_mileage"] = np.log1p(df["mileage"])
df["log_price"] = np.log1p(df["price"])

#  Target and Features
X = df.drop(columns=["price", "log_price"])
y = df["log_price"]

#  Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Column types
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

#  Pipelines
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))  
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

#  XGBoost Regressor with Hyperparameter Tuning
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

param_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__max_depth": [3, 5],
    "regressor__learning_rate": [0.05, 0.1],
}

model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", xgb)
])

grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

#  Predict and Evaluate
y_pred = grid_search.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f" RMSE: ${rmse:,.2f}")
print(f" MAE:  ${mae:,.2f}")
print(f" R² Score: {r2:.4f}")
print(f" Best Params: {grid_search.best_params_}")


 RMSE: $0.14
 MAE:  $0.09
 R² Score: 0.8276
 Best Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}


In [5]:
# Load dataset
df = pd.read_csv(r"C:\Users\chaha\OneDrive\Desktop\Unified Mentor PROJECT\vehicle_price_prediction\Vehicle Price Prediction\dataset.csv")
df.head()

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive
