In [1]:
import os
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
df = pd.read_csv("vehicles.csv")  # Loading full dataset
# df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)  # Using 50% of the data

In [3]:
 cols_to_drop = [
                "county", "size", "state", "region", "posting_date", 
                "paint_color", "drive",
            ]
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
df.dropna(subset=["year", "odometer", "fuel", "model"], inplace=True)

In [4]:
current_year = 2025
df["car_age"] = current_year - df["year"]
df.drop(columns=["year"], inplace=True)

In [5]:
import re
def extract_base_model(name: str) -> str:
    """
    Cleans a raw vehicle model name and returns the base model only.
    No encoding is done here — just cleaning for consistency before encoding.
    """
    if pd.isnull(name) or name.strip() == "":
        return "unknown"

    name = name.lower()
    name = re.sub(r'[^a-z0-9 ]', '', name)  # removing special characters
    name = re.sub(
        r'\b(crew|cab|pickup|sedan|coupe|van|wagon|truck|convertible|utility|hatchback|2d|4d|4x4|fx4|awd|fwd|rwd|sr|ex|lx|le|lt|xlt|sel|slt|premium|limited|base|plus|l|gls|xle|se|xl|sport|touring|super|luxury|classic|series|class)\b',
        '', name
    )
    name = re.sub(r'\s+', ' ', name).strip()
    
    # returning just the first word (base model)
    return name.split()[0] if name else "unknown"
df["model"] = df["model"].apply(extract_base_model)

In [6]:
# Step 1: Converting 'cylinders' to numeric if it's a string like "4 cylinders"
if df['cylinders'].dtype == 'object':
    df['cylinders'] = df['cylinders'].str.extract('(\d+)')
    df['cylinders'] = pd.to_numeric(df['cylinders'], errors='coerce')

# Step 2: Dropping rows with missing values in relevant numeric columns
df.dropna(subset=['price', 'odometer', 'car_age'], inplace=True)

# Step 3: Defining numeric columns for outlier removal
num_cols = ['price', 'odometer', 'car_age']

# Step 4: IQR-based outlier removal function
def remove_outliers_iqr(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Step 5: Applying outlier removal
df = remove_outliers_iqr(df, num_cols)

In [7]:
cat_cols = ["manufacturer", "fuel", "title_status", "model", "condition", "cylinders", "type", "transmission"]
for col in cat_cols:
    df[col] = df[col].astype(str)

In [8]:
target_column = "price"
df['price'] = np.log1p(df['price'])
X = df.drop(columns=[target_column])
y = df[target_column]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
def get_data_transformer_object():
        try:
            numerical_features = ["odometer", "car_age"]
            all_categorical_features = [
                "manufacturer", "fuel", "title_status", "model", "type", 'cylinders', 'condition'
            ]
            mode_fill_columns = ["transmission"]

            num_pipeline = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler())
            ])

            cat_pipeline_unknown = Pipeline([
                ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
                ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
                ("scaler", StandardScaler(with_mean=False))
            ])

            cat_pipeline_mode = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
                ("scaler", StandardScaler(with_mean=False))
            ])

            general_cats = [col for col in all_categorical_features if col not in mode_fill_columns]

            preprocessor = ColumnTransformer([
                ("num_pipeline", num_pipeline, numerical_features),
                ("cat_pipeline_unknown", cat_pipeline_unknown, general_cats),
                ("cat_pipeline_mode", cat_pipeline_mode, mode_fill_columns)
            ])

            return preprocessor

        except Exception as e:
            raise Exception(f"Transformer pipeline setup failed: {e}")
preprocessor = get_data_transformer_object()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [11]:
!pip install xgboost catboost scikit-learn

Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading xgboost-3.0.1-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.8-cp310-cp310-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m175.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, xgboost, catboost
Successfully installed catboost-1.2.8 graphviz-0.20.3 xgboost-3.0.1


In [12]:
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
import joblib

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [13]:
import time
from tqdm import tqdm
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Models listed from light to heavy
models = {
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor(verbosity=0),
    "CatBoost Regressor": CatBoostRegressor(verbose=0),
}

# Evaluation function that reverses log1p transformation
def evaluate_models(X_train, y_train, X_test, y_test, models):
    report = {}
    best_model = None
    best_score = -np.inf

    print("🔁 Starting model training...\n")
    
    for name in tqdm(models, desc="⏳ Progress"):
        model = models[name]
        print(f"\n🚀 Training: {name}")
        start = time.time()

        model.fit(X_train, y_train)
        y_pred_log = model.predict(X_test)

        end = time.time()
        duration = end - start
        print(f"✅ Finished: {name} in {duration:.2f} seconds")

        # Unding the log1p transformation for evaluation
        y_pred_actual = np.expm1(y_pred_log)
        y_test_actual = np.expm1(y_test)

        r2 = r2_score(y_test_actual, y_pred_actual)
        report[name] = {
            "model": model,
            "R2": r2,
            "MAE": mean_absolute_error(y_test_actual, y_pred_actual),
            "MSE": mean_squared_error(y_test_actual, y_pred_actual),
            "RMSE": np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
        }

        if r2 > best_score:
            best_score = r2
            best_model = model
            best_model_name = name

    return best_model, best_model_name, report

# Calling the function
best_model, best_model_name, model_report = evaluate_models(X_train, y_train, X_test, y_test, models)

# Displaying results
print(f"\n✅ Best Model: {best_model_name}")
print("📊 Evaluation Metrics:")
for metric, value in model_report[best_model_name].items():
    if metric != "model":
        print(f"{metric}: {value:.4f}")


🔁 Starting model training...



⏳ Progress:   0%|          | 0/6 [00:00<?, ?it/s]


🚀 Training: Ridge Regression


⏳ Progress:  17%|█▋        | 1/6 [00:17<01:29, 17.92s/it]

✅ Finished: Ridge Regression in 17.91 seconds

🚀 Training: Decision Tree


⏳ Progress:  33%|███▎      | 2/6 [02:16<05:08, 77.15s/it]

✅ Finished: Decision Tree in 118.61 seconds

🚀 Training: AdaBoost Regressor


⏳ Progress:  50%|█████     | 3/6 [11:32<14:47, 295.68s/it]

✅ Finished: AdaBoost Regressor in 555.72 seconds

🚀 Training: Gradient Boosting


⏳ Progress:  67%|██████▋   | 4/6 [1:06:54<49:41, 1490.59s/it]

✅ Finished: Gradient Boosting in 3322.37 seconds

🚀 Training: XGBRegressor


⏳ Progress:  83%|████████▎ | 5/6 [1:07:27<16:05, 965.07s/it] 

✅ Finished: XGBRegressor in 33.28 seconds

🚀 Training: CatBoost Regressor


⏳ Progress: 100%|██████████| 6/6 [1:14:47<00:00, 747.98s/it]

✅ Finished: CatBoost Regressor in 439.93 seconds

✅ Best Model: Decision Tree
📊 Evaluation Metrics:
R2: 0.7124
MAE: 2766.8074
MSE: 49856813.9584
RMSE: 7060.9358





In [14]:
import joblib
import shutil

# Save and move the model
joblib.dump(best_model, "model.pkl")
shutil.move("model.pkl", "/home/ec2-user/SageMaker/model.pkl")
print("✅ Model exported. Look in sidebar to download.")

✅ Model exported. Look in sidebar to download.
