In [2]:
import pandas as pd
import numpy as np
import re

Data from https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho?resource=download

In [None]:
# Loading
dataset = pd.read_csv("datasets/carData/Car details v3.csv")
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [None]:
# Cleaning
dataset.drop(columns="name", inplace=True)
dataset = dataset.loc[dataset["torque"].notna()]
dataset.loc[dataset["mileage"]==0, "mileage"] = np.nan
dataset["mileage"] = pd.to_numeric(dataset["mileage"].apply(lambda x : x.split(" ")[0]))
dataset["engine"] = pd.to_numeric(dataset["engine"].apply(lambda x : x.split(" ")[0]))
dataset["max_power"] = pd.to_numeric(dataset["max_power"].apply(lambda x : x.split(" ")[0]))
dataset["owner"] = dataset["owner"].map({"First Owner":1, "Second Owner":2, "Third Owner":3, "Fourth & Above Owner": 4, "Test Drive Car":0})
def extract_nm(s):
    if pd.isna(s):
        return None
    s = s.lower().replace(",", "")
    m = re.search(r"(\d+\.?\d*)", s)
    if not m:
        return None
    val = float(m.group(1))
    if "kgm" in s:
        return val * 9.80665
    else: 
        return val
dataset["torque"] = dataset["torque"].apply(extract_nm)
dataset.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,2014,450000,145500,Diesel,Individual,Manual,1,23.4,1248,74.0,190.0,5.0
1,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52,250.0,5.0
2,2006,158000,140000,Petrol,Individual,Manual,3,17.7,1497,78.0,124.544455,5.0
3,2010,225000,127000,Diesel,Individual,Manual,1,23.0,1396,90.0,219.66896,5.0
4,2007,130000,120000,Petrol,Individual,Manual,1,16.1,1298,88.2,112.776475,5.0


In [5]:
# Helper Functions
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

def columnRatio(X):
    return (X.iloc[:, 0] / X.iloc[:, 1]).to_frame()

def ratioName(function_transformer, feature_names_in):
    return ["ratio"]  

def ratioPipeline():
    return make_pipeline(
        FunctionTransformer(columnRatio, feature_names_out=ratioName),
        StandardScaler())


In [6]:
# Pipelines
from sklearn.compose import ColumnTransformer

logPipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler()
)
defaultPipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)
catPipeline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore")
)

preprocessing = ColumnTransformer(
    [("engineTorqueRatio", ratioPipeline(), ["engine", "torque"]),
     ("mileageEngineRatio", ratioPipeline(), ["mileage", "engine"]),
    ("mileageSeatRatio", ratioPipeline(), ["mileage", "seats"]),
    ("log", logPipeline, ["km_driven", "torque", "max_power"]),
    ("cat", catPipeline, ["fuel", "seller_type", "transmission"])],
    remainder = defaultPipeline
    )

In [7]:
# Data Splitting
# Since max_power is most correlated w/ selling price, stratified sampling based on max_power
from sklearn.model_selection import train_test_split

dataset["power_bin"] = pd.qcut(dataset["max_power"], 5, labels=[1,2,3,4,5])
trainSet, testSet = train_test_split(dataset, test_size=0.2, random_state=12345, stratify = dataset["power_bin"])
trainSet.drop(columns="power_bin", inplace=True)
testSet.drop(columns="power_bin", inplace=True)

trainLabels = trainSet["selling_price"].copy()
trainSet.drop(columns="selling_price", inplace=True)
trainLabelsLog = np.log(trainLabels)

testLabels = testSet["selling_price"].copy()
testSet.drop(columns="selling_price", inplace=True)
testLabelsLog = np.log(testLabels)

In [8]:
# Data Processing

processedTrainSet = preprocessing.fit_transform(trainSet)
pd.DataFrame(processedTrainSet, columns = preprocessing.get_feature_names_out(), index = trainSet.index)

Unnamed: 0,engineTorqueRatio__ratio,mileageEngineRatio__ratio,mileageSeatRatio__ratio,log__km_driven,log__torque,log__max_power,cat__fuel_CNG,cat__fuel_Diesel,cat__fuel_LPG,cat__fuel_Petrol,cat__seller_type_Dealer,cat__seller_type_Individual,cat__seller_type_Trustmark Dealer,cat__transmission_Automatic,cat__transmission_Manual,remainder__year,remainder__owner
5360,-1.077674,0.149263,0.765146,-0.834451,0.652728,0.095288,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.523714,-0.623459
684,-1.063191,-0.368774,0.379959,0.378782,0.907621,0.537078,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.763087,0.788197
4669,-0.780211,-1.177792,-1.457381,0.346471,1.131206,0.956099,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.523714,-0.623459
3603,-1.178923,-0.171293,0.722775,-0.448534,0.983481,1.102450,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.763087,0.788197
4149,-3.080619,-0.903055,-0.687008,0.864876,4.337162,0.566648,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-1.277807,-0.623459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4087,-0.995740,0.987581,1.585593,-0.384798,0.366187,-0.422330,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.781074,-0.623459
6180,-1.600176,-1.067742,-0.344191,-2.215349,1.834886,2.266397,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.295795,-0.623459
6684,-1.077331,0.201107,0.853739,1.258217,0.652449,0.135811,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.763087,-0.623459
639,-0.995740,0.502212,0.844109,-0.039446,0.366187,-0.422330,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.781074,-0.623459


### Training And Evaluating Models

In [None]:
# First, linear reg on unprocessed data
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

onehotTrainSet = pd.get_dummies(trainSet, dtype=int)
unprocessedLinearReg = LinearRegression()

rmses = -cross_val_score(unprocessedLinearReg, onehotTrainSet, trainLabels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(rmses).describe() #find median rmse

count        10.000000
mean     460094.426749
std       59681.583001
min      372351.276188
25%      428017.939878
50%      453288.804712
75%      508352.620535
max      544847.213480
dtype: float64

In [10]:
# Linear Regression

linearReg = make_pipeline(preprocessing, LinearRegression())

from sklearn.model_selection import cross_val_score

rmses = -cross_val_score(linearReg, trainSet, trainLabels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(rmses).describe() #find median rmse
# Likely does a little worse because our processing introduced some noisy variables

count        10.000000
mean     491378.268372
std       60413.765771
min      406029.422263
25%      464014.639382
50%      484370.612024
75%      519588.791649
max      588426.053990
dtype: float64

In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
decisionTree = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=12345))
rmses = -cross_val_score(decisionTree, trainSet, trainLabels, scoring="neg_root_mean_squared_error", cv=10, n_jobs=-1)
pd.Series(rmses).describe()["50%"] 

171260.96926293502

In [12]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
randForest = Pipeline([("preprocessing", preprocessing), ("randomforestregressor", RandomForestRegressor(random_state=12345))])
rmses = -cross_val_score(randForest, trainSet, trainLabels, scoring="neg_root_mean_squared_error", cv=10, n_jobs=-1)
pd.Series(rmses).describe()["50%"]

129233.63326140307

In [13]:
# Fine Tuning the Random Forest Model
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Also switched to log train labels to counter effects of skewed data
params = {"randomforestregressor__max_features": randint(2,35)}
randomSearch = RandomizedSearchCV(randForest, param_distributions=params, n_iter=15, cv=5, scoring="neg_root_mean_squared_error", random_state=12345, n_jobs=-1)
randomSearch.fit(trainSet, trainLabelsLog)
finalModel = randomSearch.best_estimator_

In [14]:
sorted(zip(finalModel["randomforestregressor"].feature_importances_.round(3), finalModel["preprocessing"].get_feature_names_out()))
# A lot of features aren't very important

[(0.0, 'cat__fuel_CNG'),
 (0.0, 'cat__fuel_LPG'),
 (0.0, 'cat__seller_type_Trustmark Dealer'),
 (0.001, 'cat__seller_type_Dealer'),
 (0.001, 'cat__seller_type_Individual'),
 (0.002, 'cat__fuel_Diesel'),
 (0.002, 'cat__fuel_Petrol'),
 (0.011, 'remainder__owner'),
 (0.014, 'cat__transmission_Manual'),
 (0.021, 'mileageSeatRatio__ratio'),
 (0.026, 'cat__transmission_Automatic'),
 (0.031, 'log__km_driven'),
 (0.039, 'engineTorqueRatio__ratio'),
 (0.044, 'mileageEngineRatio__ratio'),
 (0.168, 'log__torque'),
 (0.292, 'log__max_power'),
 (0.349, 'remainder__year')]

### Final Evaluation

In [15]:
predictions = np.exp(finalModel.predict(testSet))

np.sqrt(((predictions - testLabels) ** 2).mean())

# The Random Forest Model is able to get, on avg, ~126k away from the true car price

126192.23815321838

In [16]:
pd.Series(np.sqrt(((predictions - testLabels) ** 2))).describe().round(2)
# Median error is 35k

count       1582.00
mean       65426.95
std       107940.54
min            0.00
25%        13257.08
50%        35654.39
75%        75647.17
max      1410908.43
Name: selling_price, dtype: float64