In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Read the car dataset CSV from the working directory.
df = pd.read_csv(filepath_or_buffer="car_l3_dataset_step10.csv")

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_city,Location_rural,Location_suburb,CarAge,Km_per_year,Is_Urban,LogPrice
0,1500.0,0.12839,4.0,1,1998,1,0,0,1.686714,-0.615631,1,7.31322
1,4171.0,-0.044709,4.0,0,2016,0,1,0,-0.794617,0.070446,0,8.335911
2,5331.0,-0.440923,4.0,0,2014,0,0,1,-0.518913,-0.267993,1,8.581294
3,1500.0,0.203135,4.0,1,1999,0,0,1,1.548862,-0.587024,1,7.31322
4,1500.0,-0.044709,3.0,0,2022,1,0,0,-1.621727,1.738196,1,7.31322


In [3]:
# Remove rows that are identical across every column, keeping the first
# occurrence of each (this is drop_duplicates' default, spelled out here).
df = df.drop_duplicates(keep="first")

print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (140, 12)


In [4]:
# Derived features. This cell OVERWRITES the CarAge, Km_per_year and Is_Urban
# columns that were already present in the loaded CSV.

# Year that car age is measured against (previously an inline magic number).
REFERENCE_YEAR = 2024

# Car age in whole years relative to REFERENCE_YEAR.
df["CarAge"] = REFERENCE_YEAR - df["Year"]

# Odometer reading spread over the car's age; the +1 keeps the denominator
# non-zero for cars whose Year equals REFERENCE_YEAR.
# NOTE(review): Odometer_km in the loaded file contains negative values, so it
# looks already standardised rather than raw kilometres. The ratio below is
# therefore not a physical km/year figure -- confirm whether the Km_per_year
# column shipped with the CSV should be kept instead.
df["Km_per_year"] = df["Odometer_km"] / (df["CarAge"] + 1)

# Urban flag (example): 1 when the listing is located in a city, else 0.
# NOTE(review): as defined this is an exact copy of Location_city, whereas the
# Is_Urban column shipped with the CSV was also 1 for suburb rows -- confirm
# which definition is intended.
df["Is_Urban"] = df["Location_city"].eq(1).astype(int)

df.head()

Unnamed: 0,Price,Odometer_km,Doors,Accidents,Year,Location_city,Location_rural,Location_suburb,CarAge,Km_per_year,Is_Urban,LogPrice
0,1500.0,0.12839,4.0,1,1998,1,0,0,26,0.004755,1,7.31322
1,4171.0,-0.044709,4.0,0,2016,0,1,0,8,-0.004968,0,8.335911
2,5331.0,-0.440923,4.0,0,2014,0,0,1,10,-0.040084,0,8.581294
3,1500.0,0.203135,4.0,1,1999,0,0,1,25,0.007813,0,7.31322
4,1500.0,-0.044709,3.0,0,2022,1,0,0,2,-0.014903,1,7.31322


In [5]:
# Target is the raw price. LogPrice is dropped together with Price: in the
# previewed rows it equals ln(Price), so keeping it would leak the target.
y = df["Price"]
X = df.drop(["Price", "LogPrice"], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (112, 10)
Test shape: (28, 10)


In [6]:
# Linear model with scikit-learn's default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [7]:
# Random forest with 100 trees; the fixed seed makes the fitted forest repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X=X_train, y=y_train)

In [8]:
def evaluate_model(name, model, X=None, y=None):
    """Print R², MAE and RMSE for a fitted model and return its predictions.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X, y : optional
        Features and true target values to score against. When both are
        omitted, the notebook-level ``X_test`` / ``y_test`` are used, so the
        original two-argument call keeps working unchanged. Passing them
        explicitly allows scoring any other split (e.g. the training data).

    Returns
    -------
    The array returned by ``model.predict`` for the evaluated features.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied.
    """
    if X is None and y is None:
        # Backward-compatible default: the held-out split defined in the notebook.
        X, y = X_test, y_test
    elif X is None or y is None:
        raise ValueError("X and y must be provided together")

    y_pred = model.predict(X)

    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    # RMSE is the square root of the MSE, which puts it in the same units as y.
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    print(f"\n{name} Performance:")
    print(f"R²   : {r2:.3f}")
    print(f"MAE  : {mae:,.0f}")
    print(f"RMSE : {rmse:,.0f}")

    return y_pred

In [9]:
# Score both fitted models on the held-out split, in the same order as before,
# and keep each model's test-set predictions.
lr_pred, rf_pred = [
    evaluate_model(label, fitted)
    for label, fitted in (("Linear Regression", lr), ("Random Forest", rf))
]


Linear Regression Performance:
R²   : 0.425
MAE  : 1,411
RMSE : 1,957

Random Forest Performance:
R²   : 0.451
MAE  : 914
RMSE : 1,911


In [10]:
# Positional index (within the test split) of the row to inspect.
i = 3

# Selecting with a list keeps a one-row DataFrame, which predict() expects;
# the scalar lookup returns that row's true price.
actual = y_test.iloc[i]
sample_X = X_test.iloc[[i]]

# Each predict() call returns a one-element array; unpack its only value.
(lr_single,) = lr.predict(sample_X)
(rf_single,) = rf.predict(sample_X)

print("Actual Price       :", actual)
print("LR Predicted Price :", lr_single)
print("RF Predicted Price :", rf_single)

Actual Price       : 7009.0
LR Predicted Price : 5853.242177354492
RF Predicted Price : 7049.635
