In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

In [10]:
df = pd.read_csv('../OLX_cars_dataset00.csv')

# Remove outliers
lower_bound = df['Price'].quantile(0.01)  # 1st percentile
upper_bound = df['Price'].quantile(0.99)  # 99th percentile
df = df[(df['Price'] >= lower_bound) & (df['Price'] <= upper_bound)]


# Remove useless columns (based on distribution and obviously useless columns)
df = df.drop(['Car Name', 'Condition', 'Description', 'Car Features', "Images URL's", 'Car Profile', 'Ad ID', 'Car documents', 'Fuel', 'Seller Location'], axis=1)
df.head()

Unnamed: 0,Make,Model,Year,KM's driven,Price,Registration city,Assembly,Transmission
0,Toyota,Passo,2021,54000,4190000,Unregistered,Imported,Automatic
1,Suzuki,Ravi,2018,95000,1300000,Karachi,Local,Manual
2,Suzuki,Bolan,2015,50000,800000,Karachi,Local,Manual
3,Daihatsu,Move,2013,94000,2155000,Lahore,Imported,Automatic
4,Suzuki,Swift,2011,126544,1440000,Karachi,Local,Manual


In [11]:
# One hot encode
df = pd.get_dummies(df, columns=['Make', 'Model', "KM's driven", 'Assembly', 'Transmission', 'Registration city'])
df.head()

Unnamed: 0,Year,Price,Make_Changan,Make_Chevrolet,Make_Daihatsu,Make_FAW,Make_Honda,Make_Hyundai,Make_KIA,Make_Mercedes,...,Registration city_Sialkot,Registration city_Sindh,Registration city_Swabi,Registration city_Swat,Registration city_Toba Tek singh,Registration city_Unknown,Registration city_Unregistered,Registration city_Vehari,Registration city_Wah,Registration city_Wazirabad
0,2021,4190000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,2018,1300000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2015,800000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2013,2155000,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2011,1440000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Initialize variables
y = df['Price']
X = df.drop(['Price'], axis=1)

# Split into training & testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [114]:
# Initialize the Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=21
)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("R² Score:", r2)
print("MAPE:", mape)

# See if it passes the 10% threshold
if mape < 0.10:  # 0.10 = 10%
    print(f"✅ Passed: MAPE is {mape*100:.2f}% (<10%)")
else:
    print(f"❌ Failed: MAPE is {mape*100:.2f}% (>10%)")



R² Score: 0.9506742203449255
MAPE: 0.09638845487934829
✅ Passed: MAPE is 9.64% (<10%)
