In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df=pd.read_csv('car_details.csv')

In [4]:
df

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,Mahindra,XUV500 W8 [2015-2017],850000,2016,90300,Diesel,Manual,Surat,White,First,Individual,2179 cc,138 bhp @ 3750 rpm,330 Nm @ 1600 rpm,FWD,4585.0,1890.0,1785.0,7.0,70.0
2055,Hyundai,Eon D-Lite +,275000,2014,83000,Petrol,Manual,Ahmedabad,White,Second,Individual,814 cc,55 bhp @ 5500 rpm,75 Nm @ 4000 rpm,FWD,3495.0,1550.0,1500.0,5.0,32.0
2056,Ford,Figo Duratec Petrol ZXI 1.2,240000,2013,73000,Petrol,Manual,Thane,Silver,First,Individual,1196 cc,70 bhp @ 6250 rpm,102 Nm @ 4000 rpm,FWD,3795.0,1680.0,1427.0,5.0,45.0
2057,BMW,5-Series 520d Luxury Line [2017-2019],4290000,2018,60474,Diesel,Automatic,Coimbatore,White,First,Individual,1995 cc,188 bhp @ 4000 rpm,400 Nm @ 1750 rpm,RWD,4936.0,1868.0,1479.0,5.0,65.0


In [5]:
df.isnull().sum()

Unnamed: 0,0
Make,0
Model,0
Price,0
Year,0
Kilometer,0
Fuel Type,0
Transmission,0
Location,0
Color,0
Owner,0


In [6]:
#Drop rows where Price is missing (target variable must be complete)
df = df.dropna(subset=['Price'])

In [7]:
y=df['Price']

In [8]:
y

Unnamed: 0,Price
0,505000
1,450000
2,220000
3,799000
4,1950000
...,...
2054,850000
2055,275000
2056,240000
2057,4290000


In [9]:
df['Year'].unique()

array([2017, 2014, 2011, 2019, 2018, 2015, 2016, 2020, 2013, 2009, 2010,
       2021, 2012, 2006, 2022, 2007, 2008, 2004, 2002, 1988, 2000, 1996])

In [10]:
# Create derived feature: Car Age
df['Car_Age'] = 2025 - df['Year']

In [11]:
df['Car_Age'].unique()

array([ 8, 11, 14,  6,  7, 10,  9,  5, 12, 16, 15,  4, 13, 19,  3, 18, 17,
       21, 23, 37, 25, 29])

In [12]:
# Clean and convert Engine & Max Power
df['Engine'] = pd.to_numeric(df['Engine'].astype(str).str.replace(' cc', '', regex=False), errors='coerce')
df['Max Power'] = pd.to_numeric(df['Max Power'].astype(str).str.replace(' bhp', '', regex=False), errors='coerce')


In [13]:
#Drop unnecessary columns
df.drop(['Price', 'Max Torque', 'Color'], axis=1, inplace=True, errors='ignore')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Year                2059 non-null   int64  
 3   Kilometer           2059 non-null   int64  
 4   Fuel Type           2059 non-null   object 
 5   Transmission        2059 non-null   object 
 6   Location            2059 non-null   object 
 7   Owner               2059 non-null   object 
 8   Seller Type         2059 non-null   object 
 9   Engine              1979 non-null   float64
 10  Max Power           0 non-null      float64
 11  Drivetrain          1923 non-null   object 
 12  Length              1995 non-null   float64
 13  Width               1995 non-null   float64
 14  Height              1995 non-null   float64
 15  Seating Capacity    1995 non-null   float64
 16  Fuel T

In [15]:
# Handle missing values
for col in ['Engine', 'Max Power', 'Length', 'Width', 'Height', 'Seating Capacity', 'Fuel Tank Capacity']:
    df[col] = df[col].fillna(df[col].mean())

In [16]:
# For object columns: fill with mode
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [19]:
# One-hot encode categorical features
x = pd.get_dummies(df, drop_first=True)

In [20]:
x

Unnamed: 0,Year,Kilometer,Engine,Max Power,Length,Width,Height,Seating Capacity,Fuel Tank Capacity,Car_Age,...,Location_Zirakpur,Owner_First,Owner_Fourth,Owner_Second,Owner_Third,Owner_UnRegistered Car,Seller Type_Corporate,Seller Type_Individual,Drivetrain_FWD,Drivetrain_RWD
0,2017,87150,1198.0,,3990.0,1680.0,1505.0,5.0,35.00000,8,...,False,True,False,False,False,False,True,False,True,False
1,2014,75000,1248.0,,3995.0,1695.0,1555.0,5.0,42.00000,11,...,False,False,False,True,False,False,False,True,True,False
2,2011,67000,1197.0,,3585.0,1595.0,1550.0,5.0,35.00000,14,...,False,True,False,False,False,False,False,True,True,False
3,2019,37500,1197.0,,3995.0,1745.0,1510.0,5.0,37.00000,6,...,False,True,False,False,False,False,False,True,True,False
4,2018,69000,2393.0,,4735.0,1830.0,1795.0,7.0,55.00000,7,...,False,True,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,2016,90300,2179.0,,4585.0,1890.0,1785.0,7.0,70.00000,9,...,False,True,False,False,False,False,False,True,True,False
2055,2014,83000,814.0,,3495.0,1550.0,1500.0,5.0,32.00000,11,...,False,False,False,True,False,False,False,True,True,False
2056,2013,73000,1196.0,,3795.0,1680.0,1427.0,5.0,45.00000,12,...,False,True,False,False,False,False,False,True,True,False
2057,2018,60474,1995.0,,4936.0,1868.0,1479.0,5.0,65.00000,7,...,False,True,False,False,False,False,False,True,False,True


In [21]:
# Train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
#  Model: Random Forest
model = RandomForestRegressor()
model.fit(x_train, y_train)

In [24]:
#  Predict and evaluate
y_pred = model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [25]:
print(f"✅ Model trained successfully.")
print(f"✅ RMSE: {rmse:,.2f}")
print(f"✅ R² Score: {r2:.2f}")

✅ Model trained successfully.
✅ RMSE: 1,353,792.74
✅ R² Score: 0.74


In [27]:
import pickle

# 🔹 Save the trained model to a file
with open('car_price_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("✅ Model saved as car_price_model.pkl")


✅ Model saved as car_price_model.pkl
