# Imports

In [80]:
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Load Dataset

In [81]:
# Read the raw listings and discard the two columns that are not used downstream.
df = (
    pd.read_csv("used_car_price.csv", encoding="latin1")
    .drop(columns=['Name', 'Location'])
)
df.head()

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


# Preprocessing and cleaning

In [82]:
def extract_number(text):
    """Return the first numeric token found in ``text`` as a float.

    Used to strip unit suffixes from values such as ``"19.67 kmpl"``,
    ``"998 CC"`` or ``"58.16 bhp"``.

    Parameters
    ----------
    text : Any
        Value to parse; it is converted with ``str()`` first, so non-string
        inputs (``None``, NaN, numbers) are accepted.

    Returns
    -------
    float or None
        The first number in the text, or ``None`` when the text contains no
        digits (e.g. ``"null bhp"``, ``"nan"``, ``"."``).
    """
    # Match a well-formed decimal ("12", "12.5", "12.", ".5") rather than any
    # run of digits and dots: a bare "." or "1.2.3" would otherwise reach
    # float() and raise ValueError.
    match = re.search(r"\d+\.?\d*|\.\d+", str(text))
    if match is None:
        return None
    return float(match.group())

# Replace the unit-suffixed text columns with their numeric value
# (entries without any digits come back as missing).
df = df.assign(
    Mileage=df['Mileage'].apply(extract_number),
    Engine=df['Engine'].apply(extract_number),
    Power=df['Power'].apply(extract_number),
)

In [83]:
# Keep only rows where every column has a value, then report the remaining size.
df = df.dropna()
df.shape

(5924, 10)

In [84]:
# Show the cleaned frame; the rendered output elides the middle rows.
df

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,2010,72000,CNG,Manual,First,26.60,998.0,58.16,5.0,1.75
1,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50
2,2011,46000,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.50
3,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.00
4,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...
7290,2008,74270,CNG,Manual,First,11.14,1586.0,50.00,5.0,1.65
7291,2012,80173,CNG,Manual,First,12.14,998.0,42.00,5.0,2.95
7292,2019,78600,CNG,Manual,First,30.48,1086.0,58.00,5.0,5.10
7293,2014,76256,CNG,Manual,First,26.60,998.0,58.00,5.0,3.45


# Train Test Split

In [85]:

# Target is the price column; every other column is a candidate feature.
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Pipeline

In [86]:
# Column groups routed to the two preprocessing branches below.
categorical_features = ['Fuel_Type', 'Transmission', 'Owner_Type']
numerical_features = ['Year', 'Kilometers_Driven', 'Seats','Mileage','Engine','Power']

# Preprocessing: one-hot encode the categoricals (categories unseen during fit
# are ignored instead of raising at predict time) and standardize the numerics.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Pipeline: preprocessing followed by the regressor, so the same transforms are
# applied at fit and predict time.
# random_state is pinned (same seed as the train/test split) because a random
# forest is stochastic; without it the fitted model and the reported metrics
# change on every run.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train
model_pipeline.fit(X_train, y_train)

# Evaluation

In [87]:
# r2_score, mean_squared_error and np are imported in the notebook's top import
# cell so that all dependencies are declared in one place.

# Predict on the held-out split
y_pred = model_pipeline.predict(X_test)

# Metrics: R2 plus MSE and its square root (RMSE is in the same units as Price)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R2 Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")


R2 Score: 0.9063
MSE: 10.1542
RMSE: 3.1866


# Prediction System

In [90]:
def predict_car_price(model_pipeline, year, kilometers_driven, fuel_type, transmission, owner_type, seats, Mileage, Engine, Power):
    """Predict the price of a single car with a fitted pipeline.

    Parameters
    ----------
    model_pipeline : object
        Fitted estimator exposing ``predict``; it receives a one-row DataFrame.
    year, kilometers_driven, seats : number
        Values for the ``Year``, ``Kilometers_Driven`` and ``Seats`` columns.
    fuel_type, transmission, owner_type : str
        Values for the ``Fuel_Type``, ``Transmission`` and ``Owner_Type`` columns.
    Mileage, Engine, Power : number
        Already-numeric values (unit suffix stripped) for the ``Mileage``,
        ``Engine`` and ``Power`` columns.

    Returns
    -------
    The first (and only) element of the pipeline's prediction.
    """
    # Map each argument to the column name the pipeline was trained on.
    row = {
        'Year': year,
        'Kilometers_Driven': kilometers_driven,
        'Fuel_Type': fuel_type,
        'Transmission': transmission,
        'Owner_Type': owner_type,
        'Seats': seats,
        'Mileage': Mileage,
        'Engine': Engine,
        'Power': Power,
    }

    # Wrap every value in a one-element list to get a single-row frame.
    features = pd.DataFrame({column: [value] for column, value in row.items()})

    return model_pipeline.predict(features)[0]

# Example: price a 2015 diesel, manual, first-owner car
predicted_price = predict_car_price(
    model_pipeline,
    year=2015,
    kilometers_driven=41000,
    seats=5.0,
    fuel_type='Diesel',
    transmission='Manual',
    owner_type='First',
    Mileage=19.67,  # kmpl
    Engine=1582,    # CC
    Power=126.2,    # bhp
)

print(f"Predicted Price: {predicted_price:.2f} Lakh")


Predicted Price: 11.94 Lakh


In [92]:
# Second example: a 2010 CNG, manual, first-owner car
predicted_price = predict_car_price(
    model_pipeline,
    year=2010,
    kilometers_driven=72000,
    seats=5.0,
    fuel_type='CNG',
    transmission='Manual',
    owner_type='First',
    Mileage=26.6,  # km/kg
    Engine=998,    # CC
    Power=58.16,   # bhp
)

print(f"Predicted Price: {predicted_price:.2f} Lakh")

Predicted Price: 1.84 Lakh


# Save Pipeline

In [93]:
# pickle is imported in the notebook's top import cell.
# Serialize the fitted pipeline (preprocessing + regressor) to disk.
# The context manager closes the file, so the data is flushed and the handle
# is not leaked as it was with a bare open() passed straight to dump().
with open("pipeline.pkl", 'wb') as pipeline_file:
    pickle.dump(model_pipeline, pipeline_file)