In [None]:
import pandas as pd

# Load the raw car listings; the path is relative to the working directory.
df = pd.read_csv("car data.csv")

# First look at the raw data: leading rows, schema, and per-column null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.isnull().sum())

# Replace the model year with the car's age relative to the reference year 2025.
df['Car_Age'] = 2025 - df['Year']

# 'Year' is now carried by 'Car_Age', and 'Car_Name' is not used as a feature.
# Rebinding (instead of inplace=True) keeps the step easy to reason about.
df = df.drop(columns=['Year', 'Car_Name'])

# Integer codes used for each categorical column.
category_codes = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1, 'CNG': 2},
    'Selling_type': {'Individual': 0, 'Dealer': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}
for column, codes in category_codes.items():
    # Series.map() converts any label missing from the dict into NaN without
    # warning, which would silently corrupt the feature. Fail loudly instead.
    unexpected = set(df[column].dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"Unmapped values in column {column!r}: {sorted(map(str, unexpected))}"
        )
    df[column] = df[column].map(codes)

# Feature matrix: every remaining column except the target; target: selling price.
X = df.drop(columns=['Selling_Price'])
y = df['Selling_Price']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each partition, one per line.
print(
    f"X_train: {X_train.shape}",
    f"X_test : {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test : {y_test.shape}",
    sep="\n",
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

# Build the forest (200 estimators, fixed seed) and fit it in one expression;
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Persist the held-out targets and the predictions as .npy files.
np.save("y_test.npy", y_test)
np.save("y_pred.npy", y_pred)
print("Saved y_test.npy and y_pred.npy successfully!")

# Evaluation metrics on the held-out rows.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# Root of the mean squared residual.
rmse = np.sqrt(np.square(y_test - y_pred).mean())

print("R2 Score :", r2)
print("MAE      :", mae)
print("RMSE     :", rmse)


In [None]:

import matplotlib.pyplot as plt

# Scatter of the model's predictions against the true held-out prices.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.set(
    xlabel="Actual Selling Price",
    ylabel="Predicted Selling Price",
    title="Actual vs Predicted Car Selling Price",
)
ax.grid(True)
plt.show()

# Residuals: actual minus predicted, one value per held-out row.
errors = y_test - y_pred

# Histogram of the residuals in 20 bins.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(errors, bins=20)
ax.set(xlabel="Prediction Error", ylabel="Count", title="Error Distribution")
ax.grid(True)
plt.show()

import numpy as np

# Importance score the fitted model reports for each feature column of X.
importances = model.feature_importances_
feature_names = X.columns

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(feature_names, importances)
ax.set(xlabel="Importance Score", title="Feature Importance in Car Price Prediction")
ax.grid(True)
plt.show()


In [None]:
def predict_price(Present_Price, Driven_kms, Fuel_Type, Selling_type, Transmission, Owner, Car_Age):
    """Predict the selling price of a single car with the trained model.

    The arguments are placed, in the order given, into a one-row DataFrame
    whose column labels are taken from the module-level ``X``; that frame is
    passed to the module-level ``model``.

    Parameters
    ----------
    Present_Price : showroom price of the car.
    Driven_kms : value for the ``Driven_kms`` feature.
    Fuel_Type : encoded fuel type (0 = Petrol, 1 = Diesel, 2 = CNG).
    Selling_type : encoded seller type (0 = Individual, 1 = Dealer).
    Transmission : encoded transmission (0 = Manual, 1 = Automatic).
    Owner : value for the ``Owner`` feature.
    Car_Age : age of the car (the notebook derives it as ``2025 - Year``).

    Returns
    -------
    The first (and only) element of ``model.predict`` for that row.
    """
    feature_row = [Present_Price, Driven_kms, Fuel_Type, Selling_type, Transmission, Owner, Car_Age]
    # Values are matched to X's columns by position, so the argument order
    # above has to follow the column order of X.
    single_car = pd.DataFrame([feature_row], columns=X.columns)
    return model.predict(single_car)[0]


# Example query, spelled out as keyword arguments for predict_price().
sample_car = {
    "Present_Price": 8.5,       # showroom price
    "Driven_kms": 30000,
    "Fuel_Type": 0,             # 0 = Petrol
    "Selling_type": 1,          # 1 = Dealer
    "Transmission": 0,          # 0 = Manual
    "Owner": 0,
    "Car_Age": 2025 - 2018,
}
price = predict_price(**sample_car)

print("Predicted Selling Price:", price, "Lakhs")



In [None]:
import pickle

# Serialize the fitted model to model.pkl in the working directory.
with open("model.pkl", "wb") as file:
    pickle.dump(model, file)
print("Model saved successfully as model.pkl")


In [None]:
# Reload the pickled model inside a context manager so the file handle is
# closed deterministically (a bare open() passed to pickle.load leaks it).
# NOTE: only unpickle files you created yourself -- pickle.load() can execute
# arbitrary code embedded in an untrusted file.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Select the first test row with a list indexer so it stays a one-row
# DataFrame: the column names are preserved, which avoids scikit-learn's
# "X does not have valid feature names" warning for a model fitted on a DataFrame.
sample_pred = loaded_model.predict(X_test.iloc[[0]])
print("Loaded Model Prediction:", sample_pred)


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

# Directory that receives the saved figures; an already existing
# directory is reused as-is.
folder = "graphs"
os.makedirs(name=folder, exist_ok=True)
print("Folder created:", folder)

# Figure 1: predictions against the true held-out prices, written to disk.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.set(
    xlabel="Actual Selling Price",
    ylabel="Predicted Selling Price",
    title="Actual vs Predicted Car Selling Price",
)
ax.grid(True)

fig.savefig(f"{folder}/actual_vs_predicted.png")
# Close the figure so it is not displayed and its memory is released.
plt.close(fig)

# Figure 2: histogram of the residuals (actual minus predicted), written to disk.
errors = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(errors, bins=20)
ax.set(xlabel="Prediction Error", ylabel="Count", title="Error Distribution")
ax.grid(True)

fig.savefig(f"{folder}/error_distribution.png")
# Close the figure so it is not displayed and its memory is released.
plt.close(fig)

# Figure 3: importance score per feature column of X, written to disk.
importances = model.feature_importances_
feature_names = X.columns

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(feature_names, importances)
ax.set(xlabel="Importance Score", title="Feature Importance in Car Price Prediction")
ax.grid(True)

fig.savefig(f"{folder}/feature_importance.png")
# Close the figure so it is not displayed and its memory is released.
plt.close(fig)

print("All graphs saved successfully in 'graphs' folder!")
