<a href="https://colab.research.google.com/github/theamitmehra/Oasis-Infobyte-projects/blob/main/Cars_Sales_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load and Inspect the Dataset

In [None]:
import pandas as pd

# Read the raw car listings from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/car data.csv')
df.head(n=5)

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


## Data Preprocessing

### Feature Engineering

In [None]:
# Car age is measured against a fixed reference year so that results are
# reproducible; any data fed to the trained model later must derive Car_Age
# with the same reference year.
REFERENCE_YEAR = 2024

# Guarded so the cell can be re-run safely: once 'Year' has been dropped,
# recomputing Car_Age from it would raise KeyError.
if 'Car_Age' not in df.columns:
    df = df.assign(Car_Age=REFERENCE_YEAR - df['Year'])

# 'Year' is now encoded in Car_Age and 'Car_Name' is not used as a feature.
# errors='ignore' makes the drop a no-op when the columns are already gone.
df = df.drop(columns=['Year', 'Car_Name'], errors='ignore')

In [None]:
# Target column the model learns to predict.
y = df['Selling_Price']
# Everything except the target is used as model input.
X = df.drop(columns=['Selling_Price'])

### Feature Encoding

In [None]:
# Text-valued columns that are passed to the one-hot encoder.
categorical_features = [
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, dropping the first level of each
# feature so the remaining indicator columns are not redundant.
categorical_transformer = OneHotEncoder(
    drop='first',
)

### Feature Scaling

In [None]:
# Numeric columns that are passed to the scaler (order is preserved downstream).
numerical_features = [
    'Present_Price',
    'Driven_kms',
    'Car_Age',
    'Owner',
]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns with scikit-learn's default scaler settings.
numerical_transformer = StandardScaler(
)

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group to its transformer. The listing order ('cat' then
# 'num') is also the column order of the transformed output.
Preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
])

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a 0.2 fraction of the data for evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Build the Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Random Forest Model
forest = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted together on the
# training data and applied together at prediction time.
model_pipeline = Pipeline([
    ('preprocessor', Preprocessor),
    ('model', forest),
])

model_pipeline.fit(X_train, y_train)


## Model Evaluation

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Predict on the training and held-out partitions with the fitted pipeline.
y_train_pred, y_test_pred = (
    model_pipeline.predict(partition) for partition in (X_train, X_test)
)

In [None]:
# Report fit quality on the training data and error on the held-out data.
for label, value in (
    ("Train R2 Score:", r2_score(y_train, y_train_pred)),
    ("Test R2 Score:", r2_score(y_test, y_test_pred)),
    ("Test MAE:", mean_absolute_error(y_test, y_test_pred)),
):
    print(label, value)

Train R2 Score: 0.9853769597782989
Test R2 Score: 0.9628251202739098
Test MAE: 0.6012905788258337


## Feature Importance

In [None]:
# Feature Importance
import numpy as np

# Take the column names from the fitted ColumnTransformer itself so each label
# matches the column the forest actually saw. Its output lists the 'cat'
# (one-hot) columns first and the 'num' columns second; building the labels by
# hand from numerical_features plus the scaler's names repeated the numeric
# names and mislabelled the one-hot columns.
# By default the names are prefixed with the transformer name,
# e.g. 'num__Present_Price'.
fitted_preprocessor = model_pipeline.named_steps['preprocessor']
feature_names = list(fitted_preprocessor.get_feature_names_out())
feature_importances = model_pipeline.named_steps['model'].feature_importances_

# One row per transformed column, most important first.
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print(importance_df.sort_values(by='Importance', ascending=False))


         Feature  Importance
4  Present_Price    0.891155
6        Car_Age    0.054884
5     Driven_kms    0.037536
3          Owner    0.006442
2        Car_Age    0.003550
0  Present_Price    0.002693
1     Driven_kms    0.002101
7          Owner    0.001641


## Save the Model

In [None]:
import joblib

# Persist the whole fitted pipeline (preprocessing + forest) to disk; the call
# returns the list of file names written, which the cell displays.
joblib.dump(
    value=model_pipeline,
    filename='car_price_prediction_model.pkl',
)

['car_price_prediction_model.pkl']