In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_csv(r"C:\Users\BASHA\Desktop\machine learning\car data.csv")
dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


**Exploring the dataset**

In [3]:
dataset.shape

(301, 9)

In [4]:
dataset.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
dataset.select_dtypes(include='object').columns

Index(['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission'], dtype='object')

In [7]:
dataset.select_dtypes(include=['float64', 'int64']).columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner'], dtype='object')

In [8]:
dataset.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [9]:
# for NA values

dataset.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [10]:
dataset = dataset.drop(columns='Car_Name')

In [11]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [12]:
dataset['Current Year'] = 2024

In [13]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2024
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2024
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2024
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2024
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2024


In [14]:
dataset['Years old'] = dataset['Current Year'] - dataset['Year']

In [15]:
dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year,Years old
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2024,10
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2024,11
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2024,7
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2024,13
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2024,10


In [16]:
dataset = dataset.drop(columns=['Current Year', 'Year'])
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Years old
0,3.35,5.59,27000,Petrol,Dealer,Manual,0,10
1,4.75,9.54,43000,Diesel,Dealer,Manual,0,11
2,7.25,9.85,6900,Petrol,Dealer,Manual,0,7
3,2.85,4.15,5200,Petrol,Dealer,Manual,0,13
4,4.6,6.87,42450,Diesel,Dealer,Manual,0,10


**Encoding categorical variables**

In [17]:
dataset.select_dtypes(include='object').columns

Index(['Fuel_Type', 'Seller_Type', 'Transmission'], dtype='object')

In [18]:
#one hot encoding

dataset = pd.get_dummies(data=dataset, drop_first=True).astype(int)

In [19]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Years old,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3,5,27000,0,10,0,1,0,1
1,4,9,43000,0,11,1,0,0,1
2,7,9,6900,0,7,0,1,0,1
3,2,4,5200,0,13,0,1,0,1
4,4,6,42450,0,10,1,0,0,1


In [20]:
dataset.shape

(301, 9)

**Splitting the data**

In [21]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Years old,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3,5,27000,0,10,0,1,0,1
1,4,9,43000,0,11,1,0,0,1
2,7,9,6900,0,7,0,1,0,1
3,2,4,5200,0,13,0,1,0,1
4,4,6,42450,0,10,1,0,0,1


In [22]:
# feature matrix

X = dataset.drop(columns='Selling_Price')

In [23]:
# target variable
y = dataset['Selling_Price']

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)

In [25]:
X_train.shape

(240, 8)

In [26]:
X_test.shape

(61, 8)

**Model building**

In [27]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

In [28]:
y_pred = regressor.predict(X_test)

In [29]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9527394093971288

**Hyperparameter Tuning**

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [31]:
parameters = {
    'n_estimators':[100,200,300,400,500,600,700,800,900,1000],
    'criterion':['mse','absolute_error'],
    'max_depth':[10,20,30,40,50],
    'min_samples_split':[2,5,10,20,50],
    'min_samples_leaf':[1,2,5,10],
    'max_features':['auto','sqrt','log2']
}

In [32]:
parameters

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
 'criterion': ['mse', 'absolute_error'],
 'max_depth': [10, 20, 30, 40, 50],
 'min_samples_split': [2, 5, 10, 20, 50],
 'min_samples_leaf': [1, 2, 5, 10],
 'max_features': ['auto', 'sqrt', 'log2']}

In [33]:
random_cv = RandomizedSearchCV(estimator=regressor, param_distributions=parameters, n_iter=10,
                              scoring='neg_mean_absolute_error', cv=5, verbose=2, n_jobs=-1)

In [34]:
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\BASHA\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BASHA\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\BASHA\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\BASHA\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParamete

In [35]:
random_cv.best_estimator_

In [36]:
random_cv.best_params_

{'n_estimators': 900,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 10,
 'criterion': 'absolute_error'}

**Predicting a observation**

In [37]:
dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Years old,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3,5,27000,0,10,0,1,0,1
1,4,9,43000,0,11,1,0,0,1
2,7,9,6900,0,7,0,1,0,1
3,2,4,5200,0,13,0,1,0,1
4,4,6,42450,0,10,1,0,0,1


In [38]:
single_obs = [[8.50,3500,0,5,1,0,0,1]]

In [39]:
regressor.predict(single_obs)



array([6.97])