### 1. Import Modules

In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [68]:
cars_data = pd.read_csv('./data/cardetails.csv')

### 2. Clean the Datasets

In [69]:
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [70]:
cars_data.drop(columns=['torque'], inplace=True)

In [71]:
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [72]:
cars_data.shape

(8128, 12)

In [73]:
cars_data.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
seats            221
dtype: int64

In [74]:
cars_data.dropna(inplace=True)

In [75]:
cars_data.shape

(7907, 12)

In [76]:
cars_data.duplicated().sum()

1189

In [77]:
cars_data.drop_duplicates(inplace=True)

In [78]:
cars_data.shape

(6718, 12)

In [79]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


### 3. Explore Data

In [80]:
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [81]:
# Get brand name of the car without the model
def get_brand_name(car_name):
    car_name = car_name.strip().split(' ')[0]
    return car_name

In [82]:
# Get the value index of the categories
def clean_data(value):
    if isinstance(value, str):
        cleaned_value = ''
        decimal_found = False
        for char in value:
            if char.isdigit():
                cleaned_value += char
            elif char == '.' and not decimal_found:
                cleaned_value += char
                decimal_found = True
        value = cleaned_value
        
    if value == '':
        return 0
    return float(value)

In [83]:
cars_data['name'] = cars_data['name'].apply(get_brand_name)

In [84]:
cars_data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [85]:
cars_data['mileage'] = cars_data['mileage'].apply(clean_data)
cars_data['engine'] = cars_data['engine'].apply(clean_data)
cars_data['max_power'] = cars_data['max_power'].apply(clean_data)

In [86]:
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [87]:
categorical_columns = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
# numerical_columns = ['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

In [88]:
# Encode columns contain string type
label_encoder = LabelEncoder()
for column in categorical_columns:
    cars_data[column] = label_encoder.fit_transform(cars_data[column])


In [89]:
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,2014,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,26,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,10,2006,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,11,2010,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,20,2007,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


### 4. Train the Model

In [90]:
input_data = cars_data.drop(columns=['selling_price'])
output_data = cars_data['selling_price']

In [91]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, random_state=0)

In [92]:
# # Find the best parameter for the model & train the model
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 4, 8, 10, 20],
#     'min_samples_leaf': [1, 2, 5]
# }

# rf = RandomForestRegressor(random_state=0)

# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# grid_search.fit(X_train, y_train)

# best_rf = grid_search.best_estimator_

# y_pred = best_rf.predict(X_test)

# print(f"Best Parameters: {grid_search.best_params_}")

In [93]:
rf = RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300, random_state=0)

rf.fit(X_test, y_test)

y_pred = rf.predict(X_test)

In [94]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 36686.97152316193


In [95]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
5666,620000,613266.666667,6733.333333
7916,600000,545283.330000,54716.670000
2146,330000,339950.000000,-9950.000000
7462,550000,523583.316667,26416.683333
5387,375000,355029.966667,19970.033333
...,...,...,...
6909,300000,301386.653333,-1386.653333
2306,185000,191159.996667,-6159.996667
7082,300000,289226.666667,10773.333333
1205,511000,487549.966667,23450.033333


### 5. Test the model

In [96]:
X_train.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
3125,20,2007,75000,3,1,1,0,19.7,796.0,46.3,5.0


In [97]:
input_data_model = pd.DataFrame([[10, 2019, 500000, 3, 1, 1, 1, 20, 600, 90, 4]],
                                columns = ['name','year','km_driven','fuel','seller_type','transmission','owner','mileage','engine','max_power','seats'])

In [98]:
print(rf.predict(input_data_model))

[631063.29]


In [99]:
import pickle as pk

pk.dump(rf, open('model.pkl', 'wb'))