In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('https://media.githubusercontent.com/media/musthave-ML10/data_source/main/car.csv')

In [4]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
name             8128 non-null object
year             8128 non-null int64
selling_price    8128 non-null int64
km_driven        8128 non-null int64
fuel             8128 non-null object
seller_type      8128 non-null object
transmission     8128 non-null object
owner            8128 non-null object
mileage          7907 non-null object
engine           7907 non-null object
max_power        7913 non-null object
torque           7906 non-null object
seats            7907 non-null float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [6]:
round(df.describe(),2)

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.8,638271.81,69819.51,5.42
std,4.04,806253.4,56550.55,0.96
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [7]:
df['engine'].str.split(expand=True)

Unnamed: 0,0,1
0,1248,CC
1,1498,CC
2,1497,CC
3,1396,CC
4,1298,CC
5,1197,CC
6,1061,CC
7,796,CC
8,1364,CC
9,1399,CC


In [8]:
df[['engine','engine_unit']] = df['engine'].str.split(expand=True)

In [9]:
df['engine'].head()

0    1248
1    1498
2    1497
3    1396
4    1298
Name: engine, dtype: object

In [10]:
df['engine'] = df['engine'].astype('float32')

In [11]:
df['engine'].head()

0    1248.0
1    1498.0
2    1497.0
3    1396.0
4    1298.0
Name: engine, dtype: float32

In [12]:
df['engine_unit'].unique()

array(['CC', nan], dtype=object)

In [13]:
df.drop('engine_unit', axis=1, inplace=True)

In [14]:
df[['max_power','max_power_unit']] = df['max_power'].str.split(expand=True)

In [15]:
df['max_power'].head()

0        74
1    103.52
2        78
3        90
4      88.2
Name: max_power, dtype: object

In [16]:
df['max_power'] = df['max_power'].astype('float32')

ValueError: could not convert string to float: 'bhp'

In [17]:
df[df['max_power'] =='bhp']

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,max_power_unit
4933,Maruti Omni CNG,2000,80000,100000,CNG,Individual,Manual,Second Owner,10.9 km/kg,796.0,bhp,,8.0,


In [18]:
def isFloat(value):
    try:
        num = float(value)
        return num
    except ValueError:
        return np.NaN

In [19]:
df['max_power'] = df['max_power'].apply(isFloat)

In [20]:
df['max_power_unit'].unique()

array(['bhp', nan, None], dtype=object)

In [21]:
df.drop('max_power_unit',  axis=1, inplace=True)

In [22]:
df[['mileage','mileage_unit']] = df['mileage'].str.split(expand=True)

In [23]:
df['mileage'] = df['mileage'].astype('float32')

In [24]:
df['mileage_unit'].unique()

array(['kmpl', 'km/kg', nan], dtype=object)

In [25]:
df['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [26]:
def mile(x):
    if x['fuel'] =='Petrol':
        return x['mileage']/ 80.43
    elif x['fuel'] =='Diesel':
        return x['mileage'] / 73.56
    elif x['fuel'] =='LPG':
        return x['mileage'] / 40.85
    else:
        return x['mileage'] / 44.23

In [27]:
df['mileage'] = df.apply(mile, axis=1)

In [28]:
df.drop('mileage_unit',axis=1,inplace=True)

In [29]:
df['torque'].head()

0              190Nm@ 2000rpm
1         250Nm@ 1500-2500rpm
2       12.7@ 2,700(kgm@ rpm)
3    22.4 kgm at 1750-2750rpm
4       11.5@ 4,500(kgm@ rpm)
Name: torque, dtype: object

In [30]:
df['torque']= df['torque'].str.upper()

In [31]:
def torque_unit(x):
    if 'NM' in str(x):
        return 'Nm'
    elif 'KGM' in str(x):
        return 'kgm'

In [32]:
df['torque_unit'] = df['torque'].apply(torque_unit)

In [33]:
df['torque_unit'].unique()

array(['Nm', 'kgm', None], dtype=object)

In [34]:
df['torque_unit'].isna()
df[df['torque_unit'].isna()]
df[df['torque_unit'].isna()]['torque'].unique()

array([nan, '250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)

In [35]:
df[df['torque_unit'].isna()]['torque'].unique()

array([nan, '250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)

In [36]:
df['torque_unit'].fillna('Nm',inplace=True)

In [42]:
def split_num(x):
    x = str(x)
    for i,j in enumerate(x):
        if j not in '0123456789.':
            cut = i
            break
    return x[:cut]

In [43]:
df['torque'] = df['torque'].apply(split_num)

In [44]:
df['torque']

0          190
1          250
2         12.7
3         22.4
4         11.5
5       113.75
6          7.8
7           59
8          170
9          160
10         248
11          78
12         190
13            
14          84
15         115
16         200
17        22.4
18          62
19       219.7
20         160
21         200
22         190
23         200
24         114
25         115
26          69
27       172.5
28          84
29          59
         ...  
8098       190
8099       330
8100       260
8101       197
8102     259.8
8103          
8104      13.5
8105      11.3
8106     259.8
8107       200
8108       250
8109       215
8110        90
8111       170
8112       215
8113      20.4
8114        62
8115        59
8116        90
8117       190
8118     113.7
8119        90
8120      96.1
8121        90
8122     219.7
8123     113.7
8124        24
8125       190
8126       140
8127       140
Name: torque, Length: 8128, dtype: object

In [45]:
df['torque'] = df['torque'].astype('float64')

ValueError: could not convert string to float: 

In [46]:
df['torque'] = df['torque'].replace('', np.NaN)

In [47]:
df['torque'] = df['torque'].astype('float64')

In [48]:
df['torque'].head()

0    190.0
1    250.0
2     12.7
3     22.4
4     11.5
Name: torque, dtype: float64

In [50]:
def torque_trans(x):
    if x['torque_unit'] =='kgm':
        return x['torque']*9.8066
    else:
        return x['torque']

In [51]:
df['torque'] = df.apply(torque_trans, axis=1)

In [52]:
df.drop('torque_unit',axis=1, inplace=True)

In [53]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,0.318108,1248.0,74.0,190.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,0.287384,1498.0,103.52,250.0,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,0.220067,1497.0,78.0,124.54382,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,0.31267,1396.0,90.0,219.66784,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,0.200174,1298.0,88.2,112.7759,5.0


In [54]:
df['name'] = df['name'].str.split(expand=True)[0]

In [56]:
df['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [57]:
df['name'] = df['name'].replace('Land','Land Rover')

In [58]:
df.isna().mean()

name             0.000000
year             0.000000
selling_price    0.000000
km_driven        0.000000
fuel             0.000000
seller_type      0.000000
transmission     0.000000
owner            0.000000
mileage          0.027190
engine           0.027190
max_power        0.026575
torque           0.027313
seats            0.027190
dtype: float64

In [59]:
df.dropna(inplace=True)
len(df)

7906

In [60]:
df = pd.get_dummies(df, columns = ['name','fuel','seller_type','transmission','owner'], drop_first=True)

In [62]:
df.head()

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,torque,seats,name_Ashok,name_Audi,...,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2014,450000,145500,0.318108,1248.0,74.0,190.0,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
1,2014,370000,120000,0.287384,1498.0,103.52,250.0,5.0,0,0,...,1,0,0,1,0,1,0,1,0,0
2,2006,158000,140000,0.220067,1497.0,78.0,124.54382,5.0,0,0,...,0,0,1,1,0,1,0,0,0,1
3,2010,225000,127000,0.31267,1396.0,90.0,219.66784,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
4,2007,130000,120000,0.200174,1298.0,88.2,112.7759,5.0,0,0,...,0,0,1,1,0,1,0,0,0,0


In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('selling_price', axis=1), df['selling_price'],test_size=0.2, random_state=100)

In [72]:
from sklearn.ensemble import RandomForestRegressor

In [73]:
model = RandomForestRegressor(random_state=100)

In [74]:
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [75]:
from sklearn.metrics import mean_squared_error
print("train_rmse:", mean_squared_error(y_train, train_pred)**0.5, "test_rsme:", mean_squared_error(y_test, test_pred)**0.5)

train_rmse: 60091.23988811351 test_rsme: 132476.64010840387


In [76]:
from sklearn.model_selection import KFold

In [77]:
df

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,torque,seats,name_Ashok,name_Audi,...,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2014,450000,145500,0.318108,1248.0,74.00,190.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
1,2014,370000,120000,0.287384,1498.0,103.52,250.00000,5.0,0,0,...,1,0,0,1,0,1,0,1,0,0
2,2006,158000,140000,0.220067,1497.0,78.00,124.54382,5.0,0,0,...,0,0,1,1,0,1,0,0,0,1
3,2010,225000,127000,0.312670,1396.0,90.00,219.66784,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
4,2007,130000,120000,0.200174,1298.0,88.20,112.77590,5.0,0,0,...,0,0,1,1,0,1,0,0,0,0
5,2017,440000,45000,0.250404,1197.0,81.86,113.75000,5.0,0,0,...,0,0,1,1,0,1,0,0,0,0
6,2007,96000,175000,0.423501,1061.0,57.50,76.49148,5.0,0,0,...,0,1,0,1,0,1,0,0,0,0
7,2001,45000,5000,0.200174,796.0,37.00,59.00000,4.0,0,0,...,0,0,1,1,0,1,0,1,0,0
8,2011,350000,90000,0.320691,1364.0,67.10,170.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
9,2013,200000,169000,0.271887,1399.0,68.10,160.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0


In [78]:
df.reset_index(drop=True, inplace=True)

In [79]:
df

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,torque,seats,name_Ashok,name_Audi,...,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2014,450000,145500,0.318108,1248.0,74.00,190.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
1,2014,370000,120000,0.287384,1498.0,103.52,250.00000,5.0,0,0,...,1,0,0,1,0,1,0,1,0,0
2,2006,158000,140000,0.220067,1497.0,78.00,124.54382,5.0,0,0,...,0,0,1,1,0,1,0,0,0,1
3,2010,225000,127000,0.312670,1396.0,90.00,219.66784,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
4,2007,130000,120000,0.200174,1298.0,88.20,112.77590,5.0,0,0,...,0,0,1,1,0,1,0,0,0,0
5,2017,440000,45000,0.250404,1197.0,81.86,113.75000,5.0,0,0,...,0,0,1,1,0,1,0,0,0,0
6,2007,96000,175000,0.423501,1061.0,57.50,76.49148,5.0,0,0,...,0,1,0,1,0,1,0,0,0,0
7,2001,45000,5000,0.200174,796.0,37.00,59.00000,4.0,0,0,...,0,0,1,1,0,1,0,1,0,0
8,2011,350000,90000,0.320691,1364.0,67.10,170.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0
9,2013,200000,169000,0.271887,1399.0,68.10,160.00000,5.0,0,0,...,1,0,0,1,0,1,0,0,0,0


In [80]:
kf=KFold(n_splits=5)

In [81]:
X=df.drop('selling_price', axis=1)
y=df['selling_price']

In [82]:
for i,j in kf.split(X):
    print(i,j)

[1582 1583 1584 ... 7903 7904 7905] [   0    1    2 ... 1579 1580 1581]
[   0    1    2 ... 7903 7904 7905] [1582 1583 1584 ... 3160 3161 3162]
[   0    1    2 ... 7903 7904 7905] [3163 3164 3165 ... 4741 4742 4743]
[   0    1    2 ... 7903 7904 7905] [4744 4745 4746 ... 6322 6323 6324]
[   0    1    2 ... 6322 6323 6324] [6325 6326 6327 ... 7903 7904 7905]


In [83]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index],X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [84]:
train_rmse_total = []
test_rmse_total = []

In [87]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index],X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = RandomForestRegressor(random_state=100)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    train_rmse= mean_squared_error(y_train, train_pred)**0.5 
    test_rmse= mean_squared_error(y_test, test_pred)**0.5
    
    train_rmse_total.append(train_rmse)
    test_rmse_total.append(test_rmse)
    

In [88]:
train_rmse_total

[63077.94657794671,
 63077.94657794671,
 63077.94657794671,
 65600.89151855985,
 70286.92790264332,
 63126.25846374735,
 69548.52572421309]

In [89]:
print('train_rmse:',sum(train_rmse_total)/5, 'test_rmse:',sum(test_rmse_total)/5)

train_rmse: 91559.28866860076 test_rmse: 146383.47358831842


In [91]:
train_rmse_total = []
test_rmse_total = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index],X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = RandomForestRegressor(n_estimators=300, max_depth = 50, min_samples_split = 5, min_samples_leaf=1, n_jobs=-1 ,random_state=100)
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    train_rmse= mean_squared_error(y_train, train_pred)**0.5 
    test_rmse= mean_squared_error(y_test, test_pred)**0.5
    
    train_rmse_total.append(train_rmse)
    test_rmse_total.append(test_rmse)
    


In [92]:
print('train_rmse:',sum(train_rmse_total)/5, 'test_rmse:',sum(test_rmse_total)/5)

train_rmse: 66762.87840477741 test_rmse: 142206.09679835328
