In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('../data/car.csv')

data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [31]:
round(data.describe(), 2)

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.8,638271.81,69819.51,5.42
std,4.04,806253.4,56550.55,0.96
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [32]:
data[['engine', 'engine_unit']] = data['engine'].str.split(expand = True)

data['engine'].head()

0    1248
1    1498
2    1497
3    1396
4    1298
Name: engine, dtype: object

In [33]:
data['engine'] = data['engine'].astype('float32')
data.drop('engine_unit', axis = 1, inplace = True)

In [34]:
# 공백 기준으로 나누기
split_cols = data['max_power'].str.split(n=1, expand=True)

# 첫 번째 컬럼 (숫자 부분)만 살림
data['max_power'] = split_cols[0]

# float32로 변환 (NaN 자동 처리됨)
data['max_power'] = pd.to_numeric(data['max_power'], errors='coerce').astype('float32')


In [38]:
data[['mileage', 'mileage_unit']] = data['mileage'].str.split(expand = True)

In [40]:
data['mileage'] = data['mileage'].astype('float32')

In [41]:
def mile(x):
    if x['fuel'] == 'Petrol':
        return x['mileage'] / 80.43
    elif x['fuel'] == 'Diesel':
        return x['mileage'] / 73.56
    elif x['fuel'] == 'LPG':
        return x['mileage'] / 40.85
    else:
        return x['mileage'] / 44.23

In [42]:
data['mileage'] = data.apply(mile, axis = 1)

In [43]:
data.drop('mileage_unit', axis = 1, inplace = True)

In [44]:
data['torque'] = data['torque'].str.upper()

In [45]:
def torque_unit(x):
    if 'NM' in str(x):
        return 'Nm'
    elif 'KGM' in str(x):
        return 'kgm'

In [47]:
data['torque_unit'] = data['torque'].apply(torque_unit)

In [48]:
data['torque_unit'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
8123    False
8124    False
8125    False
8126    False
8127    False
Name: torque_unit, Length: 8128, dtype: bool

In [49]:
data[data['torque_unit'].isna()]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,torque_unit
13,Maruti Swift 1.3 VXi,2007,200000,80000,Petrol,Individual,Manual,Second Owner,,,,,,
31,Fiat Palio 1.2 ELX,2003,70000,50000,Petrol,Individual,Manual,Second Owner,,,,,,
78,Tata Indica DLS,2003,50000,70000,Diesel,Individual,Manual,First Owner,,,,,,
87,Maruti Swift VDI BSIV W ABS,2015,475000,78000,Diesel,Dealer,Manual,First Owner,,,,,,
119,Maruti Swift VDI BSIV,2010,300000,120000,Diesel,Individual,Manual,Second Owner,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7846,Toyota Qualis Fleet A3,2000,200000,100000,Diesel,Individual,Manual,First Owner,,,,,,
7996,Hyundai Santro LS zipPlus,2000,140000,50000,Petrol,Individual,Manual,Second Owner,,,,,,
8009,Hyundai Santro Xing XS eRLX Euro III,2006,145000,80000,Petrol,Individual,Manual,Second Owner,,,,,,
8068,Ford Figo Aspire Facelift,2017,580000,165000,Diesel,Individual,Manual,First Owner,,,,,,


In [50]:
data[data['torque_unit'].isna()]['torque'].unique()

array([nan, '250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)

In [52]:
data['torque_unit'].fillna({'torque_unit' : 'Nm'}, inplace = True)

In [53]:
def split_num(x):
    x = str(x)
    for i, j in enumerate(x):
        if j not in '0123456789':
            cut = i
            break
    return x[:cut]

In [54]:
data['torque'] = data['torque'].apply(split_num)

In [55]:
data['torque'] = data['torque'].replace('', np.nan)

In [56]:
data['torque'] = data['torque'].astype('float32')

In [57]:
def torque_trans(x):
    if x['torque_unit'] == 'kgm':
        return x['torque'] * 9.8066
    else:
        return x['torque']

In [58]:
data['torque'] = data.apply(torque_trans, axis = 1)
data.drop('torque_unit', axis = 1, inplace = True)

In [59]:
data['name'] = data['name'].str.split(expand = True)[0]

In [60]:
data['name'] = data['name'].replace('Land', 'Land Rover')

In [61]:
data.isna().mean()

name             0.000000
year             0.000000
selling_price    0.000000
km_driven        0.000000
fuel             0.000000
seller_type      0.000000
transmission     0.000000
owner            0.000000
mileage          0.027190
engine           0.027190
max_power        0.026575
torque           0.027313
seats            0.027190
dtype: float64

In [62]:
data.dropna(inplace = True)

In [63]:
data = pd.get_dummies(data, columns = ['name', 'fuel', 'seller_type', 
                                       'transmission', 'owner'], drop_first = True)

In [64]:
from sklearn.model_selection import train_test_split
x = data.drop('selling_price', axis = 1)
y = data['selling_price']
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.2, random_state = 100)

In [65]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state = 100)
model.fit(x_train, y_train)
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

In [67]:
from sklearn.metrics import mean_squared_error

print("train_rmse : ", mean_squared_error(y_train, train_pred) ** 0.5,
     "test_rmse : ", mean_squared_error(y_test, test_pred) ** 0.5)

train_rmse :  53603.59212300673 test_rmse :  131076.0319769309


In [68]:
from sklearn.model_selection import KFold

data.reset_index(drop = True, inplace = True)
kf = KFold(n_splits = 5)
x = data.drop('selling_price', axis = 1)
y = data['selling_price']

train_rmse_total = []
test_rmse_total = []

for train_index, test_index in kf.split(x):
    x_train, x_test = x.loc[train_index], x.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = RandomForestRegressor(random_state = 100)
    model.fit(x_train, y_train)

    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)
    train_rmse = mean_squared_error(y_train, train_pred) ** 0.5