### 1. 목표 - 현대차 가격 예측하기

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor

### 2. 데이터 수집

In [3]:
train_df = pd.read_excel('현대차_가격.xlsx', sheet_name='train')
test_df = pd.read_excel('현대차_가격.xlsx', sheet_name='test')

In [4]:
train_df

Unnamed: 0,가격,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,1885,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2190,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,1135,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,1645,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,1960,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...,...
66,3802,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,1270,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2430,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2870,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


In [5]:
x_train = train_df.drop(['가격'], axis=1)
x_test = test_df.drop(['가격'],axis=1)
y_train = train_df['가격']
y_test = test_df['가격']

In [6]:
x_train

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,준중형,11.8,172,21.0,가솔린,0,1999,1300,자동
1,2015,준중형,12.3,204,27.0,가솔린,0,1591,1300,자동
2,2015,소형,15.0,100,13.6,가솔린,0,1368,1035,수동
3,2014,소형,14.0,140,17.0,가솔린,0,1591,1090,자동
4,2015,대형,9.6,175,46.0,디젤,0,2497,1990,자동
...,...,...,...,...,...,...,...,...,...,...
66,2015,중형,8.5,290,34.8,가솔린,0,3342,1901,자동
67,2012,소형,13.3,108,13.9,가솔린,0,1396,1040,자동
68,2015,준중형,12.8,186,41.0,디젤,0,1995,1665,자동
69,2015,중형,17.7,156,19.3,가솔린,1,1999,1585,자동


In [7]:
x_test

Unnamed: 0,년식,종류,연비,마력,토크,연료,하이브리드,배기량,중량,변속기
0,2015,대형,6.8,159,23.0,LPG,0,2359,1935,수동
1,2012,소형,13.3,108,13.9,가솔린,0,1396,1035,자동
2,2015,중형,14.4,184,41.0,디젤,0,1995,1792,자동
3,2015,대형,10.9,175,46.0,디젤,0,2497,2210,수동
4,2015,대형,6.4,159,23.0,LPG,0,2359,1935,자동
5,2015,소형,18.0,136,30.6,디젤,0,1582,1160,자동
6,2015,준중형,13.9,184,41.0,디젤,0,1995,1611,수동
7,2015,대형,8.9,133,26.5,디젤,0,2497,1696,수동
8,2015,준중형,12.5,184,41.0,디젤,0,1995,1611,자동
9,2015,준중형,12.8,215,21.3,가솔린,0,1999,1216,수동


In [8]:
y_train

0     1885
1     2190
2     1135
3     1645
4     1960
      ... 
66    3802
67    1270
68    2430
69    2870
70    3254
Name: 가격, Length: 71, dtype: int64

In [9]:
y_test

0      1915
1      1164
2      2817
3      2160
4      1915
5      1560
6      2260
7      1430
8      2260
9      1690
10     2255
11     3590
12     1445
13     1610
14     4897
15     1445
16     2080
17     1135
18     1111
19     2190
20     1111
21     1845
22    14570
23     1890
24     1690
25     1410
26     6910
27     2545
28     1960
29      870
30     2879
Name: 가격, dtype: int64

In [10]:
transformer = make_column_transformer((OneHotEncoder(),['종류','연료','변속기']))

In [11]:
transformer.fit(x_train)

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 ['종류', '연료', '변속기'])])

In [12]:
x_train = transformer.transform(x_train)
x_test = transformer.transform(x_test)

### 모델 생성

##### randomforest

In [19]:
rf = RandomForestRegressor(max_depth=7)
#모델 학습
rf.fit(x_train, y_train)
#정확도
rf.score(x_train, y_train)

0.6018618365961833

In [20]:
rf.score(x_test, y_test)

0.4993132944745924

In [22]:
new_test = [[2015, '중형', 6.7, 160, 24, 'LPG', 0, 2500, 2000, '수동']]
new_test = pd.DataFrame(new_test, columns = ['년식','종류','연비','마력','토크','연료','하이브리드','배기량','중량','변속기'])

In [23]:
new_test = transformer.transform(new_test)

In [24]:
y_predict = rf.predict(new_test)

In [25]:
print(y_predict)

[2817.11688095]


##### ada boost

In [51]:
from sklearn.ensemble import AdaBoostRegressor

In [52]:
# 모델 생성
ada = AdaBoostRegressor()
# 모델 학습
ada.fit(x_train, y_train)
# 모델 스코어 - 학습데이터, 데스트데이터
ada.score(x_train, y_train)

0.5793824347195714

In [53]:
ada.score(x_test, y_test)

0.5347676811084663

In [54]:
# new_test 예측
ada.predict(new_test) #randomforest 예측값과 크게 차이나지 않음

array([3091.])

##### gradient boosting

In [56]:
from sklearn.ensemble import GradientBoostingRegressor

In [57]:
gb = GradientBoostingRegressor()

In [58]:
gb.fit(x_train, y_train)

GradientBoostingRegressor()

In [59]:
gb.score(x_train, y_train)

0.6032349753137957

In [60]:
gb.score(x_test, y_test)

0.4849660422367216

In [61]:
gb.predict(new_test) #위 모델들의 예측값과 비슷

array([2992.40880391])

##### xgboost

In [62]:
import xgboost as xgb

In [63]:
xgb = xgb.XGBRegressor()

In [65]:
xgb.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [66]:
xgb.score(x_train, y_train)

0.6032527591199182

In [67]:
xgb.score(x_test, y_test)

0.4850809005658747

In [72]:
xgb.predict(new_test) #위 모델들의 예측값과 비슷

array([2687.1926], dtype=float32)

##### lightGBM

In [64]:
from lightgbm import LGBMRegressor

In [68]:
lgb = LGBMRegressor()

In [69]:
lgb.fit(x_train, y_train)

LGBMRegressor()

In [70]:
lgb.score(x_train, y_train)

0.26803621675337663

In [71]:
lgb.score(x_test, y_test)

0.19121671328898493

In [73]:
lgb.predict(new_test) #위 모델들의 예측값과 비슷하지 않음

array([1195.89195875])