### 목표
- DecisionTree모델 사용하기
- 모델 최적화를 해보자

#### 1. 모델정의

In [802]:
# 예측하기 좋은 최적의 질문을 만들어 학습하는 모델
from sklearn.tree import DecisionTreeRegressor

In [803]:
house_model = DecisionTreeRegressor()

#### 2. 모델학습
- 데이터 로딩 후 탐색

In [804]:
import pandas as pd

In [805]:
# 데이터 로딩(훈련용 데이터, 평가용데이터 )
test=pd.read_csv("./data/house/test.csv")
train=pd.read_csv("./data/house/train.csv")

In [806]:
train.head()

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price
0,5467,Rosanna,22 Douglas St,2,h,S,Miles,19/11/2016,11.4,3084,...,1.0,757,,,Banyule,-37.7428,145.07,Eastern Metropolitan,3540,1200000
1,4365,North Melbourne,103/25 Byron St,1,u,SP,Jellis,16/07/2016,2.3,3051,...,1.0,0,60.0,2012.0,Melbourne,-37.802,144.9516,Northern Metropolitan,6821,450000
2,9741,Surrey Hills,4/40 Durham Rd,3,u,SP,Noel,17/06/2017,10.2,3127,...,1.0,149,,,Boroondara,-37.82971,145.09007,Southern Metropolitan,5457,780000
3,11945,Cheltenham,3/33 Sunray Av,2,t,S,Buxton,29/07/2017,17.9,3192,...,1.0,171,,,Kingston,-37.96304,145.06421,Southern Metropolitan,9758,751000
4,4038,Mont Albert,7/27 High St,3,t,S,Fletchers,15/10/2016,11.8,3127,...,2.0,330,148.0,2001.0,Whitehorse,-37.8167,145.107,Eastern Metropolitan,2079,1310000


In [807]:
# 전체 데이터 갯수 파악
print('평가용 데이터:',test.shape)
print('훈련용 데이터:',train.shape)


평가용 데이터: (3395, 21)
훈련용 데이터: (10185, 22)


In [808]:
# 전체 컬럼 무엇이 있는지 파악
# 1. 수치형(키, 몸무게) - 일반적으로 숫자형태, 데이터의 범위가 지정x
# 2. 범주형(성별, 혈액형) - 정해진 종류에서 등장
# 기타. 비정형 텍스트(주소..)
train.info()
#누락된 데이터→결측치

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10185 entries, 0 to 10184
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10185 non-null  int64  
 1   Suburb         10185 non-null  object 
 2   Address        10185 non-null  object 
 3   Rooms          10185 non-null  int64  
 4   Type           10185 non-null  object 
 5   Method         10185 non-null  object 
 6   SellerG        10185 non-null  object 
 7   Date           10185 non-null  object 
 8   Distance       10185 non-null  float64
 9   Postcode       10185 non-null  int64  
 10  Bedroom2       10185 non-null  int64  
 11  Bathroom       10185 non-null  int64  
 12  Car            10142 non-null  float64
 13  Landsize       10185 non-null  int64  
 14  BuildingArea   5367 non-null   float64
 15  YearBuilt      6153 non-null   float64
 16  CouncilArea    9174 non-null   object 
 17  Lattitude      10185 non-null  float64
 18  Longti

In [809]:
# 기술통계 확인(최대값, 최소값, 최빈값....)
train.describe(include='all')

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price
count,10185.0,10185,10185,10185.0,10185,10185,10185,10185,10185.0,10185.0,...,10142.0,10185.0,5367.0,6153.0,9174,10185.0,10185.0,10185,10185.0,10185.0
unique,,310,10066,,3,5,243,58,,,...,,,,,33,,,8,,
top,,Reservoir,2 Bruce St,,h,S,Nelson,27/05/2017,,,...,,,,,Moreland,,,Southern Metropolitan,,
freq,,261,3,,7106,6753,1156,364,,,...,,,,,887,,,3525,,
mean,6802.613942,,,2.94325,,,,,10.198213,3105.172607,...,1.613883,573.426411,154.137372,1964.904599,,-37.809763,144.995347,,7447.172018,1077961.0
std,3926.7021,,,0.952794,,,,,5.86664,90.19874,...,0.959076,4550.75718,614.71188,37.603561,,0.079922,0.104255,,4354.473015,636430.1
min,3.0,,,1.0,,,,,0.0,3000.0,...,0.0,0.0,0.0,1196.0,,-38.18255,144.43181,,249.0,131000.0
25%,3384.0,,,2.0,,,,,6.2,3044.0,...,1.0,178.0,93.92,1940.0,,-37.8577,144.9295,,4380.0,650000.0
50%,6838.0,,,3.0,,,,,9.3,3084.0,...,2.0,448.0,127.0,1970.0,,-37.8029,145.00013,,6543.0,905000.0
75%,10223.0,,,4.0,,,,,13.0,3149.0,...,2.0,652.0,175.0,2000.0,,-37.75671,145.05928,,10331.0,1330000.0


In [810]:
# 문제와 정답 추출
# 결측치가 존재하는 컬럼과 문자형태의 컬럼은 제외
X_train= train[['Propertycount','Postcode','Rooms']]
y_train= train['Price']

In [811]:
# train -> 7:3 비율로 train2, validation
from sklearn.model_selection import train_test_split
X_train2,X_val,y_train2,y_val=train_test_split(X_train,y_train,test_size=0.3, random_state=3)

In [812]:
X_train2.shape,y_train2.shape

((7129, 3), (7129,))

In [813]:
X_val.shape,y_val.shape

((3056, 3), (3056,))

In [814]:
house_model.fit(X_train2,y_train2)

DecisionTreeRegressor()

#### 3. 모델 예측

In [815]:
pre= house_model.predict(X_val)
pre

array([ 458521.73913043, 1369925.        ,  682324.32432432, ...,
        839687.5       , 2132500.        ,  881427.27272727])

#### 4. 모델평가
- MAE(평균절대값오차)

In [816]:
from sklearn.metrics import mean_absolute_error

In [817]:
# error= mean_absolute_error(실제값, 예측값)
error= mean_absolute_error(y_val, pre)
print('평균절대값오차:',error)

평균절대값오차: 248623.45411532142


#### 캐글에 업로드하기

In [818]:
X_test= test[['Propertycount','Postcode','Rooms']]
X_test.shape

(3395, 3)

In [819]:
test_pre=house_model.predict(X_test)
test_pre

array([ 388350.        ,  522500.        ,  700000.        , ...,
        710500.        ,  651357.14285714, 1009088.23529412])

In [820]:
# 정답지 파일 로딩
submission = pd.read_csv("./data/house/sample_submission.csv")
submission

Unnamed: 0,Id,Price
0,3189,0
1,2539,0
2,9171,0
3,4741,0
4,12455,0
...,...,...
3390,12276,0
3391,4618,0
3392,12913,0
3393,11741,0


In [821]:
test.head()

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,3189,Hawthorn,22/9 Lisson Gr,1,u,S,Biggin,19/11/2016,4.6,3122,...,1,1.0,0,52.0,1970.0,Boroondara,-37.8261,145.0269,Southern Metropolitan,11308
1,2539,Fitzroy,113/300 Young St,1,u,SP,Jellis,19/11/2016,1.6,3065,...,1,1.0,0,52.0,2011.0,Yarra,-37.7974,144.9799,Northern Metropolitan,5825
2,9171,Greenvale,7 Murray Ct,5,h,S,Barry,3/06/2017,20.4,3059,...,3,5.0,1750,310.0,1990.0,Hume,-37.65439,144.89113,Northern Metropolitan,4864
3,4741,Port Melbourne,172 Albert St,2,h,S,hockingstuart,10/12/2016,3.8,3207,...,1,0.0,106,70.0,1910.0,Port Phillip,-37.8346,144.9373,Southern Metropolitan,8648
4,12455,Brunswick West,47 Everett St,4,h,VB,Nelson,9/09/2017,5.2,3055,...,2,2.0,600,180.0,2004.0,,-37.75465,144.94144,Northern Metropolitan,7082


In [822]:
submission['Price'] = test_pre
submission

Unnamed: 0,Id,Price
0,3189,3.883500e+05
1,2539,5.225000e+05
2,9171,7.000000e+05
3,4741,9.590946e+05
4,12455,1.293500e+06
...,...,...
3390,12276,7.450000e+05
3391,4618,6.337656e+05
3392,12913,7.105000e+05
3393,11741,6.513571e+05


In [823]:
# csv파일로 저장 index=False -> 이걸 안하면 인덱스도 컬럼으로 저장된다
submission.to_csv("./data/house/myPrediction.csv", index=False)

#### 다른 컬럼을 이용해보자.
1. 결측치가 있는 컬럼
   - 데이터를 버린다 -> drop, dropna(결측치가 있는걸 삭제)
   - 데이터를 채운다 -> fillna
        - 기술통계를 활용
        - 모델 활용 -> 결측치를 정답, 주변컬럼을 문제로 설정
2. 문자형태의 컬럼
   - 문자타입 -> 숫자타입 변경(인코딩)
        - 라벨인코딩 -> 임의의 숫자를 글자에 부여
        - 원핫 인코딩 -> 0과 1을 이용해서 변환
        

In [824]:
train.columns

Index(['Id', 'Suburb', 'Address', 'Rooms', 'Type', 'Method', 'SellerG', 'Date',
       'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude',
       'Regionname', 'Propertycount', 'Price'],
      dtype='object')

In [825]:
# 원핫인코딩 함수
type_onehot=pd.get_dummies(train['Type'])
type_onehot

Unnamed: 0,h,t,u
0,1,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
10180,1,0,0
10181,1,0,0
10182,1,0,0
10183,1,0,0


In [826]:
type_onehot_test=pd.get_dummies(test['Type'])
type_onehot_test

Unnamed: 0,h,t,u
0,0,0,1
1,0,0,1
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
3390,1,0,0
3391,1,0,0
3392,0,1,0
3393,1,0,0


In [827]:
train = pd.concat([train,type_onehot], axis=1)
train

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price,h,t,u
0,5467,Rosanna,22 Douglas St,2,h,S,Miles,19/11/2016,11.4,3084,...,,Banyule,-37.74280,145.07000,Eastern Metropolitan,3540,1200000,1,0,0
1,4365,North Melbourne,103/25 Byron St,1,u,SP,Jellis,16/07/2016,2.3,3051,...,2012.0,Melbourne,-37.80200,144.95160,Northern Metropolitan,6821,450000,0,0,1
2,9741,Surrey Hills,4/40 Durham Rd,3,u,SP,Noel,17/06/2017,10.2,3127,...,,Boroondara,-37.82971,145.09007,Southern Metropolitan,5457,780000,0,0,1
3,11945,Cheltenham,3/33 Sunray Av,2,t,S,Buxton,29/07/2017,17.9,3192,...,,Kingston,-37.96304,145.06421,Southern Metropolitan,9758,751000,0,1,0
4,4038,Mont Albert,7/27 High St,3,t,S,Fletchers,15/10/2016,11.8,3127,...,2001.0,Whitehorse,-37.81670,145.10700,Eastern Metropolitan,2079,1310000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10180,4865,Prahran,84 York St,2,h,S,Jellis,27/11/2016,4.5,3181,...,1900.0,Stonnington,-37.84790,144.99880,Southern Metropolitan,7717,1305000,1,0,0
10181,4426,Northcote,7 Prospect Gr,4,h,PI,Woodards,12/11/2016,5.5,3070,...,,Darebin,-37.77360,145.00040,Northern Metropolitan,11364,2100000,1,0,0
10182,9870,Essendon,22 Lyon St,2,h,S,Nelson,24/06/2017,7.5,3040,...,1937.0,Moonee Valley,-37.75225,144.90462,Western Metropolitan,9264,1200000,1,0,0
10183,1775,Carnegie,68 Leila Rd,4,h,PI,Ray,25/02/2017,11.4,3163,...,,Glen Eira,-37.90110,145.05100,Southern Metropolitan,7822,1410000,1,0,0


In [828]:
test= pd.concat([test,type_onehot_test], axis=1)
test

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,h,t,u
0,3189,Hawthorn,22/9 Lisson Gr,1,u,S,Biggin,19/11/2016,4.6,3122,...,52.0,1970.0,Boroondara,-37.82610,145.02690,Southern Metropolitan,11308,0,0,1
1,2539,Fitzroy,113/300 Young St,1,u,SP,Jellis,19/11/2016,1.6,3065,...,52.0,2011.0,Yarra,-37.79740,144.97990,Northern Metropolitan,5825,0,0,1
2,9171,Greenvale,7 Murray Ct,5,h,S,Barry,3/06/2017,20.4,3059,...,310.0,1990.0,Hume,-37.65439,144.89113,Northern Metropolitan,4864,1,0,0
3,4741,Port Melbourne,172 Albert St,2,h,S,hockingstuart,10/12/2016,3.8,3207,...,70.0,1910.0,Port Phillip,-37.83460,144.93730,Southern Metropolitan,8648,1,0,0
4,12455,Brunswick West,47 Everett St,4,h,VB,Nelson,9/09/2017,5.2,3055,...,180.0,2004.0,,-37.75465,144.94144,Northern Metropolitan,7082,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,12276,Diamond Creek,17 Wentworth Cl,4,h,SP,Barry,3/09/2017,21.8,3089,...,235.0,1995.0,,-37.68001,145.16000,Northern Victoria,4258,1,0,0
3391,4618,Pascoe Vale,1/23 Stewart St,2,h,S,hockingstuart,7/05/2016,9.9,3044,...,,2000.0,Moreland,-37.72860,144.92490,Northern Metropolitan,7485,1,0,0
3392,12913,Briar Hill,4/129 Mountain View Rd,2,t,S,Barry,19/08/2017,16.1,3088,...,,,,-37.70835,145.11795,Eastern Metropolitan,1390,0,1,0
3393,11741,Mill Park,4 Streeton Cct,3,h,S,Ray,22/07/2017,17.9,3082,...,129.0,1980.0,Whittlesea,-37.66916,145.06912,Northern Metropolitan,10529,1,0,0


In [829]:
# 라벨인코딩
method_dict = {
    'S':0,
    'SP':1,
    'PI':2,
    'PN':3,
    'SN':4,
    'NB':5,
    'VB':6,
    'W':7,
    'SA':8,
    'SS':9
}
method_label=train['Method'].map(method_dict)
method_label

0        0
1        1
2        1
3        0
4        0
        ..
10180    0
10181    2
10182    0
10183    2
10184    0
Name: Method, Length: 10185, dtype: int64

In [830]:
train['Method_label'] = method_label
train

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price,h,t,u,Method_label
0,5467,Rosanna,22 Douglas St,2,h,S,Miles,19/11/2016,11.4,3084,...,Banyule,-37.74280,145.07000,Eastern Metropolitan,3540,1200000,1,0,0,0
1,4365,North Melbourne,103/25 Byron St,1,u,SP,Jellis,16/07/2016,2.3,3051,...,Melbourne,-37.80200,144.95160,Northern Metropolitan,6821,450000,0,0,1,1
2,9741,Surrey Hills,4/40 Durham Rd,3,u,SP,Noel,17/06/2017,10.2,3127,...,Boroondara,-37.82971,145.09007,Southern Metropolitan,5457,780000,0,0,1,1
3,11945,Cheltenham,3/33 Sunray Av,2,t,S,Buxton,29/07/2017,17.9,3192,...,Kingston,-37.96304,145.06421,Southern Metropolitan,9758,751000,0,1,0,0
4,4038,Mont Albert,7/27 High St,3,t,S,Fletchers,15/10/2016,11.8,3127,...,Whitehorse,-37.81670,145.10700,Eastern Metropolitan,2079,1310000,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10180,4865,Prahran,84 York St,2,h,S,Jellis,27/11/2016,4.5,3181,...,Stonnington,-37.84790,144.99880,Southern Metropolitan,7717,1305000,1,0,0,0
10181,4426,Northcote,7 Prospect Gr,4,h,PI,Woodards,12/11/2016,5.5,3070,...,Darebin,-37.77360,145.00040,Northern Metropolitan,11364,2100000,1,0,0,2
10182,9870,Essendon,22 Lyon St,2,h,S,Nelson,24/06/2017,7.5,3040,...,Moonee Valley,-37.75225,144.90462,Western Metropolitan,9264,1200000,1,0,0,0
10183,1775,Carnegie,68 Leila Rd,4,h,PI,Ray,25/02/2017,11.4,3163,...,Glen Eira,-37.90110,145.05100,Southern Metropolitan,7822,1410000,1,0,0,2


#### 결측치와 문자형태를 처리해서 다양한 컬럼으로 학습해보자.
1. train, test데이터에 원하는 결츨치, 인코딩 처리 실시
2. 원하는 컬럼 선택
3. train을 train2와 val로 분리
4. 모델 학습 후 평가
5. test데이터를 예측해 kaggle에 업로드

In [831]:
# 원핫인코딩 함수
type_onehot=pd.get_dummies(train['Regionname'])
type_onehot

Unnamed: 0,Eastern Metropolitan,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
10180,0,0,0,0,0,1,0,0
10181,0,0,1,0,0,0,0,0
10182,0,0,0,0,0,0,1,0
10183,0,0,0,0,0,1,0,0


In [832]:
type_onehot_test=pd.get_dummies(test['Regionname'])
type_onehot_test

Unnamed: 0,Eastern Metropolitan,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
3390,0,0,0,1,0,0,0,0
3391,0,0,1,0,0,0,0,0
3392,1,0,0,0,0,0,0,0
3393,0,0,1,0,0,0,0,0


In [833]:
train = pd.concat([train,type_onehot], axis=1)
train.shape

(10185, 34)

In [834]:
test= pd.concat([test,type_onehot_test], axis=1)
test

Unnamed: 0,Id,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,t,u,Eastern Metropolitan,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,3189,Hawthorn,22/9 Lisson Gr,1,u,S,Biggin,19/11/2016,4.6,3122,...,0,1,0,0,0,0,0,1,0,0
1,2539,Fitzroy,113/300 Young St,1,u,SP,Jellis,19/11/2016,1.6,3065,...,0,1,0,0,1,0,0,0,0,0
2,9171,Greenvale,7 Murray Ct,5,h,S,Barry,3/06/2017,20.4,3059,...,0,0,0,0,1,0,0,0,0,0
3,4741,Port Melbourne,172 Albert St,2,h,S,hockingstuart,10/12/2016,3.8,3207,...,0,0,0,0,0,0,0,1,0,0
4,12455,Brunswick West,47 Everett St,4,h,VB,Nelson,9/09/2017,5.2,3055,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3390,12276,Diamond Creek,17 Wentworth Cl,4,h,SP,Barry,3/09/2017,21.8,3089,...,0,0,0,0,0,1,0,0,0,0
3391,4618,Pascoe Vale,1/23 Stewart St,2,h,S,hockingstuart,7/05/2016,9.9,3044,...,0,0,0,0,1,0,0,0,0,0
3392,12913,Briar Hill,4/129 Mountain View Rd,2,t,S,Barry,19/08/2017,16.1,3088,...,1,0,1,0,0,0,0,0,0,0
3393,11741,Mill Park,4 Streeton Cct,3,h,S,Ray,22/07/2017,17.9,3082,...,0,0,0,0,1,0,0,0,0,0


In [835]:
# Car 결측치 채우기 -> 평균값 or 중앙값(정수로 바꾸기)
train['Car'].fillna(train['Car'].median(), inplace=True)

In [836]:
test['Car'].fillna(test['Car'].median(), inplace=True)

In [837]:
X_train= train[['Southern Metropolitan','h','u','Rooms','Postcode','Northern Metropolitan','Bathroom']]
y_train = train['Price']

In [838]:
X_train

Unnamed: 0,Southern Metropolitan,h,u,Rooms,Postcode,Northern Metropolitan,Bathroom
0,0,1,0,2,3084,0,1
1,0,0,1,1,3051,1,1
2,1,0,1,3,3127,0,1
3,1,0,0,2,3192,0,1
4,0,0,0,3,3127,0,2
...,...,...,...,...,...,...,...
10180,1,1,0,2,3181,0,1
10181,0,1,0,4,3070,1,1
10182,0,1,0,2,3040,0,1
10183,1,1,0,4,3163,0,3


In [839]:
X_train2,X_val,y_train2,y_val=train_test_split(X_train,y_train,test_size=0.3, random_state=3)

In [840]:
X_train2.shape

(7129, 7)

In [841]:
house_model2 = DecisionTreeRegressor()

In [842]:
house_model2.fit(X_train2,y_train2)

DecisionTreeRegressor()

In [843]:
pre= house_model2.predict(X_val)
pre

array([ 464818.18181818,  674000.        ,  576840.        , ...,
        650233.33333333, 1912727.27272727,  923318.18181818])

In [844]:
error= mean_absolute_error(y_val, pre)
print('평균절대값오차:',error)

평균절대값오차: 217684.3662999033


In [845]:
X_test1= test[['Southern Metropolitan','h','u','Rooms','Postcode','Northern Metropolitan','Bathroom']]
X_test1.shape

(3395, 7)

In [846]:
test_pre1=house_model2.predict(X_test1)
test_pre1

array([ 388350.,  522500.,  900000., ...,  586500.,  701125., 1200000.])

In [847]:
submission['Price'] = test_pre1
submission

Unnamed: 0,Id,Price
0,3189,3.883500e+05
1,2539,5.225000e+05
2,9171,9.000000e+05
3,4741,1.133577e+06
4,12455,1.349500e+06
...,...,...
3390,12276,7.450000e+05
3391,4618,7.892857e+05
3392,12913,5.865000e+05
3393,11741,7.011250e+05


In [848]:
submission.to_csv("./data/house/myPrediction10.csv", index=False)

In [849]:
X_train = train[['Rooms', 'Postcode', 'Propertycount','h','t','u']]
y_train = train['Price']
X_train.shape, y_train.shape
#검증데이터 분리
X_train3,X_val2,y_train3,y_val2 = train_test_split(X_train,y_train,test_size=0.2,random_state=916)

In [850]:
X_train.shape, y_train.shape

((10185, 6), (10185,))

In [851]:
#검증데이터 분리
X_train3,X_val2,y_train3,y_val2 = train_test_split(X_train,y_train,test_size=0.2,random_state=916)

In [852]:
#모델정의
house_model2 = DecisionTreeRegressor()
#모델학습
house_model2.fit(X_train3,y_train3)
#모델예측
pre = house_model2.predict(X_val2)
#모델평가
mean_absolute_error(y_val2,pre)

215957.51873238373

In [853]:
X_test = test[['Rooms', 'Postcode', 'Propertycount','h','t','u']]
pre2 = house_model2.predict(X_test)
submisstion['Price'] = pre2
submisstion.to_csv('./data/house/myPrediction3.csv', index=False)

NameError: name 'submisstion' is not defined

#### 모델최적화
- 모델복잡도 제어하기(하이퍼파라미터 변경)
- KNN은 이웃의 숫자로 모델의 복잡도 제어
     - 이웃의 숫자가 커질수록 단순해지고
     - 이웃의 숫자가 적어질수록 복잡해진다
- DecisionTree는 질문의 깊이로 모델의 복잡도를 제어(max_depth)
    - 깊이가 얕으면 단순해지고
    - 깊이가 깊으면 복잡해진다

In [856]:
train_score_list=[] #훈련데이터 평가 점수가 담긴다.
for d in range(1,30): #나무의 깊이를 최대 1~29까지 설정
    m = DecisionTreeRegressor(max_depth=d) # 최대깊이가 다른 모델을 계속 생성
    m.fit(X_train2,y_train2)
    pre_train = m.predict(X_train2) #훈련데이터 예측
    # 점수계산
    score_train=mean_absolute_error(y_train,pre_train)
    train_score_list.append(pre_train)

ValueError: Found input variables with inconsistent numbers of samples: [10185, 7129]

In [None]:
# 시각화 
import matplotlib.pyplot as plt
plt.figure(figsize=(10,5)) # 그림의 크기 설정
plt.plot(range(1,30),train_score_list) # 선그래프 그리기
plt.show() # 그림 보여주기