In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
# 데이터
data = pd.read_csv("C:\Jupyter Notebook\K-digital\Data\kc_house_data.csv")
data.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233


In [None]:
'''
id: 집 고유아이디
date: 집이 팔린 날짜 
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지 (0, 1)
condition: 집 청소상태 (1~5)
grade: King County grading system 으로 인한 평점 (1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도
'''

In [6]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
 
print(nCar, nVar)    

21613 14


In [7]:
## 불필요한 변수 제거
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'],axis=1)
data.head(3)

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated
0,221900.0,3,1.0,1.0,0,3,7,1955,0
1,538000.0,3,2.25,2.0,0,3,7,1951,1991
2,180000.0,2,1.0,1.0,0,3,6,1933,0


In [9]:
list(data.columns.difference(['price']))

['bathrooms',
 'bedrooms',
 'condition',
 'floors',
 'grade',
 'waterfront',
 'yr_built',
 'yr_renovated']

In [10]:
## 타겟변수와 설명변수를 분리, 학습데이터 평가 데이터 분리
feature_columns = list(data.columns.difference(['price']))
X = data[feature_columns]
y = data['price']

In [11]:
## 학습데이터 평가 데이터 분리
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(15129, 8) (6484, 8) (15129,) (6484,)


In [12]:
## 학습 데이터를 선형회귀 모형으로 학습 시킨후 검증 작업(Stats_Models)
import statsmodels.api as sm  ## 모델 패키지
from sklearn.metrics import mean_squared_error, r2_score ## 평가지표 패키지
from math import sqrt

In [13]:
sm_train_x = sm.add_constant(train_x, has_constant='add') # 상수항 결합
sm_train_x.head(3)

Unnamed: 0,const,bathrooms,bedrooms,condition,floors,grade,waterfront,yr_built,yr_renovated
167,1.0,2.5,4,3,2.0,9,0,1999,0
12412,1.0,1.75,4,5,1.0,7,0,1924,0
7691,1.0,2.25,4,3,2.0,8,0,1978,0


In [11]:
sm_model = sm.OLS(train_y, sm_train_x)
fit_sm_model = sm_model.fit()
fit_sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,2776.0
Date:,"Tue, 06 Apr 2021",Prob (F-statistic):,0.0
Time:,09:57:58,Log-Likelihood:,-208260.0
No. Observations:,15129,AIC:,416500.0
Df Residuals:,15120,BIC:,416600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.186e+06,1.73e+05,41.548,0.000,6.85e+06,7.52e+06
bathrooms,1.303e+05,3960.833,32.889,0.000,1.23e+05,1.38e+05
bedrooms,-2224.7910,2382.356,-0.934,0.350,-6894.497,2444.915
condition,1.641e+04,3169.013,5.178,0.000,1.02e+04,2.26e+04
floors,1946.3052,4336.838,0.449,0.654,-6554.422,1.04e+04
grade,1.956e+05,2199.540,88.924,0.000,1.91e+05,2e+05
waterfront,7.555e+05,2.26e+04,33.479,0.000,7.11e+05,8e+05
yr_built,-4300.7865,88.073,-48.832,0.000,-4473.420,-4128.153
yr_renovated,12.7325,5.043,2.525,0.012,2.847,22.618

0,1,2,3
Omnibus:,13447.374,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1684794.827
Skew:,3.763,Prob(JB):,0.0
Kurtosis:,54.147,Cond. No.,182000.0


In [13]:
## 결과 확인
sm_test_x = sm.add_constant(test_x, has_constant='add') # 상수항 결합 - 데이터에 Bias 추가(언더피팅 방지)
sm_model_pred = fit_sm_model.predict(sm_test_x)
print("RMSE : {}".format(sqrt(mean_squared_error(sm_model_pred, test_y))))
print(fit_sm_model.params)      

RMSE : 239804.29670858168
const           7.185671e+06
bathrooms       1.302689e+05
bedrooms       -2.224791e+03
condition       1.641020e+04
floors          1.946305e+03
grade           1.955909e+05
waterfront      7.555423e+05
yr_built       -4.300787e+03
yr_renovated    1.273246e+01
dtype: float64


In [17]:
import random
bagging_predict_result = [] ## 예측 결과 저장

for _ in range(10) :
    data_index = [data_index for data_index in range(train_x.shape[0])] #학습 데이터 index list로 변환
    
    random_data_index = np.random.choice(data_index,train_x.shape[0]) ## 데이터 크기의 1/10 크기만큼 랜덤 샘플링
    print(len(set(random_data_index)))
    
    ## 설명변수 
    sm_train_x = train_x.iloc[random_data_index,] #랜덤 인덱스에 해당하는 데이터 추출
    ## 종속변수
    sm_train_y = train_y.iloc[random_data_index,]
    
    sm_train_x = sm.add_constant(sm_train_x, has_constant='add')
    sm_model = sm.OLS(sm_train_y, sm_train_x)
    fitted_sm_model = sm_model.fit()
    
    sm_test_x = sm.add_constant(test_x, has_constant='add')
    sm_pred = fitted_sm_model.predict(sm_test_x)
    bagging_predict_result.append(sm_pred)


9580
9597
9567
9635
9572
9615
9494
9533
9555
9548


In [21]:
bagging_predict_result[9]

735      5.571880e+05
2830     7.045045e+05
4106     1.106161e+06
16218    1.459546e+06
19964    6.920259e+05
             ...     
12606    5.949422e+05
14393    6.748159e+05
6899     3.251603e+05
85       9.011503e+05
21363    4.316789e+05
Length: 6484, dtype: float64

In [23]:
bagging_predict_result[0]

735      5.639596e+05
2830     7.064786e+05
4106     1.124459e+06
16218    1.476142e+06
19964    7.022239e+05
             ...     
12606    6.013197e+05
14393    6.692597e+05
6899     3.297236e+05
85       9.087733e+05
21363    4.378337e+05
Length: 6484, dtype: float64

In [22]:
# Bagging을 바탕으로 예측한 결과값에 대한 평균을 계산
bagging_predict = [] # 빈 리스트 생성
for lst2_index in range(test_x.shape[0]): # 테스트 데이터 개수만큼의 반복
    temp_predict = [] # 임시 빈 리스트 생성 (반복문 내 결과값 저장)
    for lst_index in range(len(bagging_predict_result)): # Bagging 결과 리스트 반복
        temp_predict.append(bagging_predict_result[lst_index].values[lst2_index]) # 각 Bagging 결과 예측한 값 중 같은 인덱스를 리스트에 저장
    bagging_predict.append(np.mean(temp_predict)) # 해당 인덱스의 30개의 결과값에 대한 평균을 최종 리스트에 추가

In [24]:
bagging_predict

[561841.7312974599,
 709643.6883683864,
 1118914.2021584571,
 1479053.5763838298,
 699665.1506219786,
 385614.1174798637,
 786167.2424955529,
 484274.70540393284,
 499439.33985118626,
 539100.9717485515,
 642321.6532550593,
 410705.7317183506,
 267781.940828147,
 279446.04927100276,
 337178.88530747266,
 1265406.028002413,
 319313.1319489955,
 1036307.1246178333,
 258889.7714452995,
 602829.1426278189,
 390247.48819666356,
 1304940.987621196,
 823405.925261187,
 582659.0092454322,
 597731.7648405416,
 570648.4930178841,
 263307.3997256319,
 43482.81709757438,
 563997.5733629244,
 642008.3726281069,
 566304.1418719136,
 458802.3015891391,
 553161.1942499613,
 689042.8036952345,
 409254.2515119859,
 875091.8358952019,
 940615.6785461393,
 638456.9573855891,
 391642.613768059,
 1083054.073760858,
 454732.2997565408,
 148257.45378976222,
 487486.14976381883,
 219521.1864760642,
 63081.58895483902,
 -46167.588839864264,
 247593.35054547788,
 283337.09832033375,
 363210.6927228499,
 720305.5

In [25]:
## 실제 y 값과 비교하여 성능 평가
print("RMSE : {}".format(sqrt(mean_squared_error(bagging_predict, test_y))))

RMSE : 239959.20797465957


In [None]:
### scikit-learn 패키지로 확인### scikit-learn 패키지로 확인

In [26]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression() # 선형 회귀 모형
linear_model1 = regression_model.fit(train_x, train_y) # 학습 데이터를 선형 회귀 모형에 적합
predict1 = linear_model1.predict(test_x) # 학습된 선형 회귀 모형으로 평가 데이터 예측
print("RMSE: {}".format(sqrt(mean_squared_error(predict1, test_y)))) # RMSE 결과

RMSE: 239804.29670858145


In [31]:
### scikit-learn bagging 패키지
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator = regression_model,
                                 n_estimators = 50,
                                 verbose = 1)
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x) # 학습된 선형 회귀 모형으로 평가 데이터 예측
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) # RMSE 결과

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RMSE: 239747.09280391195


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [15]:
## 학습데이터를 의사결정 나무 모형에 적합한 후 평가데이터로 검증
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor() # 모델생성
tree_model1 = decision_tree_model.fit(train_x,train_y) # 학습된모델
predict1 = tree_model1.predict(test_x)# 예측결과
print('RMSE: {}'.format(sqrt(mean_squared_error(predict1,test_y))))

RMSE: 299608.75754810125


In [20]:
### scikit-learn bagging 패키지
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator = decision_tree_model,
                                 n_estimators = 50,
                                 verbose = 1)
linear_model2 = bagging_model.fit(train_x, train_y)
predict2 = linear_model2.predict(test_x) # 학습된 선형 회귀 모형으로 평가 데이터 예측
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) # RMSE 결과

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RMSE: 233118.91221558672


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
