#### 의사결정나무 Regression
- data : house_price.csv
- 주택가격 예측

In [1]:
import warnings 
warnings.filterwarnings("ignore")
import pandas as pd
data2 = pd.read_csv('house_price.csv')
X = data2[data2.columns[1:5]]
y = data2[['house_value']]

In [3]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# stratify 어떨 때 쓰는지?

In [7]:
# 정규화
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [38]:
# 모델적용 DecisionTreeRegressor() - train data
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_scaled_train,y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [10]:
# 모델적용 - test data
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.2170084148444198

train data 는 100%, test data 는 21.70% 의 예측률로 train data 에 심하게 과적합 되어있다.

In [11]:
# RMSE
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("traindata RMSE :", np.sqrt(MSE_train))
print("testdata RMSE:", np.sqrt(MSE_test))

traindata RMSE : 0.0
testdata RMSE: 84594.28445108373


In [13]:
# Grid Search
param_grid = {'max_depth':range(2,20,2),
             'min_samples_leaf':range(1,50,2)}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [14]:
print("Best Parameter:{}".format(grid_search.best_params_))
print("Best Score:{:.4f}".format(grid_search.best_score_))
print("Testset Score:{:.4f}".format(grid_search.score(X_scaled_train,y_train)))

Best Parameter:{'max_depth': 8, 'min_samples_leaf': 49}
Best Score:0.5592
Testset Score:0.6078


In [15]:
# random search
from scipy.stats import randint
param_distribs = {'max_depth': randint(low=1, high=20),
                 'min_samples_leaf': randint(low=1, high=50)}
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(DecisionTreeRegressor(),
                                  param_distributions = param_distribs,
                                  n_iter=20, cv=5)
random_search.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=20,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ffbe4697890>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ffbe4697a50>})

In [16]:
print("Best Parameter:{}".format(random_search.best_params_))
print("Best Score:{:.4f}".format(random_search.best_score_))
print("Testset Score:{:.4f}".format(random_search.score(X_scaled_train,y_train)))

Best Parameter:{'max_depth': 18, 'min_samples_leaf': 47}
Best Score:0.5576
Testset Score:0.6179
