In [1]:
import pandas as pd 
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/apart/seoul_3.csv')
df = df.drop(['gu','dong'], axis=1)
df = df[['gu_l','dong_l','used_y','square','per_cost_man','floor','YN_r','YN_top10','cost']]

In [3]:
X=df.drop('cost',axis=1)
y=df['cost']

X.shape, y.shape

((114142, 8), (114142,))

In [4]:
X.columns

Index(['gu_l', 'dong_l', 'used_y', 'square', 'per_cost_man', 'floor', 'YN_r',
       'YN_top10'],
      dtype='object')

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

In [6]:
X_test.shape,y_test.shape

((34243, 8), (34243,))

In [7]:
model = ExtraTreesRegressor()

In [8]:
model.fit(X_train,y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)

In [9]:
model.score(X_test,y_test)

0.9998936824901747

In [10]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred,y_test)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % mse**(0.5))


MSE: 399304.84
RMSE: 631.91


### 공시지가 포함

In [11]:
url = '/content/drive/MyDrive/Colab Notebooks/apart/dataset.csv'
df1 = pd.read_csv(url)

In [12]:
df1.head()

Unnamed: 0,Gu_Label,Dong_Label,Year,Exclusive_area,Price per squar,top10,YN_r,Transaction_real_price
0,0,8,32,77.75,643.068907,0,1,149500
1,0,8,33,54.98,643.068907,0,1,144000
2,0,8,33,79.97,643.068907,0,1,162750
3,0,8,33,79.97,643.068907,0,1,160000
4,0,8,33,79.97,643.068907,0,1,150000


In [13]:
X=df1.drop('Transaction_real_price',axis=1)
y=df1['Transaction_real_price']

X.shape, y.shape

((114143, 7), (114143,))

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

In [15]:
etr = ExtraTreesRegressor()

In [16]:
etr.fit(X_train,y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)

In [17]:
etr.score(X_test,y_test)

0.9594123202664357

In [18]:
y_pred = etr.predict(X_test)
mse = mean_squared_error(y_pred,y_test)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % mse**(0.5))


MSE: 153823606.65
RMSE: 12402.56


## RandomForestRegressor

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [20]:
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train,y_train)
pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.9559749144952329

In [21]:
mse = mean_squared_error(y_test,pred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % mse**(0.5))

MSE: 166851061.21
RMSE: 12917.08


In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
params={
    'n_estimators':[100],'max_depth':[6,8,10,12], 
    'min_samples_leaf':[8,12,18], 'min_samples_split':[8,16,20]
}
rf = RandomForestRegressor(random_state=42,n_jobs=-1)

grid_cv=GridSearchCV(rf,param_grid=params,cv=2,n_jobs=-1)

grid_cv.fit(X_train,y_train)

print('GridSearchCV 최고의 평균 정확도 : {0:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV 최적의 하이퍼 파라미터 :',grid_cv.best_params_)

GridSearchCV 최고의 평균 정확도 : 0.8989
GridSearchCV 최적의 하이퍼 파라미터 : {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}


In [24]:
rf = RandomForestRegressor(max_depth = 12,min_samples_leaf= 8, 
                           min_samples_split=8, n_estimators= 100,random_state=0)

rf.fit(X_train,y_train)
pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.9124039778226746

## 의사결정나무

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train,y_train)
dt.score(X_test,y_test)

0.9430792985311873

In [27]:
pred = dt.predict(X_test)
mse = mean_squared_error(y_test,pred)

In [28]:
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % mse**(0.5))

MSE: 215724270.29
RMSE: 14687.55
