#### 랜덤포레스트 classification
- data : 
- 암 환자 예측

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data1=pd.read_csv('breast-cancer-wisconsin.csv', encoding='utf-8')
X=data1[data1.columns[1:10]]
y=data1[["Class"]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, stratify=y, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_scaled_train=scaler.transform(X_train)
X_scaled_test=scaler.transform(X_test)

In [3]:
# 모델적용 - traindata
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [4]:
# 오차행렬 - traindata
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(y_train, pred_train)
print("오차행렬 train data \n", confusion_train)

오차행렬 train data 
 [[333   0]
 [  0 179]]


In [5]:
# 분류에측 레포트 - train data
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print("분류예측레포트 train data \n", cfreport_train)

분류예측레포트 train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [6]:
# 모델적용 - test data
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9707602339181286

In [8]:
# 오차행렬 - test data
confusion_test = confusion_matrix(y_test, pred_test)
print("오차행렬 test data\n", confusion_test)

# 분류에측 레포트 - test data
cfreport_test = classification_report(y_test, pred_test)
print("\n분류예측레포트 test data\n", cfreport_test)

오차행렬 test data
 [[106   5]
 [  0  60]]

분류예측레포트 test data
               precision    recall  f1-score   support

           0       1.00      0.95      0.98       111
           1       0.92      1.00      0.96        60

    accuracy                           0.97       171
   macro avg       0.96      0.98      0.97       171
weighted avg       0.97      0.97      0.97       171



In [14]:
# Grid Search
param_grid={'n_estimators': range(100, 1000, 100), 
            'max_features': ['auto', 'sqrt', 'log2']}
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(100, 1000, 100)})

In [15]:
print("Best Parameter: {}".format(grid_search.best_params_))
print("Best Score: {:.4f}".format(grid_search.best_score_))
print("TestSet Score: {:.4f}".format(grid_search.score(X_scaled_test, y_test)))

Best Parameter: {'max_features': 'auto', 'n_estimators': 300}
Best Score: 0.9765
TestSet Score: 0.9649


In [16]:
# Random Search
from scipy.stats import randint
param_distribs = {'n_estimators': randint(low=100, high=1000), 
                  'max_features': ['auto', 'sqrt', 'log2']}
from sklearn.model_selection import RandomizedSearchCV
random_search=RandomizedSearchCV(RandomForestClassifier(), 
                                 param_distributions=param_distribs, n_iter=20, cv=5)
random_search.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fea7dfd1a10>})

In [17]:
print("Best Parameter: {}".format(random_search.best_params_))
print("Best Score: {:.4f}".format(random_search.best_score_))
print("TestSet Score: {:.4f}".format(random_search.score(X_scaled_test, y_test)))

Best Parameter: {'max_features': 'sqrt', 'n_estimators': 174}
Best Score: 0.9746
TestSet Score: 0.9708


#### 회귀 Regressor
- data : house_price.csv
- 주택가격예측

In [18]:
data2=pd.read_csv('house_price.csv', encoding='utf-8')
X=data2[data2.columns[1:5]]
y=data2[["house_value"]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_scaled_train=scaler.transform(X_train)
X_scaled_test=scaler.transform(X_test)

In [25]:
# 모델적용 
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

0.9380281383038754

In [26]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test,y_test)

0.5816275194834313

In [28]:
# 위의 결과를 보면 train data 에 과적합되었다고 볼수 있다.
# RMSE
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("train data RMSE:", np.sqrt(MSE_train))
print("test data RMSE:",np.sqrt(MSE_test))

train data RMSE: 23760.10280576464
test data RMSE: 61836.37594764356


In [35]:
# Grid Search
param_grid={'n_estimators': range(100, 500, 100), 
            'max_features': ['auto', 'sqrt', 'log2']}
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(100, 500, 100)})

In [36]:
print("Best Parameter: {}".format(grid_search.best_params_))
print("Best Score: {:.4f}".format(grid_search.best_score_))
print("TestSet Score: {:.4f}".format(grid_search.score(X_scaled_test, y_test)))

Best Parameter: {'max_features': 'sqrt', 'n_estimators': 400}
Best Score: 0.5687
TestSet Score: 0.5921


In [37]:
param_distribs = {'n_estimators': randint(low=100, high=500), 
                  'max_features': ['auto', 'sqrt', 'log2']}
from sklearn.model_selection import RandomizedSearchCV
random_search=RandomizedSearchCV(RandomForestRegressor(), 
                                 param_distributions=param_distribs, n_iter=20, cv=5)
random_search.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fea7dfef8d0>})

In [None]:
print("Best Parameter: {}".format(random_search.best_params_))
print("Best Score: {:.4f}".format(random_search.best_score_))
print("TestSet Score: {:.4f}".format(random_search.score(X_scaled_test, y_test)))