## GridSearchでチューニング練習

In [1]:
from tqdm import tqdm
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score

In [2]:
# Load scikit-learn's bundled breast-cancer dataset and hold out a test split.
# The split proportions are left at train_test_split's defaults; random_state
# pins the shuffle so the same rows land in train/test on every run.
cancer_data = load_breast_cancer()
train_X, test_X, train_y, test_y = train_test_split(
    cancer_data.data,
    cancer_data.target,
    random_state=0,
)

In [5]:
# Running best test-set score; the search cell raises it as candidates improve.
max_score = 0
SearchMethod = 0  # NOTE(review): not referenced in any visible cell - confirm it is still needed

# Maps each estimator instance to the parameter grid GridSearchCV sweeps for it.
# NOTE(review): random_state is searched here as if it were a hyperparameter;
# confirm that is intended, since it multiplies the number of candidates by 4.
RFC_grid = {
    RandomForestClassifier(): {
        "n_estimators": list(range(1, 21)),
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(1, 5)),
        "random_state": list(range(1, 5)),
    }
}

In [6]:
%%time
for model, param in tqdm(RFC_grid.items()):
    clf = GridSearchCV(model, param)
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    score = f1_score(test_y, pred_y, average="micro")
    
    if max_score < score:
        max_score = score
        best_param = clf.best_params_
        best_model = model.__class__.__name__

print(f"ベストスコア: {max_score}")
print(f"モデル: {best_model}")
print(f'パラメータ: {best_param}')

100%|██████████| 1/1 [02:38<00:00, 158.76s/it]

ベストスコア: 0.958041958041958
モデル: RandomForestClassifier
パラメータ: {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 7, 'random_state': 2}
CPU times: user 2min 38s, sys: 246 ms, total: 2min 38s
Wall time: 2min 38s





In [9]:
# Baseline without any hyperparameter tuning, for comparison with the grid search.
# random_state is pinned because a random forest is stochastic: without a seed
# the baseline score changes from run to run and cannot be compared reliably.
model = RandomForestClassifier(random_state=0)
model.fit(train_X, train_y)
# score() reports mean accuracy on the held-out split.
score = model.score(test_X, test_y)
print(f"デフォルトスコア: {score}")

デフォルトスコア: 0.951048951048951




In [10]:
# Analysis of the explanatory variables
import statsmodels.api as sm

# Multiple linear regression: regress the class label on every feature at once
# and inspect the per-coefficient statistics in the summary table.
y = cancer_data.target
X = sm.add_constant(cancer_data.data)  # adds the intercept ("const") column
model = sm.OLS(endog=y, exog=X)
fitted = model.fit()

print('summary = \n', fitted.summary())


summary = 
                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.774
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     61.53
Date:                Mon, 19 Dec 2022   Prob (F-statistic):          6.05e-153
Time:                        05:49:15   Log-Likelihood:                 29.650
No. Observations:                 569   AIC:                             2.699
Df Residuals:                     538   BIC:                             137.4
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.0218      0.428      7.