<a href="https://colab.research.google.com/github/tanakt-hub/Test/blob/main/JPMA_2022_TF_1_1_demo_(2)Grid_Search%E3%81%A8%E7%B5%90%E6%9E%9C%E3%81%AE%E7%A2%BA%E8%AA%8D%E3%81%BE%E3%81%A7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 前準備

## Google Driveの接続

In [None]:
# Mount Google Drive so that data can be exchanged with this notebook
from google.colab import drive
drive.mount('/content/drive')

# Directory (on the mounted Drive) where the data files are stored
datadir = '/content/drive/MyDrive/datadir/'

Mounted at /content/drive


## データのロード

In [None]:
# Load the pickled dict holding the pre-tokenized text and its vectorized forms
import pickle

datadic_path = datadir + 'datadic.pkl'
with open(datadic_path, mode='rb') as pkl_file:
  datadic = pickle.load(pkl_file)

# GridSearch

## 検索対象モデルをインポート

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split


# ロジスティック回帰とリッジ回帰
from sklearn.linear_model import LogisticRegression, RidgeClassifier

# 線形SVCとSVC
from sklearn.svm import LinearSVC, SVC

# ナイーブベイズ
from sklearn.naive_bayes import BernoulliNB

# k近傍法
from sklearn.neighbors import KNeighborsClassifier

# 決定木
from sklearn.tree import DecisionTreeClassifier

# ランダムフォレスト、ADAブースト、lightGBM
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import lightgbm

# ニューラルネットワーク
from sklearn.neural_network import MLPClassifier

## 検索パラメータと実行関数の設定

In [None]:
import numpy as np

# Estimators targeted by the grid search
clf_LR   = LogisticRegression(max_iter=50000)
clf_RID  = RidgeClassifier(max_iter=50000)
clf_LSVC = LinearSVC(max_iter=50000)
clf_SVC  = SVC(probability=True)
clf_NB   = BernoulliNB()
clf_KN   = KNeighborsClassifier()
clf_RF   = RandomForestClassifier()
clf_AB   = AdaBoostClassifier()
clf_LGB  = lightgbm.LGBMClassifier(objective='binary')
clf_MLP  = MLPClassifier(max_iter=5000)

# Parameter ranges for the grid search (one dict per estimator)
param_LR = {
    'C': [10**i for i in range(-5, 5)],
}

param_RID = {
    'alpha': [10**i for i in range(-5, 5)],
}

param_LSVC = {
    'C': [10**i for i in range(-5, 5)],
}

param_SVC = {
    'C': [10**i for i in range(-2, 5)],
    'kernel': ['rbf'],
    'gamma': [10**i for i in range(-5, 2)],
}

param_NB = {
    'alpha': [10**i for i in range(-5, 5)],
}

param_KN = {
    'n_neighbors': np.arange(3, 15),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

param_RF = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 300, 500, 1000],
    'max_depth': [7, 8, 9],
}

param_LGB = {
    'num_leaves': [2, 4, 8, 16, 32, 64],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1],
}

# AdaBoost and MLP take a long time to search, so their parameter spaces are
# split into several grids that can be executed separately.

# scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` parameter to
# `estimator`, and 1.4 removed the old name. Use whichever name the installed
# version exposes so the grid is valid on both old and new releases.
_ab_estimator_key = (
    'estimator'
    if 'estimator' in clf_AB.get_params(deep=False)
    else 'base_estimator'
)

param_AB1 = {
    _ab_estimator_key: [DecisionTreeClassifier(max_depth=x) for x in range(7, 9)],
    'n_estimators': [100, 300, 500, 1000],
    'learning_rate': [1.5],
}

param_AB2 = {
    _ab_estimator_key: [DecisionTreeClassifier(max_depth=x) for x in range(7, 9)],
    'n_estimators': [100, 300, 500, 1000],
    'learning_rate': [1.0],
}

# Single-layer sizes are written as one-element tuples: `(64)` is just the
# int 64 in Python, whereas `(64,)` is the tuple form used by the other entries.
param_MLP1 = {
    'hidden_layer_sizes': [(64,), (128,), (64, 64)],
    'learning_rate_init': [10**i for i in range(-4, -1)],
    'alpha': [10**i for i in range(-6, -3)],
}
param_MLP2 = {
    'hidden_layer_sizes': [(128, 128)],
    'learning_rate_init': [10**i for i in range(-4, -1)],
    'alpha': [10**i for i in range(-6, -3)],
}
param_MLP3 = {
    'hidden_layer_sizes': [(64, 64, 64)],
    'learning_rate_init': [10**i for i in range(-4, -1)],
    'alpha': [10**i for i in range(-6, -3)],
}



# Grid Searchの実行関数定義
def grid_search(clf, param, Data, Label, cv=5, scoring='f1'):
  """Run a cross-validated grid search and return the fitted search.

  Args:
    clf: Estimator to tune; passed to GridSearchCV as the base estimator.
    param: Parameter grid passed to GridSearchCV.
    Data: Training features passed to ``fit``.
    Label: Training labels passed to ``fit``.
    cv: Cross-validation setting forwarded to GridSearchCV
      (default 5, i.e. 5-fold cross validation).
    scoring: Scoring setting forwarded to GridSearchCV (default 'f1').

  Returns:
    The value returned by ``GridSearchCV.fit`` (the fitted search object).
  """
  # Named `searcher` so the local does not shadow this function's own name.
  searcher = GridSearchCV(clf, param, cv=cv, scoring=scoring)
  return searcher.fit(Data, Label)

## 実行

In [None]:
###########################################################################
# WARNING!
# Running this cell as-is takes 20-30 hours to complete.
# On the free Colab tier, narrow down the targets below so that the
# session does not time out.
# (Running copies of this notebook in parallel is also possible.)
#
# Reference run times:
#   BoW:       2.1 h (all)
#   TFIDF:     2.2 h (all)
#   Emb:       1.8 h (LR-LGB) | 8.5 h (AB1/2) | 1.0 h (MLP1-3)
#   Emb_tfidf: 1.8 h (LR-LGB) | 7.6 h (AB1/2) | 1.8 h (MLP1-3)
###########################################################################


# Models to search, keyed by a short name: [estimator, parameter grid]
clfs = {
    "LR"  : [clf_LR, param_LR],
    "RID" : [clf_RID, param_RID],
    "LSVC": [clf_LSVC, param_LSVC],
    "SVC" : [clf_SVC, param_SVC],
    "NB"  : [clf_NB, param_NB],
    "KN"  : [clf_KN, param_KN],
    "RF"  : [clf_RF, param_RF],
    "LGB" : [clf_LGB, param_LGB],
    "AB1" : [clf_AB, param_AB1],
    "AB2" : [clf_AB, param_AB2],
    "MLP1": [clf_MLP, param_MLP1],
    "MLP2": [clf_MLP, param_MLP2],
    "MLP3": [clf_MLP, param_MLP3],
  }

# Vector representations to run the search on
vecs = [
    'BoW',
    'TFIDF',
    'Emb',
    'Emb_tfidf',
]


for vecname in vecs:
  # Split the selected vectors into train/test with a fixed seed (random_state=0)
  X_train, X_test, y_train, y_test = train_test_split(
      datadic[vecname].astype(float),
      datadic['flg'],
      test_size=0.15,
      random_state=0)
  print('*** dataset:', vecname)

  # One result file per vector representation
  result_path = datadir + vecname + '.pkl'

  for clfname, (clf, param) in clfs.items():

    # Run the cross-validated grid search on the training split
    print('  *** classifier:', clfname)
    gs = grid_search(clf, param, X_train, y_train)

    # Load the existing result dict from datadir, or start a new one if the
    # file does not exist yet. Only a missing file means "start fresh"; any
    # other failure (e.g. an unreadable pickle) is raised instead of silently
    # replacing the saved results with an empty dict.
    # NOTE(review): the dict is re-read right before every save rather than
    # cached; presumably so results written by a parallel notebook copy are
    # not overwritten -- confirm.
    try:
      with open(result_path, 'rb') as f:
        gsdic = pickle.load(f)
    except FileNotFoundError:
      gsdic = {}

    # Store the fitted search and its score on the held-out test split
    gsdic[clfname] = gs
    gsdic[clfname + '_score'] = gs.score(X_test, y_test)

    with open(result_path, 'wb') as f:
      pickle.dump(gsdic, f)


# Appendix: 実行結果の確認

In [1]:
# After the grid search has finished for every model, the results are
# gathered and written out as CSV.
# This section is only for viewing results; skipping it does not affect the
# main part of the notebook.

# This section is meant to be runnable on its own, so Drive is mounted again
# and the required variables are redefined here.
from google.colab import drive
import pickle
import pandas as pd

drive.mount('/content/drive')
datadir = '/content/drive/MyDrive/datadir/'

vecs = ['BoW', 'TFIDF', 'Emb', 'Emb_tfidf']

# Show the keys stored in each saved result dict
for dataname in vecs:
  pkl_path = datadir + dataname + '.pkl'
  with open(pkl_path, 'rb') as pkl_file:
    gsmodels = pickle.load(pkl_file)
  print(dataname)
  print(gsmodels.keys())

Mounted at /content/drive
BoW
dict_keys(['LR', 'LR_score', 'RID', 'RID_score', 'LSVC', 'LSVC_score', 'SVC', 'SVC_score', 'NB', 'NB_score', 'KN', 'KN_score', 'RF', 'RF_score', 'LGB', 'LGB_score', 'AB1', 'AB1_score', 'AB2', 'AB2_score', 'MLP1', 'MLP1_score', 'MLP2', 'MLP2_score', 'MLP3', 'MLP3_score'])
TFIDF
dict_keys(['LR', 'LR_score', 'RID', 'RID_score', 'LSVC', 'LSVC_score', 'SVC', 'SVC_score', 'NB', 'NB_score', 'KN', 'KN_score', 'RF', 'RF_score', 'LGB', 'LGB_score', 'AB1', 'AB1_score', 'AB2', 'AB2_score', 'MLP1', 'MLP1_score', 'MLP2', 'MLP2_score', 'MLP3', 'MLP3_score'])
Emb
dict_keys(['LR', 'LR_score', 'RID', 'RID_score', 'LSVC', 'LSVC_score', 'SVC', 'SVC_score', 'NB', 'NB_score', 'KN', 'KN_score', 'RF', 'RF_score', 'LGB', 'LGB_score', 'MLP1', 'MLP1_score', 'MLP2', 'MLP2_score', 'MLP3', 'MLP3_score', 'AB1', 'AB1_score', 'AB2', 'AB2_score'])
Emb_tfidf
dict_keys(['LR', 'LR_score', 'RID', 'RID_score', 'LSVC', 'LSVC_score', 'SVC', 'SVC_score', 'NB', 'NB_score', 'KN', 'KN_score', 'RF', '

In [2]:
# Load the saved results and export them as CSV
cv_frames = []  # one cv_results_ table per (dataset, classifier)
gsscores = []   # rows of [DataName, clf, TestScore]
for dataname in vecs:
  with open(datadir + dataname + '.pkl', 'rb') as f:
    gsmodels = pickle.load(f)

  for key, value in gsmodels.items():
    if key.endswith('_score'):
      # '<name>_score' entries hold the score computed on the test split
      gsscores.append([dataname, key, value])
    else:
      # Every other entry is a fitted search object exposing cv_results_
      frame = pd.DataFrame(value.cv_results_)
      frame.insert(0, "clf", key)
      frame.insert(0, "DataName", dataname)
      cv_frames.append(frame)

# Concatenate once at the end instead of growing a DataFrame inside the
# loops, which would re-copy all previously collected rows on every step.
gsresults = pd.concat(cv_frames, axis=0) if cv_frames else pd.DataFrame()

gsscoresdf = pd.DataFrame(gsscores, columns=["DataName","clf","TestScore"])
gsresults.to_csv(datadir + 'gs_results.csv')
gsscoresdf.to_csv(datadir + 'gs_scores.csv')
