# 그리드서치를 통한 하이퍼파라미터 최적화

In [None]:
from warnings import filterwarnings

# Silence every warning for the rest of the session to keep cell output readable.
filterwarnings(action="ignore")

In [None]:
import pandas as pd

# Turn off pandas' chained-assignment warning/check for this session.
pd.set_option('mode.chained_assignment', None)

# Load the credit-card dataset and discard the CLIENTNUM column in one step.
df = pd.read_csv("data/credit_card.csv").drop(columns="CLIENTNUM")


## 계층적 샘플링을 통해 타겟값(y)의 비율을 유지

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; stratifying on the target keeps its class ratio
# the same in both splits.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Attrition_Flag'],
)

# Report the class counts and the share of the most frequent class for the
# training split first, then for the test split.
# value_counts() is unpacked into two names, so this expects exactly two classes.
for split in (train, test):
    class_a, class_b = split['Attrition_Flag'].value_counts()
    print("class_a의 수:",class_a, "class_b의 수:",class_b)
    print("비율:",class_a/(class_a+class_b))

In [None]:
from sklearn import preprocessing

# One LabelEncoder per object-dtype column, keyed by column name.
encoders = {}

# Fit each encoder on the training split only and replace the column with
# its integer codes.
for col in train.select_dtypes(include="object").columns:
    encoder = encoders[col] = preprocessing.LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

# Apply the encoders fitted on the training split to the test split.
for col in test.select_dtypes(include="object").columns:
    encoder = encoders[col]
    known_labels = encoder.classes_
    le_dict = dict(zip(known_labels, encoder.transform(known_labels)))
    # Values that exist in the test data but were never seen in the training
    # data are encoded as -1.
    test[col] = test[col].apply(lambda x: le_dict.get(x, -1))

In [None]:
# Separate the target column (Attrition_Flag) from the features of each split.
train_y = train['Attrition_Flag']
train_X = train.drop(columns='Attrition_Flag')
test_y = test['Attrition_Flag']
test_X = test.drop(columns='Attrition_Flag')

## 사용 가능한 모든 Classifier 테스트

In [None]:
from sklearn.utils import all_estimators

# (name, class) pairs for every classifier scikit-learn exposes.
estimators = all_estimators(type_filter='classifier')

# Instances of every classifier that can be built with default arguments.
all_clf = []
for name, Classifier in estimators:
    try:
        clf = Classifier()
    except Exception as exc:
        # Some classes cannot be constructed without arguments. Report the
        # skip instead of hiding it, and let KeyboardInterrupt/SystemExit
        # propagate (a bare `except:` would swallow those too).
        print(f"skipped {name}: {exc}")
        continue
    all_clf.append(clf)
    print(name)

In [None]:
# Show the dimensions of the training feature matrix.
train_X.shape

In [None]:
# Display the training features to inspect the encoded columns.
train_X

In [None]:
# Display the training target (Attrition_Flag).
train_y

In [None]:
# Display the training features again (same expression as an earlier cell).
train_X

In [None]:
import math

from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score

# One {"Name": ..., "Score": ...} entry per classifier that produced a usable
# mean F1 score.
results = []

# Classifiers excluded from the sweep because they are too slow to evaluate here.
SUPER_SLOW_MODELS = ["SVC","NuSVC","GaussianProcessClassifier","LabelPropagation","LabelSpreading"]

# Stratified folds keep the ratio of the target (y) classes in every fold.
# The splitter does not depend on the model, so it is created once.
skf = StratifiedKFold(n_splits=5)

for clf in all_clf:
    clf_name = clf.__class__.__name__
    if clf_name in SUPER_SLOW_MODELS:
        continue

    try:
        scores = cross_val_score(clf, train_X, train_y, scoring="f1", cv=skf)
    except Exception as e:
        # A model that cannot be fitted or scored on this data should not
        # stop the sweep; report it and move on to the next one.
        print(f"{clf_name}: {e}")
        continue

    mean_score = scores.mean()
    # Skip models without a usable score (zero or NaN mean) but keep
    # evaluating the remaining ones; `break` here would abandon the sweep.
    if not mean_score or math.isnan(mean_score):
        continue

    print("모델명:",clf_name)
    print("각 Fold의 F1 Score:",scores)
    print("평균 F1 Score:",mean_score)
    print("="*60)
    results.append({"Name": clf_name, "Score": mean_score})
    
    

In [None]:
# Rank the evaluated models from the highest to the lowest mean F1 score.
# `results` itself is left untouched; the sorted copy is the cell's output.
ranking = list(results)
ranking.sort(key=lambda entry: entry['Score'], reverse=True)
ranking

## 가장 좋은 모델의 하이퍼파라미터 튜닝

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier


In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator with default settings; the grid search is built around it.
model = HistGradientBoostingClassifier()

# Check which hyperparameters the model exposes, with their current values.
default_params = model.get_params()
print(default_params)

In [None]:

# Candidate values for the two tuned hyperparameters
# (8 learning rates x 6 leaf-node limits = 48 combinations).
param_grid = dict(
    learning_rate=(0.01, 0.1, 0.2, 0.4, 0.5, 0.6, 1, 10),
    max_leaf_nodes=(3, 10, 20, 30, 40, 50),
)

# Five stratified folds, so every fold keeps the target class ratio.
skf = StratifiedKFold(n_splits=5)

# Exhaustive search over the grid, scored by F1, using all available cores.
model_grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring="f1",
    cv=skf,
    n_jobs=-1,
)
model_grid_search.fit(train_X, train_y)

In [None]:
# Show the hyperparameter combination selected by the grid search.
model_grid_search.best_params_

In [None]:
# Show the estimator that the grid search selected as best.
model_grid_search.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out test split.
# NOTE: the search was configured with scoring="f1", so the value returned by
# score() is an F1 score even though the variable is named `accuracy`.
accuracy = model_grid_search.score(test_X, test_y)

report = (
    f"The test f1 score of the grid-searched pipeline is: "
    f"{accuracy:.4f}"
)
print(report)