# 유방암 데이터로 하이퍼파라미터 튜닝
- GridSearchCV vs RandomSearchCV 비교
- LogisticRegression의 최적 파라미터 찾기
- 정확도와 AUC 모두 확인
- 시간 효율성 비교

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import time

# 모델 정의
model = LogisticRegression(max_iter=1000, random_state=42)

# 하이퍼파라미터 정의
pram_grid = {
    'C' : [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty' : ['l1', 'l2'],
    'solver' : ['liblinear',  'saga']
}

# GridSearchCV 생성
grid_search = GridSearchCV(
    estimator=model,
    param_grid=pram_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# 탐색 실행
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

grid_search_time = end_time - start_time

Fitting 5 folds for each of 24 candidates, totalling 120 fits




### RandomSearchCV

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [39]:
# 하이퍼파라미터 정의
pram_distributions = {
    'C' : uniform(0.001, 100),
    'penalty' : ['l1', 'l2'],
    'solver' : ['liblinear',  'saga']
}

# RandomSearchCV 생성
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=pram_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 탐색 실행
start_time2 = time.time()
random_search.fit(X_train, y_train)
end_time2 = time.time()

random_search_time = end_time2 - start_time2

Fitting 5 folds for each of 10 candidates, totalling 50 fits




### GridSearchCV, RandomSearchCV 결과 비교

In [40]:
# GridSearchCV 결과 확인
print(f'최적 파라미터 : {grid_search.best_params_}')
print(f'최적 점수 : {grid_search.best_score_:.3f}')
print(f'실행 시간 : {grid_search_time:.2f}초')

print('='*70)
# RandomSearchCV 결과 확인
print(f'최적 파라미터 : {random_search.best_params_}')
print(f'최적 점수 : {random_search.best_score_:.3f}')
print(f'실행 시간 : {random_search_time:.2f}초')

최적 파라미터 : {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
최적 점수 : 0.967
실행 시간 : 1.95초
최적 파라미터 : {'C': np.float64(73.20039418114051), 'penalty': 'l1', 'solver': 'liblinear'}
최적 점수 : 0.969
실행 시간 : 1.29초


### 최적 파라미터 정확도, AUC

In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [41]:
best_model = random_search.best_estimator_

In [42]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

In [43]:
print(f'정확도 : {best_model.score(X_test, y_test):.3f}')
print(f'AUC score : {roc_auc_score(y_test, y_proba):.3f}')
print('=' * 60)
print(classification_report(y_test, y_pred))

정확도 : 0.982
AUC score : 0.996
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        43
           1       0.99      0.99      0.99        71

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

