# 교차 검증 실습

In [39]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [7]:
# Load the fish dataset and separate the 'Species' label from the feature columns.
fish_df = pd.read_csv('./data/fish.csv')
fish_target = fish_df['Species']
fish_input = fish_df.drop(columns='Species')

# Hold-out split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    fish_input,
    fish_target,
    stratify=fish_target,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
# Display the feature table (fish_df with the 'Species' column dropped).
fish_input

Unnamed: 0,Weight,Length,Diagonal,Height,Width
0,242.0,25.4,30.0,11.5200,4.0200
1,290.0,26.3,31.2,12.4800,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.7300,4.4555
4,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,12.2,12.2,13.4,2.0904,1.3936
155,13.4,12.4,13.5,2.4300,1.2690
156,12.2,13.0,13.8,2.2770,1.2558
157,19.7,14.3,15.2,2.8728,2.0672


### 생선 다중 분류 with cross_val_score

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Logistic-regression classifier using the 'newton-cg' solver.
lr_clf = LogisticRegression(solver='newton-cg', max_iter=1000)

# 5-fold cross-validated accuracy on the scaled training split.
scores = cross_val_score(
    lr_clf,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)
print('교차검증 점수 score : ', scores)
print('np.mean(scores)    : ', scores.mean())

# Fit on the whole training split, then predict the held-out test split.
y_pred = lr_clf.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Accuracy of those predictions against the test labels.
acc_score = accuracy_score(y_test, y_pred)

print('테스트 정확도       : ', acc_score)


교차검증 점수 score :  [0.875      0.83333333 0.83333333 0.79166667 0.73913043]
np.mean(scores)    :  0.8144927536231885
테스트 정확도       :  0.825


In [40]:
import warnings
warnings.filterwarnings('ignore')


### 생선 다중 분류 with GridSearchCV

In [47]:
# Use GridSearchCV to find the best hyper-parameters, their cross-validation
# score and the corresponding model, then compare it with an untuned baseline.

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Baseline estimator with default settings. GridSearchCV works on clones of it,
# so this instance stays unfitted until it is fitted explicitly further down.
lr_clf = LogisticRegression()

# Hyper-parameter values to search.
# NOTE(review): not every solver accepts every penalty (e.g. 'liblinear' has no
# unpenalized mode), so part of this grid cannot be fitted and only costs time.
# NOTE(review): newer scikit-learn releases spell "no penalty" as None instead
# of the string 'none' -- confirm against the installed version.
params = {
    'penalty': ['l2', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 200, 300, 400, 500],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Shuffled, seeded, stratified 5-fold splitter so the search is repeatable.
stratifiedkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run the search on the scaled training split.
grid_search = GridSearchCV(lr_clf, params, cv=stratifiedkfold)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('최적의 모델     : ', grid_search.best_estimator_)
print('최적의 파라미터 : ', grid_search.best_params_)
print('최적의 점수     : ', grid_search.best_score_)

# Baseline reference: fit the default-parameter model and score it on both
# splits. These two numbers describe the untuned model, not the search result.
lr_clf.fit(X_train_scaled, y_train)
y_train_pred = lr_clf.predict(X_train_scaled)
y_test_pred = lr_clf.predict(X_test_scaled)

train_score = accuracy_score(y_train, y_train_pred)
test_score = accuracy_score(y_test, y_test_pred)

print('학습 정확도     : ', train_score)
print('테스트 정확도   : ', test_score)

# Tuned model: with the default refit=True, GridSearchCV has already refit the
# best estimator on the whole training split, so it can be scored directly.
best_model = grid_search.best_estimator_
print('최적 모델 정확도 : ', best_model.score(X_test_scaled, y_test))




최적의 모델     :  LogisticRegression(C=100, solver='liblinear')
최적의 파라미터 :  {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
최적의 점수     :  0.9152173913043479
학습 정확도     :  0.8151260504201681
테스트 정확도   :  0.825
최적 모델 정확도 :  0.975
