# 교차 검증

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
fish_df = pd.read_csv('./data/fish.csv')
fish_input = fish_df.drop('Species', axis=1)
fish_target = fish_df['Species']

X_train, X_test, y_train, y_test = train_test_split(fish_input, fish_target, random_state=42, stratify=fish_target)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 생선 다중 분류 with cross_val_score

In [25]:
# 모델 생성
lr = LogisticRegression(max_iter=1000, solver='newton-cg')

# 교차검증 점수
scores = cross_val_score(
    lr,
    X_train_scaled,
    y_train,
    scoring='accuracy',
    cv=5
)
print(scores)
print(np.mean(scores))

# 예측 결과
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

[0.875      0.83333333 0.83333333 0.79166667 0.73913043]
0.8144927536231885
0.825


In [9]:
scores

array([0.84375   , 0.9375    , 0.96875   , 1.        , 0.83870968])

### 생선 다중 분류 with GridSearchCV

In [23]:
lr = LogisticRegression()

# GridSearchCV 사용 -> 최적의 파라미터, 평가 점수, 모델 확인
params = {
    'max_iter': [500,1000,1500,2000],
    'solver': ['liblinear','newton-cg', 'lbfgs']
}

stratifiedkfold = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(lr, params, scoring='accuracy',cv=stratifiedkfold)
grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

# 예측 결과 평가
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
print(best_model.score(X_test_scaled, y_test))

{'max_iter': 500, 'solver': 'newton-cg'}
0.8144927536231885
LogisticRegression(max_iter=500, solver='newton-cg')
0.825
