<a href="https://colab.research.google.com/github/yeahginny/Machine_learning/blob/main/ML_Model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fetch the iris dataset and unpack its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a seeded decision tree and fit it on the training portion
# (fit() returns the estimator itself, so `tree` is the fitted model).
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out samples and score the predictions against the true labels.
y_pred = tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 1.00


In [None]:
# Predict the samples the tree was fitted on (the training set, not the test set).
y_train_pred = tree.predict(X=X_train)

# Training-set accuracy. NOTE: this rebinds `accuracy`, replacing the
# test-set value assigned by the earlier hold-out cell.
accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Display the current value of `accuracy`. When the notebook is run top to
# bottom this is the training-set accuracy assigned in the previous cell.
accuracy

1.0

In [None]:
# K-fold cross-validation

from sklearn import datasets
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fetch iris and unpack the feature matrix and the label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Five shuffled folds; the fixed seed keeps the fold assignment repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Seeded decision tree that is scored on each fold.
tree = DecisionTreeClassifier(random_state=42)

# Run the cross-validation; `scores` holds one score per fold.
scores = cross_val_score(estimator=tree, X=X, y=y, cv=kfold)

# Report the per-fold scores followed by their mean.
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean():.2f}")

Cross-validation scores: [1.         0.96666667 0.93333333 0.93333333 0.93333333]
Average cross-validation score: 0.95


In [None]:
# Stratified K-fold cross-validation

from sklearn import datasets
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fetch iris and unpack the feature matrix and the label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Five shuffled, stratified folds with a fixed seed for repeatability.
stratified_kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Seeded decision tree that is scored on each fold.
tree = DecisionTreeClassifier(random_state=42)

# Run the stratified cross-validation; `scores` holds one score per fold.
scores = cross_val_score(estimator=tree, X=X, y=y, cv=stratified_kfold)

# Report the per-fold scores followed by their mean.
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean():.2f}")

Cross-validation scores: [1.         0.96666667 0.93333333 0.96666667 0.9       ]
Average cross-validation score: 0.95


In [None]:
# Refined version: let the scikit-learn API do the splitting and return the
# scores, instead of constructing a splitter object by hand.

from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fetch iris and unpack the feature matrix and the label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Seeded decision tree that is scored on each fold.
tree = DecisionTreeClassifier(random_state=42)

# Passing an integer as `cv` requests that many folds (5 here).
scores = cross_val_score(estimator=tree, X=X, y=y, cv=5)

# Report the per-fold scores followed by their mean.
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean():.2f}")

Cross-validation scores: [0.96666667 0.96666667 0.9        0.93333333 1.        ]
Average cross-validation score: 0.95


In [None]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier

# Fetch iris and unpack the feature matrix and the label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Candidate tree depths to try: 1 through 9.
param_grid = {'max_depth': range(1, 10)}

# Five shuffled folds with a fixed seed, shared by every candidate.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Seeded decision tree used as the base estimator of the search.
tree = DecisionTreeClassifier(random_state=42)

# Cross-validate every candidate depth on the full dataset.
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=kfold)
grid_search.fit(X, y)

# Report the winning parameter setting and its mean cross-validation score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best model score: {grid_search.best_score_:.2f}")

Best parameters: {'max_depth': 4}
Best model score: 0.95


In [None]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier

# Fetch iris and unpack the feature matrix and the label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Candidate tree depths to try: 1 through 9.
param_grid = {'max_depth': range(1, 10)}

# Five shuffled folds with a fixed seed, shared by every candidate.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Seeded decision tree used as the base estimator of the search.
tree = DecisionTreeClassifier(random_state=42)

# Cross-validate every candidate depth on the full dataset.
# return_train_score=True is requested here, although only the test scores
# are printed below.
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=kfold,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameter setting and its mean cross-validation score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best model score: {grid_search.best_score_:.2f}")

# Walk the candidates in the order they appear in cv_results_ and print each
# one's mean test score next to its parameter setting.
cv_results = grid_search.cv_results_
for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    print(f"Mean CV score: {mean_score:.2f} - Parameters: {params}")


Best parameters: {'max_depth': 4}
Best model score: 0.95
Mean CV score: 0.63 - Parameters: {'max_depth': 1}
Mean CV score: 0.95 - Parameters: {'max_depth': 2}
Mean CV score: 0.95 - Parameters: {'max_depth': 3}
Mean CV score: 0.95 - Parameters: {'max_depth': 4}
Mean CV score: 0.95 - Parameters: {'max_depth': 5}
Mean CV score: 0.95 - Parameters: {'max_depth': 6}
Mean CV score: 0.95 - Parameters: {'max_depth': 7}
Mean CV score: 0.95 - Parameters: {'max_depth': 8}
Mean CV score: 0.95 - Parameters: {'max_depth': 9}


In [None]:
# Hands-on exercise

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the data and split it into a training set and a test set (20% held out).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=121)

# Seed the tree so the grid search that uses it gives the same result on
# every run. Without a fixed random_state, ties between equally good splits
# can be broken differently each time. 42 matches the seed used for every
# other DecisionTreeClassifier in this notebook.
dtree = DecisionTreeClassifier(random_state=42)

# Hyperparameter grid, expressed as a dictionary:
#   key   -> name of the estimator parameter
#   value -> list of values to try for that parameter
parameters = {'max_depth':[1, 2, 3], 'min_samples_split':[2, 3]}

In [None]:
import pandas as pd

# Configure the search: every combination in `parameters` is evaluated with
# cv=3 train/validation folds.
grid_dtree = GridSearchCV(
    estimator=dtree,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
)

# Train and evaluate each hyperparameter combination in turn on the iris
# training data (only the training split is passed in).
# fit(training data):
# - splits the training data into the folds described by `cv`,
# - trains/evaluates while stepping through the combinations in param_grid,
# - records the outcome in the cv_results_ attribute.
# cv_results_ is the GridSearchCV result set, stored as a dictionary.
grid_dtree.fit(X_train, y_train)

# Turn the cv_results_ dictionary into a DataFrame, then show only the
# columns of interest.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[
    :,
    [
        'params',
        'mean_test_score',
        'rank_test_score',
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
    ],
]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [None]:
# Everything recorded by the grid search is stored in this dictionary.
grid_dtree.cv_results_

{'mean_fit_time': array([0.00128055, 0.00108377, 0.00104316, 0.00103569, 0.00101177,
        0.0010399 ]),
 'std_fit_time': array([1.74183443e-04, 5.11033353e-05, 2.02794433e-05, 6.34713068e-05,
        4.34795718e-05, 4.62459666e-05]),
 'mean_score_time': array([0.00098753, 0.00083216, 0.00083915, 0.00081746, 0.00079346,
        0.00079528]),
 'std_score_time': array([1.18743144e-04, 4.13765963e-05, 4.86211552e-05, 3.53874716e-05,
        5.65016822e-05, 6.21055865e-05]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_sa

In [None]:
# best_params_ holds the best-performing parameter combination and
# best_score_ holds its (top-ranked) score; print both.
print(f'GridSearchCV 최적 파라미터 : {grid_dtree.best_params_}')
print(f'GridSearchCV 최고 정확도 : {grid_dtree.best_score_:.4f}')

GridSearchCV 최적 파라미터 : {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.9750


In [None]:
# Attributes that store the grid-search results:
# cv_results_
# best_params_
# best_score_