# Exploration 2번째 프로젝트

숫자/와인/유방암 데이터를 사용하여 모델 학습 및 평가하기

파이썬 연습할 겸, 공통적으로 사용되는 각 단계를 함수로 구현해 봄.

## 5가지 모델 선정

의사결정트리, 랜덤포레스트, 서포트벡터머신, SGD, 로지스틱회귀

In [449]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

def get_common_models():
    return {
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'RandomForestClassifier': RandomForestClassifier(random_state=32),
        'SupportVectorMachine': SVC(),
        'SGDClassifier': SGDClassifier(),
        'LogisticRegression': LogisticRegression(),
    }

print_title = lambda title: print('=====', title, '=====')

## 데이터셋 살펴보기

Pandas 데이터프레임을 활용하여 데이터 분석

In [450]:
import pandas as pd

def look(dataset, **kwargs):
    dataset_name = kwargs['dataset_name']

    data = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    target = pd.DataFrame(dataset.target)
    target_category = dataset.target_names

    print_title(f'{dataset_name} 데이터 정보')
    print("1. shape:", data.shape)
    print("\n2. info:")
    print(data.info())
    print("\n3. sample:\n", data.sample(3, random_state=40))
    print()

    print_title(f'{dataset_name} 타겟 정보')
    print("1. shape:", target.shape)
    print("\n2. category:", target_category)
    print("\n3. info:")
    print(target.info())
    print("\n4. sample:\n", target.sample(3, random_state=40))
    print()


## 데이터셋 분리하기

train_test_split() 함수를 사용해서 훈련용과 테스트용 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

def split(dataset, test_size = 0.2, verbose=False):
    data_train, data_test, target_train, target_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=10)

    if verbose:
        print_title('데이터셋 분리 (훈련용/테스트용)')
        print('학습 데이터: {}개, 학습 타겟: {}개'.format(len(data_train), len(target_train)))
        print('테스트 데이터: {}개, 테스트 타겟: {}개'.format(len(data_test), len(target_test)))
        print()

    return data_train, data_test, target_train, target_test

## 모델 학습하기

학습용 데이터를 사용하여 모델 학습

In [452]:
def learn(model, data_train, target_train):
    model.fit(data_train, target_train)

## 모델 평가하기

테스트용 데이터를 사용하여 모델 평가

정확도(`accuracy`)를 평가지표로 사용함.

In [453]:
from sklearn.metrics import accuracy_score, classification_report

def estimate(model, data_test, target_test, model_name="", verbose=False):
    target_pred = model.predict(data_test)

    accuracy = accuracy_score(target_test, target_pred)
    report = classification_report(target_test, target_pred)

    print_title('{} 모델 평가'.format(model_name))

    print('정확도:', accuracy)
    if verbose: print('상세 리포트:\n', report)

## 숫자 데이터셋 학습

In [456]:
from sklearn.datasets import load_digits

dataset = load_digits()
look(dataset, dataset_name="Digits")
data_train, data_test, target_train, target_test = split(dataset, verbose=True)

models = get_common_models()
# 로지스틱 회귀 실행 시, 특정 개수 이상의 데이터는 iteration 제한값을 늘려야 경고 메시지가 뜨지 않는다.
models['LogisticRegression'] = LogisticRegression(max_iter=3600)

for model_name, model in models.items():
    learn(model, data_train, target_train)
    estimate(model, data_test, target_test, model_name=model_name)

===== Digits 데이터 정보 =====
1. shape: (1797, 64)

2. info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 64 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pixel_0_0  1797 non-null   float64
 1   pixel_0_1  1797 non-null   float64
 2   pixel_0_2  1797 non-null   float64
 3   pixel_0_3  1797 non-null   float64
 4   pixel_0_4  1797 non-null   float64
 5   pixel_0_5  1797 non-null   float64
 6   pixel_0_6  1797 non-null   float64
 7   pixel_0_7  1797 non-null   float64
 8   pixel_1_0  1797 non-null   float64
 9   pixel_1_1  1797 non-null   float64
 10  pixel_1_2  1797 non-null   float64
 11  pixel_1_3  1797 non-null   float64
 12  pixel_1_4  1797 non-null   float64
 13  pixel_1_5  1797 non-null   float64
 14  pixel_1_6  1797 non-null   float64
 15  pixel_1_7  1797 non-null   float64
 16  pixel_2_0  1797 non-null   float64
 17  pixel_2_1  1797 non-null   float64
 18  pixel_2_2  1797 non-null   floa

## 와인 데이터셋 학습

In [455]:
from sklearn.datasets import load_wine

dataset = load_wine()
look(dataset, dataset_name="Wine")
data_train, data_test, target_train, target_test = split(dataset, verbose=True)

models = get_common_models()
models['LogisticRegression'] = LogisticRegression(max_iter=600)

for model_name, model in models.items():
    learn(model, data_train, target_train)
    estimate(model, data_test, target_test, model_name=model_name)

# 모델을 평가할 때, 경고 메시지가 발생하는 이유는 예측 데이터에는 없는 테스트 데이터가 있기 때문이다.
# - https://stackoverflow.com/a/47285662

===== Wine 데이터 정보 =====
1. shape: (178, 13)

2. info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 유방암 데이터셋 학습

In [427]:
# 유방암 데이터셋 학습
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()
look(dataset, dataset_name="Breast Cancer")
data_train, data_test, target_train, target_test = split(dataset, verbose=True)

models = get_common_models()
models['LogisticRegression'] = LogisticRegression(max_iter=2200)

for model_name, model in models.items():
    learn(model, data_train, target_train)
    estimate(model, data_test, target_test, model_name=model_name)

===== Breast Cancer 데이터 정보 =====
1. shape: (569, 30)

2. info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error       