In [1]:
from matplotlib import pyplot as plt
import seaborn as sb

import pandas as pd
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 2. 데이터 가져오기

In [3]:
# Read the breast-cancer workbook straight from its URL into a DataFrame,
# then preview the first five rows.
origin = pd.read_excel(io="https://data.hossam.kr/G02/breast_cancer.xlsx")
origin.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## 3. 데이터 전처리
### 독립/종속 변수 분리

In [5]:
# Features are every column except the label; the label is the "target" column.
x = origin.drop(columns="target")
y = origin.loc[:, "target"]
# Show both shapes to confirm the row counts match.
x.shape, y.shape

((569, 30), (569,))

### 데이터 표준화

In [6]:
 scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:1]

array([[ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
         3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
         2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
         1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
         1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
         2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461]])

### 훈련/검증 데이터 분할

In [7]:
# Hold out 30% of the standardised rows for testing; the fixed seed keeps the
# partition reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    std_x,
    y,
    test_size=0.3,
    random_state=111,
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))


((398, 30), (171, 30), (398,), (171,))

### 분류 모델 구현

In [9]:
def singleML(modelName, train_x, train_y, test_x, test_y, cv=5, **kargs):
    """Fit one classifier and report cross-validated and hold-out accuracy.

    Parameters
    ----------
    modelName : callable
        Estimator class (or factory); it is called as ``modelName(**kargs)``.
    train_x, train_y
        Training features and labels. Used for the final fit and for the
        cross-validation.
    test_x, test_y
        Hold-out features and labels. Used only for the final accuracy score.
    cv : default 5
        Passed unchanged to ``cross_validate`` as its ``cv`` argument.
    **kargs
        Forwarded to ``modelName`` when the estimator is constructed.

    Returns
    -------
    list
        ``[model, train_scores, test_scores, score_df]`` where ``model`` is
        the estimator fitted on the full training data, ``train_scores`` is
        the mean of the per-fold ``test_score`` values, ``test_scores`` is
        ``accuracy_score(test_y, model.predict(test_x))`` and ``score_df`` is
        the ``cross_validate`` result as a DataFrame (one row per fold).
    """
    # Build the estimator and fit it on the whole training set.
    model = modelName(**kargs)
    model.fit(train_x, train_y)

    # Run cross-validation once and honour the caller's ``cv``. The original
    # hard-coded cv=5 for this table and ran a second, separate
    # cross_val_score pass for the mean, so the two could disagree.
    score_df = pd.DataFrame(cross_validate(model, train_x, train_y, cv=cv))
    # Mean validation score across folds, taken from the same run as the table.
    train_scores = score_df["test_score"].mean()

    # Accuracy of the fitted model on the hold-out data.
    y_pred = model.predict(test_x)
    test_scores = accuracy_score(test_y, y_pred)

    return [model, train_scores, test_scores, score_df]

### 사용하고자 하는 분류 모델 리스트

In [10]:
# Classifier classes (not instances) that are compared later; each one is
# instantiated with its default arguments where it is used.
ml_list = [
    LogisticRegression,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    SVC,
]
ml_list

[sklearn.linear_model._logistic.LogisticRegression,
 sklearn.neighbors._classification.KNeighborsClassifier,
 sklearn.tree._classes.DecisionTreeClassifier,
 sklearn.svm._classes.SVC]

### Bagging 모델 구현

In [15]:
# One result row (name, CV score, hold-out score) per base learner.
scores = []

# Original author's note: SVC emits a warning when the features carry no
# names, so the split arrays are wrapped back into DataFrames that reuse the
# original column labels.
x_train_df = pd.DataFrame(x_train, columns=x.columns)
x_test_df = pd.DataFrame(x_test, columns=x.columns)

# BaggingClassifier's keyword for the wrapped learner was renamed from
# `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed in
# 1.4). Pick whichever name the installed version exposes so the cell runs on
# both old and new releases.
estimator_kw = (
    "estimator"
    if "estimator" in BaggingClassifier().get_params(deep=False)
    else "base_estimator"
)

for ml in ml_list:
    # Only the two scores are needed; the fitted model and the per-fold table
    # get throwaway names (avoids overwriting IPython's `_`).
    _model, train_score, test_score, _cv_table = singleML(
        modelName=BaggingClassifier,
        train_x=x_train_df,
        train_y=y_train,
        test_x=x_test_df,
        test_y=y_test,
        # max_samples=1 is intentionally left out: per the original note it
        # conflicts with KNN and SVM.
        bootstrap=True,
        random_state=111,
        bootstrap_features=False,
        n_jobs=-1,
        **{estimator_kw: ml()},
    )
    scores.append({
        "name": ml.__name__,
        "train_score": train_score,
        "test_score": test_score,
    })

# Summary table comparing the bagged version of every base learner.
df = pd.DataFrame(scores)
df



Unnamed: 0,name,train_score,test_score
0,LogisticRegression,0.96731,0.97076
1,KNeighborsClassifier,0.954778,0.97076
2,DecisionTreeClassifier,0.94481,0.947368
3,SVC,0.967342,0.976608
