Sklearn의 breast cancer 데이터의 mean radius, mean texture 변수를 사용해 종양을 분류하는 모델에 대한 평가와 성능 향상
1. 두 개 이상의 분류 모델의 AUC를 비교해 가장 좋은 모델 한 가지를 선택해 주세요. (ROC, PR 곡선 중 데이터에 더 적합한 기준 선택) 
2. 선택된 모델에 대해 5겹 교차검증을 통한 그리드 서치를 시행해 최적의 파라미터를 도출해 주세요. (1번에서 선택한 모델보다 좋은 성능을 내는 파라미터 찾기) 
3. 최종적으로 최적의 파라미터로 튜닝된 모델로 score를 산출해 주세요!

## 1번

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [9]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

In [10]:
df_data = pd.DataFrame(X, columns=data.feature_names)
df_target = pd.DataFrame(y, columns=['class'])
df = pd.concat([df_data,df_target], axis=1)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [11]:
model = LogisticRegression()
X = X[:, 0:1]
y =data.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
y_score = model.fit(X_train, y_train).decision_function(X_test)

# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)
# calculate AUC
roc_auc = roc_auc_score(y_test, y_score)
print(roc_auc)

0.9264003473729918




In [27]:
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# calculate precision-recall AUC
pr_auc = auc(recall, precision)
print(pr_auc)

#결과: PR곡선의 AUC가 더 크다.(더 좋은 모델)

0.9575210307230717


## 2번

In [18]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1','l2']}

In [22]:
grid_search = GridSearchCV(model, param_grid, cv=5)
new_model=grid_search.fit(X_train, y_train)





In [20]:
print('최적의 파라미터: ', grid_search.best_params_)

최적의 파라미터:  {'C': 1, 'penalty': 'l2'}


In [23]:
new_model #최적의 파라미터로 구성된 새로운 모델

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## 3번

In [24]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(new_model, X, y, cv=5)













In [25]:
scores

array([0.83478261, 0.84347826, 0.88495575, 0.95575221, 0.92035398])