# 1.필요한 모듈 import 하기

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 2.데이터 준비

In [2]:
# Load the breast cancer dataset; later cells read its data, target,
# target_names and DESCR attributes.
breast_cancer = load_breast_cancer()

# 3.데이터 이해하기

In [3]:
# Bind the feature matrix to its own name, then report its dimensions.
# The recorded output is (569, 30): 569 samples with 30 features each.
breast_cancer_data = breast_cancer.data
print(breast_cancer_data.shape)

(569, 30)


In [4]:
# Bind the label vector to its own name, then report its dimensions.
# The recorded output is (569,): one label per sample.
breast_cancer_label = breast_cancer.target
print(breast_cancer_label.shape)

(569,)


In [5]:
# Display the class names; the recorded output lists 'malignant', then 'benign'.
# NOTE(review): presumably position i names label value i — confirm against
# the scikit-learn dataset documentation.
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
# Print the dataset description. Evaluating the bare string in a notebook
# cell shows its repr (one long line with escaped "\n"), whereas print()
# renders the line breaks so the text is readable.
print(breast_cancer.DESCR)



# 4.train,test 데이터 분리

In [7]:
# Split features and labels into train and test sets. test_size=0.2 holds out
# 20% of the samples for evaluation and random_state=7 fixes the shuffle.
# NOTE(review): no stratify argument is passed — confirm whether the class
# ratio should be preserved across the split.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.2,
    random_state=7,
)

# 5.다양한 모델로 학습시켜보기

In [8]:
# Decision Tree: train on the training split, then predict the test split.
from sklearn.tree import DecisionTreeClassifier

# fit() hands back the fitted estimator, so construction and training chain.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred_decision = decision_tree.predict(X_test)

In [9]:
# Random Forest: train on the training split, then predict the test split.
from sklearn.ensemble import RandomForestClassifier

# fit() hands back the fitted estimator, so construction and training chain.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred_forest = random_forest.predict(X_test)

In [10]:
# SVM: train a default-configured SVC on the training split, then predict the
# test split.
from sklearn import svm

# fit() hands back the fitted estimator, so construction and training chain.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [11]:
# SGD Classifier: train on the training split, then predict the test split.
from sklearn.linear_model import SGDClassifier

# Seed the estimator with the same value the tree-based models use so that
# repeated runs give the same predictions; left unseeded, the result can
# differ from run to run.
sgd_model = SGDClassifier(random_state=32)
sgd_model.fit(X_train, y_train)
y_pred_sgd = sgd_model.predict(X_test)

In [12]:
# Logistic Regression: train on the training split, then predict the test split.
from sklearn.linear_model import LogisticRegression

# The optimisation did not converge and emitted a warning, so max_iter is
# raised to 3000.
logistic_model = LogisticRegression(max_iter=3000).fit(X_train, y_train)
y_pred_log = logistic_model.predict(X_test)

# 6.모델을 평가해 보기

In [13]:
# Print a classification report for each model's test-set predictions, in the
# order: Decision Tree, Random Forest, SVM, SGD Classifier, Logistic Regression.
# zero_division=0 is passed to every report.
reports = (
    ('Decision Tree -----------------------------', y_pred_decision),
    ('Random Forest -----------------------------', y_pred_forest),
    ('SVM           -----------------------------', y_pred_svm),
    ('SGD Classifier ----------------------------', y_pred_sgd),
    ('Logistic Regression -----------------------', y_pred_log),
)
for banner, predictions in reports:
    print(banner)
    print(classification_report(y_test, predictions, zero_division=0))

Decision Tree -----------------------------
              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

Random Forest -----------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        74

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

SVM           -----------------------------
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       

### 1.데이터 불균형이 존재하는가?  
 - 모델별로 weighted avg f1-score 값과 accuracy 값의 차이가 거의 없어서  
   데이터 불균형은 크지 않은 것으로 판단됩니다.  

### 2.모델의 성능을 평가하는 지표로 무엇이 좋을까요?  
 - 데이터 불균형이 크지 않으므로 정확도(accuracy) 지표를 사용하는 것이 좋을 것 같습니다.  
 - Random Forest 모델이 100%로 가장 좋은 성능을 보이고 있습니다.