In [1]:
# import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# set data toa variable
bcancer = load_breast_cancer()
print(bcancer.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [2]:
# set data toa variable
bcancer_data = bcancer.data
bcancer_label = bcancer.target

# print label names and details of the data
print(bcancer.target_names)
print(bcancer.DESCR)

['malignant' 'benign']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instanc

In [3]:
# use train test split method
X_train, X_test, y_train, y_test = train_test_split(bcancer_data,
                                                    bcancer_label,
                                                    test_size=0.2,
                                                    random_state=45)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(455, 30) (114, 30) (455,) (114,)


In [4]:
# use decision tree classifier
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=66)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.79      0.85        42
           1       0.88      0.96      0.92        72

    accuracy                           0.89       114
   macro avg       0.90      0.87      0.88       114
weighted avg       0.90      0.89      0.89       114

[[33  9]
 [ 3 69]]


In [5]:
# use random forest classifier
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=66)
random_forest.fit(X_train, y_train)
y_pred2 = random_forest.predict(X_test)

print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


In [6]:
# use support vector machine
from sklearn import svm

svm_model = svm.SVC(random_state=66)
svm_model.fit(X_train, y_train)
y_pred3 = svm_model.predict(X_test)

print(classification_report(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))

              precision    recall  f1-score   support

           0       1.00      0.81      0.89        42
           1       0.90      1.00      0.95        72

    accuracy                           0.93       114
   macro avg       0.95      0.90      0.92       114
weighted avg       0.94      0.93      0.93       114

[[34  8]
 [ 0 72]]


In [7]:
# use stochastic gradient descent classifier
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=66)
sgd_model.fit(X_train, y_train)
y_pred4 = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred4))
print(confusion_matrix(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89        42
           1       0.93      0.94      0.94        72

    accuracy                           0.92       114
   macro avg       0.92      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114

[[37  5]
 [ 4 68]]


In [8]:
# use logistic regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=66, max_iter=2500)
logistic_model.fit(X_train, y_train)
y_pred5 = logistic_model.predict(X_test)

print(classification_report(y_test, y_pred5))
print(confusion_matrix(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.97      0.90      0.94        42
           1       0.95      0.99      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

[[38  4]
 [ 1 71]]


Classification report와 confusion matrix로 봤을 때:
- 베스트 모델: Random Forest Classifier
- 워스트 모델: Decision Tree Classifier

질병관련 해서는 전체적인 정확성도 중요하지만 type 2 에러가 적어야 실제 병에 걸린 환자가 음성이 나오는 오진이 적어 지기 때문에 recall이 평균적으로 가장 높고 confusion matrix에서 type 2 (False Negative) 에러가 가장 적은 Random Forest Classifier가 베스트 모델