# 建立決策樹預測乳癌的診斷結果
　　我以 Kaggle 上的 Breast Cancer Wisconsin (Diagnostic) Data Set 作為訓練集，其中的細胞特徵資料是由乳房腫塊的 Fine needle aspiration (FNA) 圖像計算得出。 <br>
特徵資料包含：<br>


1. radius (mean of distances from center to points on the perimeter)
2. texture (standard deviation of gray-scale values)
3. perimeter
4. area
5. smoothness (local variation in radius lengths)
6. compactness (perimeter^2 / area - 1.0)
7. concavity (severity of concave portions of the contour)
8. concave points (number of concave portions of the contour)
9. symmetry
10. fractal dimension ("coastline approximation" - 1)

　　為每張圖像計算上述 10 項特徵的平均值(mean)、標準誤(se)以及「最差」值(worst，即三個最大值的平均值)，因此 10 項特徵 × 3 種統計量，最後產生**30個特徵**。



---


* Reference: [Breast Cancer Wisconsin (Diagnostic) Data Set](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data)


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

In [None]:
# Load the breast-cancer CSV from the mounted Google Drive folder.
wbc = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/dataset/breast_cancer.csv')
# Preview the first five rows; in the `diagnosis` column, 'M' means malignant.
wbc.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [None]:
# Encode the diagnosis label as a numeric target: 1 for 'M', 0 for anything else.
is_malignant = wbc['diagnosis'] == 'M'
wbc['flg'] = is_malignant.astype(int)
# Report, per column, whether it contains at least one missing value.
wbc.isna().any()

id                         False
diagnosis                  False
radius_mean                False
texture_mean               False
perimeter_mean             False
area_mean                  False
smoothness_mean            False
compactness_mean           False
concavity_mean             False
concave points_mean        False
symmetry_mean              False
fractal_dimension_mean     False
radius_se                  False
texture_se                 False
perimeter_se               False
area_se                    False
smoothness_se              False
compactness_se             False
concavity_se               False
concave points_se          False
symmetry_se                False
fractal_dimension_se       False
radius_worst               False
texture_worst              False
perimeter_worst            False
area_worst                 False
smoothness_worst           False
compactness_worst          False
concavity_worst            False
concave points_worst       False
symmetry_w

In [None]:
# Remove the 'Unnamed: 32' column, which holds only missing values in the preview.
wbc = wbc.drop(columns=['Unnamed: 32'])
wbc.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,flg
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [None]:
# Features: every column except the record id and the two label columns.
X = wbc.drop(columns=['id', 'flg', 'diagnosis'])
# Target: the numeric label column.
Y = wbc['flg']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Entropy-based decision tree; fit() returns the fitted estimator itself.
model = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, Y_train)

# Score the fitted tree on both the training and the held-out rows.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)
print('Training score: ', train_score)
print('Test score: ', test_score)

Training score:  1.0
Test score:  0.9298245614035088


In [None]:
#@title 輸入腫瘤特徵資料
radius_mean = 17.99 #@param {type:"number"}
texture_mean = 10.38 #@param {type:"number"}
perimeter_mean = 122.8 #@param {type:"number"}
area_mean = 1001 #@param {type:"number"}
smoothness_mean = 0.1184 #@param {type:"number"}
compactness_mean = 0.2776 #@param {type:"number"}
concavity_mean = 0.3001 #@param {type:"number"}
concave_points_mean = 0.1471 #@param {type:"number"}
symmetry_mean = 0.2419 #@param {type:"number"}
fractal_dimension_mean = 0.07871 #@param {type:"number"}
radius_se = 1.095 #@param {type:"number"}
texture_se = 0.9053 #@param {type:"number"}
perimeter_se = 8.589 #@param {type:"number"}
area_se = 153.4 #@param {type:"number"}
smoothness_se = 0.006399 #@param {type:"number"}
compactness_se = 0.04904 #@param {type:"number"}
concavity_se = 0.05373 #@param {type:"number"}
concave_points_se = .01587 #@param {type:"number"}
symmetry_se = .03003 #@param {type:"number"}
fractal_dimension_se = .006193 #@param {type:"number"}
radius_worst = 25.38 #@param {type:"number"}
texture_worst = 17.33 #@param {type:"number"}
perimeter_worst = 184.6 #@param {type:"number"}
area_worst = 2019 #@param {type:"number"}
smoothness_worst = 0.1622 #@param {type:"number"}
compactness_worst = 0.6656 #@param {type:"number"}
concavity_worst = 0.7119 #@param {type:"number"}
concave_points_worst = 0.2654 #@param {type:"number"}
symmetry_worst = 0.4601 #@param {type:"number"}
fractal_dimension_worst = 0.1189 #@param {type:"number"}

# Collect the form values under the dataset's column names. The keys must keep
# the space in 'concave points_*' (the Python variables use an underscore
# instead) and are listed in the dataset's column order.
# Named `features` rather than `dict` so the builtin `dict` is not shadowed
# for the rest of the session.
features = {
    'radius_mean': radius_mean, 'texture_mean': texture_mean, 'perimeter_mean': perimeter_mean,
    'area_mean': area_mean, 'smoothness_mean': smoothness_mean, 'compactness_mean': compactness_mean,
    'concavity_mean': concavity_mean, 'concave points_mean': concave_points_mean,
    'symmetry_mean': symmetry_mean, 'fractal_dimension_mean': fractal_dimension_mean,
    'radius_se': radius_se, 'texture_se': texture_se, 'perimeter_se': perimeter_se,
    'area_se': area_se, 'smoothness_se': smoothness_se, 'compactness_se': compactness_se,
    'concavity_se': concavity_se, 'concave points_se': concave_points_se,
    'symmetry_se': symmetry_se, 'fractal_dimension_se': fractal_dimension_se,
    'radius_worst': radius_worst, 'texture_worst': texture_worst, 'perimeter_worst': perimeter_worst,
    'area_worst': area_worst, 'smoothness_worst': smoothness_worst, 'compactness_worst': compactness_worst,
    'concavity_worst': concavity_worst, 'concave points_worst': concave_points_worst,
    'symmetry_worst': symmetry_worst, 'fractal_dimension_worst': fractal_dimension_worst,
}

# Build a one-row frame for the single sample. It gets its own name so the
# training feature matrix `X` is not overwritten by this cell.
X_input = pd.DataFrame(features, index=[0])

# predict() returns one label per row; take the label of the only row.
Y_predicted = model.predict(X_input)[0]

# Translate the numeric label back to the dataset's diagnosis code (1 -> 'M').
diagnosis = 'M (malignant)' if Y_predicted == 1 else 'B (benign)'
print('The prediction result: ', diagnosis)

The prediction result:  M (malignant)
