# 二分类：预测乳腺癌是良性的还是恶性的

In [58]:
# 导入相关的库
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB  # 连续属性
# from sklearn.naive_bayes import CategoricalNB  # 离散属性

# 数据勘查及预处理

In [21]:
# Load the breast-cancer dataset from CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
csv_path = r'../Datasets/breast-cancer.csv'
data = pd.read_csv(csv_path)

数据说明

- 其余 30 列  各类数值型指标（特征），例如 mean radius、radius error、worst texture 等
- type  乳腺癌类型，1 良性的，0 恶性的

In [22]:
# Basic information about the dataset: prints the index range, every column
# name with its non-null count and dtype, so missing values are easy to spot.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [23]:
# Check for duplicated rows: flag every row that repeats an earlier one,
# then report whether at least one row was flagged.
dup_flags = data.duplicated()
dup_flags.any()

False

In [24]:
# Peek at 5 randomly chosen rows to get a feel for the values.
# No random_state is passed here, so the rows shown differ from run to run.
data.sample(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
32,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,...,32.09,136.1,1344.0,0.1634,0.3559,0.5588,0.1847,0.353,0.08482,0
411,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,0.0634,...,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881,1
213,17.42,25.56,114.5,948.0,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,...,28.07,120.4,1021.0,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818,0
449,21.1,20.52,138.1,1384.0,0.09684,0.1175,0.1572,0.1155,0.1554,0.05661,...,32.07,168.2,2022.0,0.1368,0.3101,0.4399,0.228,0.2268,0.07425,0
423,13.66,19.13,89.46,575.3,0.09057,0.1147,0.09657,0.04812,0.1848,0.06181,...,25.5,101.4,708.8,0.1147,0.3167,0.366,0.1407,0.2744,0.08839,1


# 模型训练

In [47]:
# Build the train / test split.

# Number of shuffled rows used for training; every remaining row is held out
# for testing. Defined once instead of repeating the literal in each slice.
TRAIN_SIZE = 500

# Shuffle all rows; the fixed random_state keeps the split reproducible.
d = data.sample(len(data), random_state=1)

# Fail early with a clear message if either side of the split would be empty
# (an empty test set would otherwise only surface later as a confusing error).
assert 0 < TRAIN_SIZE < len(d), (
    'TRAIN_SIZE={} must be between 1 and {}'.format(TRAIN_SIZE, len(d) - 1)
)

# Cut the shuffled frame once, then separate features from the label.
train_rows = d.iloc[:TRAIN_SIZE]
test_rows = d.iloc[TRAIN_SIZE:]

# The last column (`type`) is the label; every other column is a feature.
X_train, y_train = train_rows.iloc[:, :-1], train_rows.iloc[:, -1]
X_test, y_test = test_rows.iloc[:, :-1], test_rows.iloc[:, -1]

# Display the true test labels as a plain array for later comparison.
np.array(y_test)

array([1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1], dtype=int64)

In [48]:
# Train the model: create a Gaussian Naive Bayes classifier with its default
# settings and fit it on the training features / labels.
gnb = GaussianNB()

gnb.fit(X_train, y_train)

GaussianNB()

In [49]:
# Predict a class label for every row of the held-out test features.
y_pred = gnb.predict(X_test)

# Display the predictions so they can be compared with the true test labels.
y_pred

array([1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1], dtype=int64)

# 模型评估

In [50]:
# Model evaluation score: mean accuracy on the held-out TEST split.
# Scoring on X_train / y_train (the rows the model was fitted on) gives an
# optimistically biased number that says little about generalisation, so the
# evaluation uses data the model has not seen during training.
gnb.score(X_test, y_test)

0.942

In [56]:
# Summarise the test results: rows predicted, rows misclassified, accuracy in %.
n_total = y_test.shape[0]
n_wrong = (y_test != y_pred).sum()  # positions where prediction and label disagree
accuracy_pct = (1 - n_wrong / n_total) * 100

summary = '共预测了{}条数据，其中{}条分类错误，准确率：{:.2f}%'
print(summary.format(n_total, n_wrong, accuracy_pct))



共预测了69条数据，其中4条分类错误，准确率：94.20%
