In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 评价指标

In [2]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [44]:
# 准确率
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
accuracy_score(y_true, y_pred)

0.5

In [45]:
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "bird"]
metrics.confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

array([[2, 0, 0],
       [0, 1, 0],
       [1, 0, 2]], dtype=int64)

In [46]:
metrics.precision_score(y_true, y_pred,average='micro'), metrics.recall_score(y_true, y_pred,average='micro')

(0.8333333333333334, 0.8333333333333334)

# 准备数据

In [47]:
from sklearn import datasets

In [87]:
data = datasets.load_breast_cancer()

In [91]:
data['target'].T

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

# 机器学习方法

In [92]:
X = data['data']
Y = data['target']
X.shape,Y.shape

((569, 30), (569,))

In [93]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import SGDClassifier

In [104]:
clf1 = LogisticRegressionCV(penalty='l2', max_iter=50000)
clf2 = SGDClassifier()

In [105]:
clf1.fit(X,Y)
clf2.fit(X,Y)

SGDClassifier()

In [106]:
clf1.score(X,Y),clf2.score(X,Y)

(0.9859402460456942, 0.8787346221441125)

In [107]:
predict = clf1.predict(X)

In [108]:
accuracy_score(Y,predict)

0.9859402460456942

# 深度学习方法

In [109]:
X = data['data']
Y = data['target'].reshape(-1,1)
X.shape,Y.shape

((569, 30), (569, 1))

In [110]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim

In [111]:
# 数据随机batch模块
class DiabetesDataset(Dataset):
    def __init__(self, x_, y_):
        self.len = x_.shape[0]
        self.x_data = torch.from_numpy(x_)
        # 这部很关键， 特征类型要求float类型
        self.x_data = self.x_data.float()

        self.y_data = torch.from_numpy(y_)
        self.y_data = self.y_data.float()

    def __getitem__(self, item):
        return self.x_data[item], self.y_data[item]

    def __len__(self):
        return self.len

In [112]:
dataset = DiabetesDataset(X, Y)
train_loader = DataLoader(dataset=dataset, batch_size=50, shuffle=True)

In [116]:
#网络结构
class NeuralNet(nn.Module):
    def __init__(self, input_size_, hidden_size_1, hidden_size_2, num_classes_):
        super().__init__()
        self.fc1 = nn.Linear(input_size_, hidden_size_1) 
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.fc3 = nn.Linear(hidden_size_2, num_classes_)
    
    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.fc3(out)
        #二分类最后一步进行sigmoid
        out = torch.sigmoid(out)
        return out

In [123]:
model = NeuralNet(30, 20, 10, 1)
#损失函数
criterion = nn.BCELoss(reduction='mean')
#优化算法
optimizer = optim.Adam(model.parameters(), lr=0.001)

total_step = len(train_loader)
num_epochs = 2000
for epoch in range(num_epochs):
    for i, (x, y) in enumerate(train_loader):
        # Reshape images to (batch_size, input_size)
        x = x.reshape(-1, 30)
        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #输出中间信息
        if i % 5 == 0 and epoch%500==0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

Epoch [1/2000], Step [1/12], Loss: 30.2604
Epoch [1/2000], Step [6/12], Loss: 14.8315
Epoch [1/2000], Step [11/12], Loss: 3.9308
Epoch [501/2000], Step [1/12], Loss: 0.0938
Epoch [501/2000], Step [6/12], Loss: 0.0840
Epoch [501/2000], Step [11/12], Loss: 0.1600
Epoch [1001/2000], Step [1/12], Loss: 0.0573
Epoch [1001/2000], Step [6/12], Loss: 0.0602
Epoch [1001/2000], Step [11/12], Loss: 0.0455
Epoch [1501/2000], Step [1/12], Loss: 0.1343
Epoch [1501/2000], Step [6/12], Loss: 0.0965
Epoch [1501/2000], Step [11/12], Loss: 0.0805


In [124]:
predict_deep = model(torch.from_numpy(X).float())

In [125]:
# 预测出来的结果要转换01分布
predict_deep_01 = torch.where(predict_deep>0.5,torch.tensor(1),torch.tensor(0))

In [126]:
accuracy_score(Y,predict_deep_01.detach().numpy())

0.9771528998242531

# 集成方法

In [128]:
import xgboost as xgb

In [129]:
X = data['data']
Y = data['target'].reshape(-1,1)
X.shape,Y.shape

((569, 30), (569, 1))

In [130]:
#数据转换成Dmatrix格式，xgboost必须
xgtrain = xgb.DMatrix(X, Y)

In [131]:
#参数设置
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',# 二分类交叉熵
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 50,               # 构建树的深度，越大越容易过拟合
    'lambda': 1,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
    'silent': 0,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.02,                  # 如同学习率
    'seed': 1000,
    'nthread': 4,                  # cpu 线程数
}

In [132]:
#设定需要查看模型训练时的输出
watchlist = [(xgtrain,'tranin')]
num_round = 200
bst = xgb.train(params, xgtrain, num_round, watchlist)
#使用模型预测
preds = bst.predict(xgtrain)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	tranin-logloss:0.67645
[1]	tranin-logloss:0.66049
[2]	tranin-logloss:0.64536
[3]	tranin-logloss:0.63027
[4]	tranin-logloss:0.61606
[5]	tranin-logloss:0.60260
[6]	tranin-logloss:0.58977
[7]	tranin-logloss:0.57637
[8]	tranin-logloss:0.56390
[9]	tranin-logloss:0.55171
[10]	tranin-logloss:0.54014
[11]	tranin-logloss:0.52845
[12]	tranin-logloss:0.51727
[13]	tranin-logloss:0.50682
[14]	tranin-logloss:0.49687
[15]	tranin-logloss:0.48711
[16]	tranin-logloss:0.47717
[17]	tranin-logloss:0.46780
[18]	tranin-logloss:0.45874
[19]	tranin-logloss:0.44969
[20]	tranin-logloss:0.44106
[21]	tranin-logloss:0.43253
[22]	tranin-logloss:0.42419
[23]	tranin-logloss:0.41617
[24]	tranin-logloss:0.40853
[25]	tranin-logloss:0.4011

In [137]:
# 输出的是0-1的实数值，要转换成具体01标签
preds_xg = np.where(preds>0.5,1,0)

In [136]:
#模型评估
accuracy_score(Y, preds_xg)

0.9876977152899824