In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 评价指标

In [2]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [3]:
# 准确率
y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
accuracy_score(y_true, y_pred)

0.5

In [4]:
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "bird"]
metrics.confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

array([[2, 0, 0],
       [0, 1, 0],
       [1, 0, 2]], dtype=int64)

In [5]:
metrics.precision_score(y_true, y_pred,average='micro'), metrics.recall_score(y_true, y_pred,average='micro')

(0.8333333333333334, 0.8333333333333334)

# 准备数据

In [6]:
from sklearn import datasets

In [7]:
data = datasets.load_iris()

In [8]:
data['target'].T

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# sklearn的数据标签貌似都是一维，这里手工整理下

In [9]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()

In [10]:
enc.fit(data['target'].reshape(-1,1))

OneHotEncoder()

In [11]:
multi_label = enc.transform(data['target'].reshape(-1,1)).toarray()

In [12]:
multi_label[:10]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

# 机器学习方法

In [13]:
X = data['data']
Y = multi_label
X.shape,Y.shape

((150, 4), (150, 3))

In [14]:
from sklearn import tree
from sklearn.linear_model import SGDClassifier

In [15]:
tr1 = tree.DecisionTreeClassifier()
clf2 = SGDClassifier()

In [16]:
tr1.fit(X,Y)
# clf2.fit(X,Y)
# 逻辑回归不支持多维的y

DecisionTreeClassifier()

In [17]:
tr1.score(X,Y)

1.0

In [18]:
predict = tr1.predict(X)

In [19]:
accuracy_score(Y,predict)

1.0

In [20]:
predict[:10]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

# 深度学习方法

In [21]:
X = data['data']
Y = multi_label
X.shape,Y.shape

((150, 4), (150, 3))

In [22]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim

In [23]:
# 数据随机batch模块
class DiabetesDataset(Dataset):
    def __init__(self, x_, y_):
        self.len = x_.shape[0]
        self.x_data = torch.from_numpy(x_)
        # 这部很关键， 特征类型要求float类型
        self.x_data = self.x_data.float()
        # 多分类的标签必须要用LongTensor类型
        self.y_data = torch.LongTensor(y_)
        

    def __getitem__(self, item):
        return self.x_data[item], self.y_data[item]

    def __len__(self):
        return self.len

In [24]:
dataset = DiabetesDataset(X, Y)
train_loader = DataLoader(dataset=dataset, batch_size=50, shuffle=True)

In [25]:
#网络结构
class NeuralNet(nn.Module):
    def __init__(self, input_size_, hidden_size_1, num_classes_):
        super().__init__()
        self.fc1 = nn.Linear(input_size_, hidden_size_1) 
        self.fc2 = nn.Linear(hidden_size_1, num_classes_)
    
    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        return out

In [26]:
# 对于多分类，最后输出的维度是标签数量，而不是一维
model = NeuralNet(4,2,3)
#损失函数
criterion = nn.CrossEntropyLoss(reduction='mean')
#优化算法
optimizer = optim.Adam(model.parameters(), lr=0.001)

total_step = len(train_loader)
num_epochs = 6000
for epoch in range(num_epochs):
    for i, (x, y) in enumerate(train_loader):
        # Reshape images to (batch_size, input_size)
        x = x.reshape(-1, 4)
        # Forward pass
        outputs = model(x)
        # 扁平化成一维
        y = y.squeeze()
        print(y.shape,outputs.shape)
        loss = F.nll_loss(outputs, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #输出中间信息
        if i % 5 == 0 and epoch%500==0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

torch.Size([50, 3]) torch.Size([50, 3])


RuntimeError: 1D target tensor expected, multi-target not supported

In [75]:
predict_deep = model(torch.from_numpy(X).float())

In [77]:
# 预测出来的结果要转换取每个数据最大值所在索引
predicted_class = predict_deep.argmax(dim=1)

In [78]:
accuracy_score(Y,predicted_class)

0.98

In [79]:
predicted_class

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

# 集成方法

In [80]:
import xgboost as xgb

In [89]:
X = data['data']
Y = data['target'].reshape(-1,1)
X.shape,Y.shape

((150, 4), (150, 1))

In [90]:
#数据转换成Dmatrix格式，xgboost必须
xgtrain = xgb.DMatrix(X, Y)

In [99]:
#参数设置
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # softmax交叉熵
    'num_class': 3,                # 多分类的类别个数要自己设置
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 50,               # 构建树的深度，越大越容易过拟合
    'lambda': 1,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
    'silent': 0,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.02,                  # 如同学习率
    'seed': 1000,
    'nthread': 4,                  # cpu 线程数
}

In [100]:
#设定需要查看模型训练时的输出
watchlist = [(xgtrain,'tranin')]
num_round = 200
bst = xgb.train(params, xgtrain, num_round, watchlist)
#使用模型预测
preds = bst.predict(xgtrain)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	tranin-mlogloss:1.07402
[1]	tranin-mlogloss:1.05032
[2]	tranin-mlogloss:1.03005
[3]	tranin-mlogloss:1.01047
[4]	tranin-mlogloss:0.98796
[5]	tranin-mlogloss:0.96810
[6]	tranin-mlogloss:0.94700
[7]	tranin-mlogloss:0.93027
[8]	tranin-mlogloss:0.91555
[9]	tranin-mlogloss:0.89548
[10]	tranin-mlogloss:0.87657
[11]	tranin-mlogloss:0.85806
[12]	tranin-mlogloss:0.84010
[13]	tranin-mlogloss:0.82229
[14]	tranin-mlogloss:0.80581
[15]	tranin-mlogloss:0.78922
[16]	tranin-mlogloss:0.77317
[17]	tranin-mlogloss:0.75762
[18]	tranin-mlogloss:0.74248
[19]	tranin-mlogloss:0.73027
[20]	tranin-mlogloss:0.71827
[21]	tranin-mlogloss:0.70435
[22]	tranin-mlogloss:0.69438
[23]	tranin-mlogloss:0.68408
[24]	tranin-mlogloss:0.67094
[

In [101]:
# 多分类输出的直接就是标签值了
preds

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
      dtype=float32)

In [102]:
#模型评估
accuracy_score(Y, preds)

0.9666666666666667