In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: 徐聪
# datetime: 2022-10-04 19:25
# software: PyCharm

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import load_iris
import pandas as pd


class LogisticModel:
    """Logistic regression fitted with batch gradient descent.

    Offers a binary classifier (``train`` / ``predict``) and a one-vs-rest
    multi-class wrapper (``multi_train`` / ``multi_predict``). Fitted
    parameters are returned to the caller; nothing is stored on the instance.
    """

    # Probabilities are clamped to [_EPS, 1 - _EPS] before taking logs so the
    # loss never evaluates 0 * log(0) (which is nan) once the sigmoid saturates.
    _EPS = 1e-15
    # np.exp overflows float64 slightly above 709; clamp logits well inside it.
    _MAX_LOGIT = 700.0

    @staticmethod
    def _with_bias(x):
        """Return ``x`` as a float matrix with a leading column of ones."""
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError(
                f"x must be 2-D (samples, features), got shape {x.shape}"
            )
        return np.hstack((np.ones((x.shape[0], 1)), x))

    def h_theta(self, X, theta):
        """
        Evaluate the logistic hypothesis ``sigmoid(X . theta^T)``.

        :param X: design matrix, bias column included, shape (n, d + 1)
        :param theta: parameter row vector, shape (1, d + 1)
        :return: predicted probabilities, shape (n, 1)
        """
        return self.sigmoid(np.dot(X, theta.T))

    def train(self, x, y, alpha, epochs):
        """
        Fit a binary logistic regression model by batch gradient descent.

        Every 100th epoch (starting at epoch 0) the current log-loss is
        printed and recorded.

        :param x: feature matrix, shape (n, d)
        :param y: 0/1 targets, either shape (n,) or (n, 1)
        :param alpha: learning rate
        :param epochs: number of gradient steps
        :return: ``(theta, cost_list)`` where ``theta`` has shape (1, d + 1)
                 (bias first) and ``cost_list`` holds the recorded losses
        :raises ValueError: if ``x`` is not 2-D or ``y`` has a different
                            number of samples than ``x``
        """
        X = self._with_bias(x)
        num_train, num_columns = X.shape
        # Force a column vector: a 1-D y would broadcast against the (n, 1)
        # hypothesis into an (n, n) matrix instead of pairing row by row.
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if y.shape[0] != num_train:
            raise ValueError(
                f"x has {num_train} samples but y has {y.shape[0]}"
            )

        theta = np.zeros((1, num_columns))
        cost_list = []

        for epoch in range(epochs):
            h_theta_x = self.h_theta(X, theta)

            if epoch % 100 == 0:
                # The loss is only needed for logging, so compute it here
                # rather than on every step.
                p = np.clip(h_theta_x, self._EPS, 1 - self._EPS)
                cost = -1 / num_train * np.sum(
                    y * np.log(p) + (1 - y) * np.log(1 - p)
                )
                cost_list.append(cost)
                print(f"epoch={epoch}, cost={cost}")

            # Gradient of the mean log-loss with respect to theta.
            d_theta = 1 / num_train * np.sum((h_theta_x - y) * X, axis=0)
            theta = theta - alpha * d_theta

        return theta, cost_list

    def predict(self, x, theta):
        """
        Predict binary labels with a fitted parameter vector.

        :param x: feature matrix, shape (n, d)
        :param theta: parameters as returned by ``train``
        :return: float array of shape (n, 1); 1.0 where the predicted
                 probability exceeds 0.5, otherwise 0.0
        """
        X = self._with_bias(x)
        return (self.h_theta(X, theta) > 0.5).astype(float)

    def sigmoid(self, x):
        """
        Logistic function ``1 / (1 + exp(-x))``, safe against overflow.

        :param x: scalar or array of logits
        :return: value(s) in [0, 1] with the same shape as ``x``
        """
        # Clamping changes nothing for |x| <= _MAX_LOGIT and stops np.exp
        # from overflowing for very negative inputs.
        z = np.clip(x, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1 / (1 + np.exp(-z))

    def multi_predict(self, x, theta_list, labels=None):
        """
        Predict with a set of one-vs-rest classifiers.

        :param x: feature matrix, shape (n, d)
        :param theta_list: one parameter vector per class, in the order
                           returned by ``multi_train``
        :param labels: optional class labels, one per entry of ``theta_list``
                       (``multi_train`` orders classes like ``np.unique(y)``)
        :return: array of shape (n, 1). Without ``labels`` it holds the index
                 of the highest-scoring classifier as a float; with ``labels``
                 it holds the corresponding label.
        :raises ValueError: if ``labels`` and ``theta_list`` differ in length
        """
        X = self._with_bias(x)
        # One probability column per classifier -> shape (n, num_classes).
        scores = np.hstack([self.h_theta(X, theta) for theta in theta_list])
        best = np.argmax(scores, axis=1)

        if labels is None:
            return best.reshape(-1, 1).astype(float)

        labels = np.asarray(labels).ravel()
        if labels.size != len(theta_list):
            raise ValueError(
                f"got {labels.size} labels for {len(theta_list)} classifiers"
            )
        return labels[best].reshape(-1, 1)

    def multi_train(self, x, y, alpha, epochs):
        """
        Train one binary classifier per class (one-vs-rest).

        :param x: feature matrix, shape (n, d)
        :param y: class labels, shape (n,) or (n, 1)
        :param alpha: learning rate
        :param epochs: number of gradient steps per classifier
        :return: ``(theta_list, cost_list)``; classifiers are ordered like
                 ``np.unique(y)``. ``cost_list`` holds the last cost that
                 ``train`` recorded for each classifier (``nan`` if none was
                 recorded, i.e. ``epochs == 0``).
        """
        y = np.asarray(y).reshape(-1, 1)
        sort_list = np.unique(y)

        theta_list = []
        cost_list = []
        for sort in sort_list:
            print(f"==== classifier {sort} train begin ====")
            # Build the 0/1 target by direct comparison; rewriting labels in
            # place through a -1 placeholder breaks when -1 is itself a label.
            Y = (y == sort).astype(float)

            theta, costs = self.train(x, Y, alpha, epochs)
            theta_list.append(theta)
            cost_list.append(costs[-1] if costs else np.nan)

        return theta_list, cost_list

# Load the iris dataset and display features plus label as one table
iris = load_iris()
x = iris.data
y = iris.target.astype('i4').reshape(-1, 1)
data = np.hstack((x, y))
df = pd.DataFrame(data, columns=["花萼长", "花萼宽", "花瓣长", "花瓣宽", "品种"])
print(df)

# Hold out 20% of the samples for evaluation (fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Fit the one-vs-rest classifiers
lm = LogisticModel()
# Author's note: learning rate 0.1 with 5000 epochs reaches 100%
theta_list, cost_list = lm.multi_train(x_train, y_train, alpha=0.1, epochs=5000)
print(f"theta_list={theta_list}\ncost_list={cost_list}")

# Report per-class precision / recall on the held-out samples
print(
    classification_report(
        y_true=y_test,
        y_pred=lm.multi_predict(x_test, theta_list),
    )
)

     花萼长  花萼宽  花瓣长  花瓣宽   品种
0    5.1  3.5  1.4  0.2  0.0
1    4.9  3.0  1.4  0.2  0.0
2    4.7  3.2  1.3  0.2  0.0
3    4.6  3.1  1.5  0.2  0.0
4    5.0  3.6  1.4  0.2  0.0
..   ...  ...  ...  ...  ...
145  6.7  3.0  5.2  2.3  2.0
146  6.3  2.5  5.0  1.9  2.0
147  6.5  3.0  5.2  2.0  2.0
148  6.2  3.4  5.4  2.3  2.0
149  5.9  3.0  5.1  1.8  2.0

[150 rows x 5 columns]
==== classifier 0 train begin ====
epoch=0, cost=0.6931471805599453
epoch=100, cost=0.05769593643181581
epoch=200, cost=0.031218910678123057
epoch=300, cost=0.02174329009269675
epoch=400, cost=0.016815394643394294
epoch=500, cost=0.013775009694066047
epoch=600, cost=0.01170341736321661
epoch=700, cost=0.010196900264913026
epoch=800, cost=0.009049599375363178
epoch=900, cost=0.008145275264132801
epoch=1000, cost=0.00741320702141016
epoch=1100, cost=0.006807838207383416
epoch=1200, cost=0.006298465903016461
epoch=1300, cost=0.0058636245809886725
epoch=1400, cost=0.005487843834623663
epoch=1500, cost=0.005159684898016107
ep