In [41]:
# coding: utf-8
 
import numpy as np
import math
from sklearn import datasets
from collections import Counter
from sklearn.model_selection import train_test_split

'''
逻辑回归的实现
'''
 
def sigmodFormatrix(Xb,betas):
    # 求内积
    params = - Xb.dot(betas)
    r = np.zeros(params.shape[0])#返回一个np数组
    for i in range(len(r)):
        r[i] = 1. /(1. + math.exp(params[i]))
    return r
 
def sigmodFormatrix2(Xb,betas):
    params = - Xb.dot(betas)
    r = np.zeros(params.shape[0])#返回一个np数组
    for i in range(len(r)):
        r[i] = 1. /(1. + math.exp(params[i]))
        if r[i] >=0.5:
            r[i] = 1
        else:
            r[i] = 0
    return r

def sigmod(Xi,betas):
    # 求向量Xi和betas的内积，结果是一个数
    params = - np.sum(Xi * betas)
    r = 1. /(1. + math.exp(params))
    return r
 
def normalize(x):
    num = x - np.min(x)
    denom = np.max(x) - np.min(x)
    return (num / denom)

class LinearLogsiticRegression(object):
    betas = None
    m = 0
    #训练
    def fit(self,X,y,alpha = 0.01,accuracy = 0.00001):
        # 数据归一化
        X = normalize(X)
        # 将betas构造成(b,w)，初始值从0.5开始        
        self.betas = np.full(X.shape[1]+1,0.5)
        self.m = X.shape[0] #行数
        a = np.full((self.m,1),1) # m*1的单位向量
        # 将X构造成(1,X)，插入第一列为1，构成xb矩阵，
        Xb = np.column_stack((a,X))
        dimension  = X.shape[1]+1 #X特征维度
        #梯度下降迭代
        count = 1
        oldJ = self.JFunc(Xb, y)
        while True:
            # oldJ = self.JFunc(Xb, y)
            #注意预测函数中使用的参数是未更新的
            c = sigmodFormatrix(Xb, self.betas)-y
            for j in range(dimension):
                self.betas[j] = self.betas[j] -alpha * np.sum(c * Xb[:,j])
            newJ = self.JFunc(Xb, y)
            if math.fabs(newJ - oldJ) < accuracy:
                print("代价函数迭代到最小值，退出！\n收敛到:",newJ)
                print("共迭代",count,"次!")
                break
            # print("迭代第",count,"次!")
            # print("代价函数上一次的差:",(newJ - oldJ))
            count += 1
            oldJ = newJ
 
    #预测
    def JFunc(self,Xb,y):
        sum = 0.0
        for i in range(self.m):
            yPre = sigmod(Xb[i,], self.betas)
            sum += y[i]*math.log(yPre)+(1 - y[i])*math.log(1-yPre)
        return -1/self.m * sum 
    
    def predict(self,X):
        # 数据归一化
        X = normalize(X)
        a = np.full((len(X),1),1)
        Xb = np.column_stack((a,X))
        return sigmodFormatrix2(Xb, self.betas)
    
    def score(self,X_test,y_test):
        y_predict = self.predict(X_test)
        re = (y_test==y_predict)
        re1 = Counter(re)
        a = re1[True] / (re1[True]+re1[False])
        return a


In [44]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
cancerlr = LinearLogsiticRegression()    
cancerlr.fit(X_train, y_train)
y_predict = cancerlr.predict(X_test)
 
print("测试数据准确度:",cancerlr.score(X_test, y_test)) 
print("训练数据准确度:",cancerlr.score(X_train, y_train)) 
'''
2.sklean中的逻辑回归
'''
 
from sklearn.linear_model import LogisticRegression
print("sklern中的逻辑回归:")
lr = LogisticRegression(solver="sag",max_iter=5000)
lr.fit(X_train,y_train)
y1_pre = lr.predict(X_test)
print("准确度:",lr.score(X_test,y_test))


代价函数迭代到最小值，退出！
收敛到: 0.20022241254225956
共迭代 1364 次!
测试数据准确度: 0.8421052631578947
训练数据准确度: 0.9221105527638191
sklern中的逻辑回归:
准确度: 0.8830409356725146


In [45]:
from sklearn import metrics
import pandas as pd


earrs = [
    [metrics.accuracy_score(y_test, y_predict),metrics.accuracy_score(y_test, y1_pre)],
    [metrics.precision_score(y_test, y_predict),metrics.precision_score(y_test, y1_pre)],
    [metrics.recall_score(y_test, y_predict),metrics.recall_score(y_test, y1_pre)],
    [metrics.f1_score(y_test, y_predict),metrics.f1_score(y_test, y1_pre)],
]

d1 = pd.DataFrame(data=earrs, index=['准确率', '精确率', '召回率', 'F1'], columns=['我的算法', 'sklearn-LR'])
d1

Unnamed: 0,我的算法,sklearn-LR
准确率,0.842105,0.883041
精确率,0.8,0.860656
召回率,1.0,0.972222
F1,0.888889,0.913043
