# 逻辑斯谛回归

LR是经典的分类方法


回归模型：$f(x) = \frac{1}{1+e^{-wx}}$

其中 $wx$ 为线性函数：$wx = w_0 x_0 + w_1 x_1 + w_2 x_2 + \cdots + w_n x_n,\ (x_0 = 1)$


In [1]:
from math import exp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [2]:
# label parse
def parseRecord(df_record):
    """Encode the ``label`` column as consecutive integers.

    The distinct labels are taken from a group-by on ``label`` and numbered
    0, 1, 2, ... in that order; each ``name-->index`` pair is printed.

    Only the ``label`` column is rewritten. A frame-wide ``replace`` would
    also overwrite feature values that happen to equal a label, and applying
    the replacements one after another could re-map a value that had already
    been encoded.

    Args:
        df_record: DataFrame containing a ``label`` column.

    Returns:
        A copy of ``df_record`` whose ``label`` column holds the integer codes.
        The input frame is not modified.
    """
    label_names = list(df_record.groupby(by="label").groups.keys())
    mapping = {name: idx for idx, name in enumerate(label_names)}
    for name, idx in mapping.items():
        print(f"{name}-->{idx}")

    encoded = df_record.copy()
    # Single-pass lookup, so an encoded value can never be matched again.
    encoded["label"] = encoded["label"].map(mapping)
    return encoded

In [11]:
# data
def create_data(file_path,names):
    """Load the CSV at ``file_path`` and return features and labels.

    The file is read with ``names`` as its column names, the labels are
    encoded by ``parseRecord``, and only the first 100 rows are kept.

    Returns:
        A ``(features, labels)`` pair: every column but the last, and the
        last column, of those first 100 rows.
    """
    raw_frame = pd.read_csv(file_path, names=names)
    encoded_frame = parseRecord(raw_frame)
    subset = encoded_frame.iloc[:100]
    features = subset.iloc[:, :-1]
    labels = subset.iloc[:, -1]
    return features, labels

In [13]:
# Location of the Iris data file and the column names handed to pd.read_csv.
file_path = "../datas/iris.data"
names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

# create_data keeps only the first 100 rows: X is the feature columns, y the encoded label.
X, y = create_data(file_path,names)
# Hold out 30% of the samples for evaluation; no random_state is passed, so the split varies between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Iris-setosa-->0
Iris-versicolor-->1
Iris-virginica-->2


In [87]:
class LogisticReressionClassifier:
    """Binary logistic regression trained with per-sample gradient updates.

    Every sample is extended with a trailing constant 1.0, so the last entry
    of ``weights`` acts as the bias term. Labels are expected to be 0 or 1.
    """

    def __init__(self, max_iter=200, learning_rate=0.01):
        """Store the number of passes over the data and the step size."""
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        """Return ``1 / (1 + e**-x)`` for a scalar or single-element array.

        The exponential is only ever taken of a non-positive number, so a
        large negative input yields 0.0 instead of raising ``OverflowError``.
        """
        # Unwrap explicitly: handing a size-1 ndarray straight to math.exp
        # relies on an implicit array-to-scalar conversion NumPy deprecated.
        z = float(np.asarray(x, dtype=np.float64).item())
        if z >= 0:
            return 1.0 / (1.0 + exp(-z))
        exp_z = exp(z)
        return exp_z / (1.0 + exp_z)

    def data_matrix(self, X):
        """Return the rows of ``X`` as lists with a constant 1.0 appended.

        ``X`` may be a DataFrame or any other 2-D array-like.
        """
        return [[*row, 1.0] for row in np.asarray(X)]

    def fit(self, X, y):
        """Learn ``self.weights`` from samples ``X`` and 0/1 labels ``y``.

        Runs ``max_iter`` passes over the data, updating the weights after
        every single sample, then prints a summary line. Returns ``None``.

        Raises:
            ValueError: if ``X`` contains no samples.
        """
        rows = self.data_matrix(X)  # m samples, each with n features + bias
        if not rows:
            raise ValueError("X must contain at least one sample")

        samples = np.asarray(rows, dtype=np.float64)
        targets = np.asarray(y, dtype=np.float64)
        # float32 column vector, kept for compatibility with existing users.
        self.weights = np.zeros((samples.shape[1], 1), dtype=np.float32)

        for _ in range(self.max_iter):
            for sample, target in zip(samples, targets):
                activation = self.sigmoid(np.dot(sample, self.weights)[0])
                error = target - activation
                # Step along the log-likelihood gradient of this one sample.
                self.weights += self.learning_rate * error * sample.reshape(-1, 1)
        print(f'LogisticRegression Model(learning_rate={self.learning_rate},max_iter={self.max_iter})')

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` classified correctly.

        A sample counts as correct when the linear score is positive and the
        label is 1, or negative and the label is 0; a score of exactly zero
        is never counted as correct.

        Raises:
            ValueError: if ``X_test`` contains no samples.
        """
        rows = self.data_matrix(X_test)
        if not rows:
            raise ValueError("X_test must contain at least one sample")

        right = 0
        for row, label in zip(rows, y_test):
            margin = np.dot(row, self.weights)[0]
            if (margin > 0 and label == 1) or (margin < 0 and label == 0):
                right += 1
        return right / len(rows)

In [88]:
# Train the hand-written classifier with its defaults (max_iter=200, learning_rate=0.01).
lr_clf = LogisticReressionClassifier()
lr_clf.fit(X_train, y_train)

LogisticRegression Model(learning_rate=0.01,max_iter=200)


In [89]:
# Fraction of held-out samples the hand-written classifier labels correctly.
lr_clf.score(X_test, y_test)

1.0

## sklearn

### sklearn.linear_model.LogisticRegression

solver参数决定了我们对逻辑回归损失函数的优化方法，有四种算法可以选择，分别是：
- a) liblinear：使用了开源的liblinear库实现，内部使用了坐标轴下降法来迭代优化损失函数。
- b) lbfgs：拟牛顿法的一种，不直接计算海森矩阵，而是利用梯度信息近似损失函数的二阶导数矩阵（即海森矩阵）来迭代优化损失函数。
- c) newton-cg：也是牛顿法家族的一种，利用损失函数二阶导数矩阵即海森矩阵来迭代优化损失函数。
- d) sag：即随机平均梯度下降，是梯度下降法的变种，和普通梯度下降法的区别是每次迭代仅仅用一部分的样本来计算梯度，适合于样本数据多的时候。

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
# scikit-learn's logistic regression, capped at 200 iterations; all other settings are left at the library defaults.
clf = LogisticRegression(max_iter=200)

In [79]:
# Fit on the same training split used for the hand-written classifier.
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [80]:
# Mean accuracy of the scikit-learn model on the held-out split.
clf.score(X_test, y_test)

1.0

In [81]:
# Show the learned feature coefficients and the intercept of the fitted model.
print(clf.coef_, clf.intercept_)

[[-0.38166654 -1.36087692  2.04124436  0.90087909]] [-0.26009577]
