In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Seed NumPy's global random generator so that the np.random calls used later
# in this notebook (data generation, mini-batch sampling) are reproducible.
np.random.seed(1)

In [8]:
# Randomly generate sample data for binary classification:
# two 2-D Gaussian clusters sharing one covariance, 5000 samples per class.
num_observations = 5000
x1 = np.random.multivariate_normal(
    mean=[0, 0],
    cov=[[1, .75], [.75, 1]],
    size=num_observations,
)
x2 = np.random.multivariate_normal(
    mean=[1, 4],
    cov=[[1, .75], [.75, 1]],
    size=num_observations,
)

In [31]:
# Stack both classes row-wise into a single float32 feature matrix.
X = np.vstack([x1, x2]).astype(np.float32)
# Labels follow the row order of X: 0.0 for the x1 rows, then 1.0 for the x2 rows.
y = np.repeat([0.0, 1.0], num_observations)

In [32]:
# Sanity-check the array shapes: one class, the stacked features, the labels.
print(x1.shape, X.shape, y.shape, sep="\n")

(5000, 2)
(10000, 2)
(10000,)


In [13]:
def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: scalar or array-like of real values.

    Returns:
        The element-wise sigmoid of ``x``, with the same shape as ``x``.
    """
    # log(1 + exp(-x)) == logaddexp(0, -x); logaddexp never evaluates exp of a
    # large positive number, so very negative inputs no longer overflow
    # (the naive form triggers a RuntimeWarning from np.exp(-x)).
    return np.exp(-np.logaddexp(0, -x))

In [24]:
def log_likelihood(X,y,w,b):
    """
    Compute the negative log-likelihood (cross-entropy loss) of a logistic model.

    N: number of samples  D: feature dimension
    X: (N,D)
    y: (N,)  labels; only entries equal to 1 or 0 contribute to the loss
    w: (D,)
    b: scalar

    Returns:
        -sum[ y_i*log(sigmoid(w.x_i+b)) + (1-y_i)*log(1-sigmoid(w.x_i+b)) ]
    """
    y = np.asarray(y)
    logits = np.dot(X, w) + b

    # Use the softplus identities instead of log(sigmoid(.)) directly:
    #   -log(sigmoid(z))     == logaddexp(0, -z)
    #   -log(1 - sigmoid(z)) == logaddexp(0,  z)
    # The direct form returns inf once the sigmoid saturates to exactly 0 or 1.
    pos_loss = np.sum(np.logaddexp(0, -logits[y == 1]))
    neg_loss = np.sum(np.logaddexp(0, logits[y == 0]))

    return pos_loss + neg_loss


In [25]:
def logistic_regression_train(X, y, batch_size, num_steps, lr, log_every=10000):
    """
    Train logistic regression with mini-batch gradient descent.

    Args:
        X: feature matrix, shape (N, D).
        y: labels, shape (N,).
        batch_size: number of row indices drawn for each step.
        num_steps: number of gradient updates to perform.
        lr: learning rate; it multiplies the summed (not averaged) batch gradient.
        log_every: print the loss on the full data every ``log_every`` steps.
            Defaults to 10000 (the interval that used to be hard-coded);
            pass 0 or None to disable logging.

    Returns:
        Tuple ``(w, b)``: weight vector of shape (D,) and scalar bias.
    """
    w,b = np.zeros(X.shape[1]), 0
    for step in range(num_steps):
        # Draw the row indices of this mini-batch (np.random.choice samples
        # with replacement by default, so a row can appear more than once).
        batch = np.random.choice(X.shape[0],batch_size)
        # Training data for this step
        X_batch, y_batch = X[batch], y[batch]

        # Gradient of the negative log-likelihood: (y_hat - y) * x
        error = sigmoid(np.dot(X_batch, w)+b) - y_batch
        grad_w = np.matmul(X_batch.T, error)
        grad_b = np.sum(error)

        w = w - lr * grad_w
        b = b - lr * grad_b

        # Report progress; the printed value is what log_likelihood returns
        # for the full data set (X, y), not just for the current batch.
        if log_every and step % log_every == 0:
            print("log likelihodd: ",log_likelihood(X,y,w,b))
    return w, b

In [26]:
# Fit the from-scratch model on the full data set.
w, b = logistic_regression_train(
    X,
    y,
    batch_size=100,
    num_steps=500_000,
    lr=5e-4,
)

log likelihodd:  6419.139226954701
log likelihodd:  262.4370478618892
log likelihodd:  219.00758264431036
log likelihodd:  203.11520791664208
log likelihodd:  194.8640961717476
log likelihodd:  189.7494168443975
log likelihodd:  186.39211382161773
log likelihodd:  184.0840011426568
log likelihodd:  182.3301176646583
log likelihodd:  181.08497032781776
log likelihodd:  180.2518607149372
log likelihodd:  179.30466305390422
log likelihodd:  178.8039889038197
log likelihodd:  178.1869077287103
log likelihodd:  177.8546069070652
log likelihodd:  177.42234499592757
log likelihodd:  177.19025721379504
log likelihodd:  176.95960956424963
log likelihodd:  176.8820871070047
log likelihodd:  176.6243105216392
log likelihodd:  176.4756631709221
log likelihodd:  176.3617810523368
log likelihodd:  176.27659723645303
log likelihodd:  176.2072233853113
log likelihodd:  176.22254066633758
log likelihodd:  176.08732573597004
log likelihodd:  176.01763638488274
log likelihodd:  175.954387820399
log likel

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
clf = LogisticRegression(fit_intercept=True, C = 1e15) # C is set very large because we do not want to add regularization
clf.fit(X, y)

LogisticRegression(C=1000000000000000.0)

In [29]:
# Print the parameters from the from-scratch trainer next to the coefficients
# and intercept of the fitted scikit-learn model for comparison.
print("self 结果 ", w, b)
print("slearn 结果 ", clf.coef_, clf.intercept_)

self 结果  [-4.5741644   7.31394704] -12.259032729459491
slearn 结果  [[-4.63907718  7.41733945]] [-12.39910964]
