### 作业要求

1. 使用sklearn数据集训练逻辑回归模型，调整学习率，样本数据拆分比率，观察训练结果
2. 训练后,将模型参数保存到文件
3. 使用参数测试训练的模型，并预测结果
4. 总结逻辑回归运算及训练相关知识点

####  使用sklearn数据集训练逻辑回归的模型
1. 导入sklearn的数据集 

网址：https://scikit-learn.org/stable/api/index.html


2. 拆分数据集为训练集和测试集  7：3
3. 设定参数 ， 权重参数、超参数、训练次数
4. 模型的计算函数
5. 计算损失函数
6. 计算梯度  ： theta (weight parameters) 、bias(weight parameters) 
7. 模型训练过程



In [27]:
# 1. import sklearn datasets 
from sklearn.datasets import load_breast_cancer
# This function is a utility in scikit-learn that allows you to split your dataset into training and testing subsets
from sklearn.model_selection import train_test_split

import numpy as np

# save the dataset to a file
import joblib


# 2. 加载数据集，拆分数据集为训练集和测试集  7：3
X ,y = load_breast_cancer( return_X_y=True, as_frame=False)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.3)

# freatures: X.shape[1] ,X.shape[-1]
print("X's features:{}".format(X.shape[1]))

print("X_train:{}".format(X_train))
print("X_test:{}".format(X_test))

print("y_train:{}".format(y_train))
print("y_test:{}".format(y_test))

# save the datatset for testing to a file
test_dataset = {
    "X_test":X_test,
    "y_test":y_test
}
joblib.dump(test_dataset,"breast_cancer_test_dataset.pkl")


['breast_cancer_test_dataset.pkl']

In [19]:
# 3. 设定参数 ， 权重参数、超参数、训练次数

theta = np.random.randn(1,30)
bias = 0

#hyper parameters
lr = 0.1
# training times
epochs = 3000



In [22]:
# 4. 模型的计算函数
def forward(x,theta,bias):
    # linear operation (multiple)
    z =np.dot(theta,x.T) + bias
    # sigmoid
    y_hat = 1 / (1 + np.exp(-z)) 
    return y_hat


In [None]:

# 5. 计算损失函数 (loss function)
def loss(y,y_hat):
    e = 1e-8
    return -y * np.log(y_hat + e) - (1 -y ) * np.log(1 - y_hat + e)





In [None]:

# 6. 计算梯度  ： theta (weight parameters) 、bias(weight parameters) 
def calc_gradient(x,y,y_hat):
    # calculate gradient 
    m = x.shape[-1]
    # calculate gradient for theta
    delta_theta = np.dot((y_hat - y),x) / m
    # calculate gradient for bias
    delta_bias = np.mean(y_hat - y)
    return delta_theta, delta_bias




In [24]:
# 7. 模型训练过程

for i in range(epochs):
    # forward
    y_hat = forward(X_train,theta,bias)
    # calculate loss
    loss_val = loss(y_train,y_hat)
    # calculate gradient
    delta_theta, delta_bias = gradient(X_train,y_train,y_hat)
    # update theta and bias
    theta = theta -lr * delta_theta
    bias = bias - lr * delta_bias

    if i % 100 == 0:
        # calculate accuracy
        acc = np.mean(np.round(y_hat) == y_train)
        print(f"epoch:{i}, loss:{np.mean(loss_val)}, acc:{acc}")

NameError: name 'loss' is not defined

### 完整的代码
使用乳腺癌的数据集来训练逻辑回归模型，使用文件保存模型参数和保存测试参数

In [82]:
# 1. import sklearn datasets 
from sklearn.datasets import load_breast_cancer
# This function is a utility in scikit-learn that allows you to split your dataset into training and testing subsets
from sklearn.model_selection import train_test_split

import numpy as np

# save the dataset to a file
import joblib


# 2. 加载数据集，拆分数据集为训练集和测试集  7：3
X ,y = load_breast_cancer( return_X_y=True, as_frame=False)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.3)

# freatures: X.shape[1] ,X.shape[-1]
print("X's features:{}".format(X.shape[1]))

# print("X_train:{}".format(X_train))
# print("X_test:{}".format(X_test))

# print("y_train:{}".format(y_train))
# print("y_test:{}".format(y_test))

# save the datatset for testing to a file
test_dataset = {
    "X_test":X_test,
    "y_test":y_test
}
joblib.dump(test_dataset,"breast_cancer_test_dataset.pkl")



# 3. 设定参数 ， 权重参数、超参数、训练次数

theta = np.random.randn(1,30)
bias = 0

#hyper parameters
lr = 0.35# training times
epochs = 5000

# 4. 模型的计算函数
def forward(x,theta,bias):
    # linear operation (multiple)
    z =np.dot(theta,x.T) + bias
    # sigmoid
    y_hat = 1 / (1 + np.exp(-z)) 
    return y_hat

# 5. 计算损失函数 (loss function)
def loss(y,y_hat):
    e = 1e-8
    return -y * np.log(y_hat + e) - (1 -y ) * np.log(1 - y_hat + e)


# 6. 计算梯度  ： theta (weight parameters) 、bias(weight parameters) 
def calc_gradient(x,y,y_hat):
    # calculate gradient 
    m = x.shape[-1]
    # calculate gradient for theta
    delta_theta = np.dot((y_hat - y),x) / m
    # calculate gradient for bias
    delta_bias = np.mean(y_hat - y)
    return delta_theta, delta_bias

# 7. 模型训练过程
# acc = float(0)
# while True:
    
#     for i in range(epochs):
#         # forward
#         y_hat = forward(X_train,theta,bias)
#         # calculate loss
#         loss_val = loss(y_train,y_hat)
#         # calculate gradient
#         delta_theta, delta_bias = calc_gradient(X_train,y_train,y_hat)
#         # update theta and bias
#         theta = theta -lr * delta_theta
#         bias = bias - lr * delta_bias

#         if i % 1000 == 0:
#             # calculate accuracy
#             acc = np.mean(np.round(y_hat) == y_train)
#             print(f"epoch:{i}, loss:{np.mean(loss_val)}, acc:{acc}")
    
#     if acc >= 0.95:
#         break



# lr is variable
acc_history = float(0)  
while True:
    # forward
    y_hat = forward(X_train,theta,bias)
    # calculate loss
    loss_val = loss(y_train,y_hat)
    # calculate gradient
    delta_theta, delta_bias = calc_gradient(X_train,y_train,y_hat)
    # update theta and bias
    theta = theta -lr * delta_theta
    bias = bias - lr * delta_bias

    if i % 100 == 0:
        acc = np.mean(np.round(y_hat) == y_train)
        print(f"epoch:{i}, loss:{np.mean(loss_val)}, acc:{acc}")
        if acc > acc_history:
            lr = lr * 0.3 + lr
            
            print(f"lr:{lr}","acc_history:{acc_history}","acc:{acc}")
        else:
            lr = lr - lr * 0.3
            acc_history = acc
            print(f"lr:{lr}","acc_history:{acc_history}","acc:{acc}")

    if acc_history >= 0.95:
        break
            # save the model parameters to a file



X's features:30


  y_hat = 1 / (1 + np.exp(-z))


KeyboardInterrupt: 