In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# 1. Generate the dataset
# One seed shared by data generation, the train/test split and the weight
# initialisation, so re-running this cell reproduces the same numbers.
SEED = 42
# n_features: number of feature columns in X
X, y = make_classification(n_features=10, random_state=SEED)

# Split the dataset
# test_size: fraction of the samples held out for testing
# random_state: seed for the shuffle, so the split is identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)

# Initialise parameters
# One weight per feature column, drawn from a standard normal distribution
# with a locally seeded generator (the global NumPy RNG state is left alone).
theta = np.random.default_rng(SEED).standard_normal((1, X_train.shape[1]))
# Bias term
bias = 0
# Learning rate (hyperparameter)
lr = 0.01
# Number of training epochs
epochs = 1000


# 2. Model forward pass
def forward(x, theta, bias):
    """Compute sigmoid(np.dot(theta, x.T) + bias), the predicted probabilities.

    Args:
        x: Feature array. The training loop passes a 2-D array with one
            sample per row; the inference cell passes a single 1-D sample.
        theta: Weight array; this notebook initialises it with shape
            (1, n_features).
        bias: Scalar offset added to every linear score.

    Returns:
        Array of sigmoid activations in [0, 1], with the shape of
        ``np.dot(theta, x.T)``.
    """
    # Linear score (logit) for each sample.
    z = np.dot(theta, x.T) + bias
    # Numerically stable sigmoid: only exp(-|z|) is evaluated, which lies in
    # (0, 1] and therefore cannot overflow, unlike exp(-z) for very negative z.
    exp_neg_abs = np.exp(-np.abs(z))
    y_hat = np.where(
        z >= 0,
        1 / (1 + exp_neg_abs),            # same form as 1 / (1 + exp(-z))
        exp_neg_abs / (1 + exp_neg_abs),  # equals exp(z) / (1 + exp(z))
    )
    return y_hat

# 3. Loss function
def loss(y, y_hat):
    """Element-wise binary cross-entropy between labels and probabilities.

    Args:
        y: True labels (0 or 1).
        y_hat: Predicted probabilities, broadcastable against ``y``.

    Returns:
        The un-reduced loss, one value per broadcast element; callers take
        the mean themselves.
    """
    # Small constant added inside both logs so log(0) is never evaluated.
    eps = 1e-8
    positive_term = y * np.log(y_hat + eps)
    negative_term = (1 - y) * np.log(1 - y_hat + eps)
    return -positive_term - negative_term

# 4. Gradient computation
def cal_gradient(x, y, y_hat):
    """Gradients of the mean cross-entropy loss w.r.t. weights and bias.

    Args:
        x: 2-D feature array with one sample per row.
        y: True labels, one per sample.
        y_hat: Predicted probabilities as returned by ``forward``; the
            training loop passes shape (1, n_samples).

    Returns:
        Tuple ``(delta_theta, delta_bias)``: the weight gradient (result of
        ``np.dot(y_hat - y, x) / n_samples``) and the scalar bias gradient.
    """
    # Number of samples is the row count. The previous x.shape[-1] was the
    # feature count, which mis-scaled the weight gradient and disagreed with
    # the per-sample mean used for the bias gradient below.
    m = x.shape[0]
    # Residuals between predictions and labels.
    error = y_hat - y
    # Weight gradient, averaged over the samples.
    delta_theta = np.dot(error, x) / m
    # Bias gradient: mean residual.
    delta_bias = np.mean(error)
    return delta_theta, delta_bias


# 5. Model training (full-batch gradient descent)
for i in range(epochs):
    # Forward pass over the whole training set.
    y_hat = forward(X_train, theta, bias)

    # Per-sample losses; they are averaged only for the log line below.
    loss_value = loss(y_train, y_hat)
    # Accuracy of the predictions made before this epoch's update.
    acc = np.mean(np.round(y_hat) == y_train)

    # Gradient step: theta is updated in place, bias is rebound.
    delta_theta, delta_bias = cal_gradient(X_train, y_train, y_hat)
    theta -= lr * delta_theta
    bias -= lr * delta_bias

    print(f"epoch: {i}, loss: {np.mean(loss_value)}, acc: {acc}")


In [None]:
# 6. Inference on one randomly chosen test sample
idx = np.random.randint(len(X_test))
# Note: from here on x and y refer to this single sample and its label.
x, y = X_test[idx], y_test[idx]
# Round the predicted probability to a hard 0/1 class.
predict = np.round(forward(x, theta, bias))
print(f"predict: {predict}, true: {y}")

In [None]:
# iris数据集
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Data exploration: features as named columns plus the class label.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)
# First rows, summary statistics, and class counts, in that order.
for summary in (df.head(), df.describe(), df['target'].value_counts()):
    print(summary)

# Split into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model (fit returns the fitted estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out test set
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("准确率:", test_accuracy)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)

# Visualise the data: project all samples onto the first two principal
# components and colour each point by its true class label. (This shows the
# data only; the classifier's decision boundary is not drawn.)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.title('Iris Dataset (PCA)')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.show()