# 一些sklearn模块的用法与功能

## 模块导入
``` Python
from sklearn.model_selection import train_test_split #数据集划分
from sklearn.preprocessing import StandardScaler # 特征缩放
from sklearn.linear_model import LogisticRegression # logistic regression 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # 评估指标

```

#### train_test_split
``` python 
from sklearn.model_selection import train_test_split

# Basic usage: a single call returns the train/test parts of X and of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the samples as the test set
    random_state=42,  # fixed random seed
    stratify=y,       # keep the class proportions of y in both parts
)


```

#### StandardScaler 特征缩放
原理是先中心化（减去均值），再按标准差缩放。标准化公式为：$$z = \frac{x - \mu}{\sigma}$$
其中 $\mu$ 是该特征的均值，$\sigma$ 是该特征的标准差。下面给出一个实现。


In [2]:
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Toy dataset: each row is one sample with two features, [age, annual income].
data = np.array([
    [25, 30000],   # age 25, income 30,000
    [30, 45000],   # age 30, income 45,000
    [35, 60000],   # age 35, income 60,000
    [40, 75000],   # age 40, income 75,000
    [45, 90000],   # age 45, income 90,000
    [50, 105000],  # age 50, income 105,000
    [55, 120000],  # age 55, income 120,000
    [60, 135000],  # age 60, income 135,000
])

# Feature matrix (same array object as `data`) and one class label per row.
X = data
y = np.array([0, 0, 0, 1, 1, 1, 1, 1])

# Dump the raw samples as a small two-column table.
print('raw_data', 'ages income', sep='\n')
for age, income in X:
    print(f"{age:2.0f} {income:6.0f}")

raw_data
ages income
25  30000
30  45000
35  60000
40  75000
45  90000
50 105000
55 120000
60 135000


#### 01-数据划分

In [None]:
# 01 - split the samples into a training part and a test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,   # hold out 25% of the samples for testing
    random_state=42,  # fixed random seed
)

# Print both parts with the same two-column layout.
for title, subset in (('TrainSet', X_train), ('TestSet', X_test)):
    print(title)
    print('age income')
    for age, income in subset:
        print(f"{age:2.0f} {income:6.0f}")
    

TrainSet
age income
25  30000
60 135000
35  60000
45  90000
40  75000
55 120000
TestSet
age income
30  45000
50 105000


#### 02-Standard 03-fit&transform

In [None]:
# 02 - create the scaler; nothing is learned until it is fitted below.
scaler = StandardScaler()

# 03 - fit on the training set, then transform both sets.
print('---Standard_Process---')
print('TrainSet_info')
# Column 0 holds age and column 1 holds income; report each feature's own
# statistics (the income line previously re-read column 0 by mistake).
print(f"age_maen: {X_train[:,0].mean():.1f}, std:{X_train[:,0].std():.1f}")
print(f"income_maen: {X_train[:,1].mean():.1f}, std:{X_train[:,1].std():.1f}")

# fit_transform learns the per-feature mean/scale from the training data
# and returns the standardized training data in one step.
X_train_scaled = scaler.fit_transform(X_train)

print("\n标准化器得到的参数")
print(f"mean_:{scaler.mean_}")
print(f"std:{scaler.scale_}")

print("\n After standard_TrainSet:")
print("age_sd income_sd")
for row in X_train_scaled:
    print(f"{row[0]:8.2f} {row[1]:8.2f}")

# Only transform the test set: it is scaled with the statistics learned
# from the training set, the scaler is not refitted on test data.
X_test_scaled = scaler.transform(X_test)

print("\n After stadard_TestSet:")
print("age_sd income_sd")
for row in X_test_scaled:
    print(f"{row[0]:8.2f} {row[1]:8.2f}")
    

#### 04-verifying-performance

In [None]:
# 04 - verify the effect of standardization.
print("---Standard_perfermance---")

# Statistics of the scaled training set, per feature (column 0 = age,
# column 1 = income).
print("TrainSet_statistical_info")
print(f"age mean{X_train_scaled[:,0].mean():.6f} std{X_train_scaled[:,0].std():.6f}")
print(f"income mean{X_train_scaled[:,1].mean():.6f} std{X_train_scaled[:,1].std():.6f}")

# Statistics of the scaled test set. Both the mean and the std must be read
# from X_test_scaled (the std was previously taken from the training set).
print("\nTestSet_satistical_info")
print(f"age mean{X_test_scaled[:,0].mean():.2f} std{X_test_scaled[:,0].std():.2f}")
print(f"income mean{X_test_scaled[:,1].mean():.2f} std{X_test_scaled[:,1].std():.2f}")

#### Logistic_Regression
**作用**：线性分类算法，通过 sigmoid 函数将特征的线性组合映射为概率值。  
**数学原理**：
$$
P(y = 1|x) = \frac{1}{1 + \exp\left(-(\beta_0 + \beta_1 x_1 + \cdots + \beta_n x_n)\right)}
$$

In [16]:
# Create the model.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42, max_iter=1000)

# Train on the standardized training set.
model.fit(X_train_scaled, y_train)

# predict returns the predicted class label of every test sample.
y_pred = model.predict(X_test_scaled)
# predict_proba returns the class probabilities instead of labels
# (previously this line called predict again, so y_prob duplicated y_pred).
y_prob = model.predict_proba(X_test_scaled)

print(y_pred)

[0 1]


#### 评估模块,判断准确率
```Python
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 4.1 Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# The placeholder must sit inside the f-string; the original line closed the
# string before `{accuracy:.3f}` and was a syntax error.
print(f"准确率: {accuracy:.3f}")

# 4.2 Confusion matrix of true labels vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("混淆矩阵")
print(cm)

# 4.3 Classification report.
report = classification_report(y_test, y_pred)
print(report)
```