#### train_test_split 
``` python 
from sklearn.model_selection import train_test_split

# Basic usage: split features X and labels y into train and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the samples as the test set
    random_state=42,  # fixed random seed so the split is reproducible
    stratify=y,       # keep the class proportions of y in both subsets
)


```

#### StandardScaler 特征缩放
原理是先中心化(减去均值),再除以标准差进行缩放,标准化公式为:$$z = \frac{x - \mu}{\sigma}$$
其中 $\mu$ 是该特征的均值,$\sigma$ 是该特征的标准差。


In [1]:
# 使用sklearn库实现,特征缩放,logistic回归
import pandas as pd 
import numpy as np

# sklearn 导入需要的模块
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

# Load the raw breast-cancer CSV file into a DataFrame.
file_path = '../data/raw/breast_cancer_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (rendered as the notebook cell output).
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
# Step 1: separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')  # every remaining raw column is a feature

# Step 2: hold out 30% of the rows as the test set; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the (rows, columns) shape of each subset.
print(f"训练集大小{X_train.shape}")
print(f"测试集大小{X_test.shape}")

#### Feature Scaling 

In [3]:
# Step 1: create the scaler.
scaler = StandardScaler()

# Step 2: fit the scaler on the training split only, then standardize
# that same split.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Step 3: the test split is only transformed (never fitted), so it is scaled
# with the statistics learned from the training data.
X_test_scaled = scaler.transform(X_test)

# Show the first standardized training sample as a sanity check.
print('one row traning data', X_train_scaled[0])



one row traning data [-0.12348985 -0.29680142 -0.17050713 -0.20861569 -1.2016799  -0.7731696
 -0.76231194 -0.93324109 -1.22994935 -0.94816603 -0.53359339 -0.86028757
 -0.61678096 -0.39177533 -1.35556152 -0.52503193 -0.4817033  -0.97940018
 -0.88459317 -0.68548672 -0.19761978 -0.5067476  -0.30791001 -0.27357592
 -1.50742388 -0.44926047 -0.57223884 -0.84082156 -0.8563616  -0.76574773]


#### Training model 

In [4]:
# Build the classifier with a fixed random_state for reproducible results and
# train it on the standardized training features (feature matrix first, then
# the matching label vector). fit() returns the estimator itself, so the
# construction and training can be chained.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

print("model training Success!")


model training Success!


#### Predict 

In [5]:
# Step 1: predict class labels for the standardized test features.
y_pred = model.predict(X_test_scaled)

# Step 2: evaluate the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'accuracy_score{accuracy:.4f}')

# Confusion matrix of true labels versus predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision / recall / F1 summary.
print(classification_report(y_true=y_test, y_pred=y_pred))

accuracy_score0.9825
[[ 62   1]
 [  2 106]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98        63
           1       0.99      0.98      0.99       108

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

