#### 机器学习模型训练

In [33]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score  ## precision和recall对应用户精度和生产者精度。
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [34]:
# Load the sample array from disk. Columns 0-5 are taken as the feature
# values and column 6 as the class label of each sample.
path_numpy = 'data/Section-7/sams_array.npy'
sams = np.load(path_numpy)
sams_fea = sams[:, :6]
sams_label = sams[:, 6]
sams.shape


(480, 7)

In [35]:
### Feature scaling
### Algorithms that are sensitive to the magnitude of the feature values
### (e.g. SVM, neural networks) need the features to be scaled first.
### The scaled features are derived from `sams` instead of from `sams_fea`
### itself, so re-running this cell cannot divide already-scaled values again.
### NOTE(review): (10000-0) is presumably (max - min) of the raw value range -- confirm.
sams_fea = sams[:, 0:6]/(10000-0)   ### The scaling parameters must match those applied to the data at classification time.


##### 数据集划分：训练集和测试集

In [36]:
### Random partition of the samples: 70% for training, 30% held out for testing.
### The fixed random_state keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sams_fea,
    sams_label,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep='\n')
X_train


(336, 6)
(144, 6)


array([[0.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [0.1278, 0.1279, 0.1206, 0.115 , 0.1148, 0.1117],
       [0.1619, 0.1867, 0.1638, 0.1439, 0.1154, 0.113 ],
       ...,
       [0.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [0.1399, 0.1508, 0.1298, 0.1075, 0.1075, 0.1076]])

##### 模型构建、评估及训练

In [37]:
## Build the model and set its parameters. Optimal parameters should be found
## with a grid search; this course uses the default parameters throughout.
model_rf = RandomForestClassifier(
    random_state=42,
)
# model_svc = SVC(random_state=42)


In [38]:
### Train the model on the training samples.
### The result is assigned back to model_rf, which also keeps the cell from
### displaying the estimator as its output.
model_rf = model_rf.fit(X_train, y_train)



##### 模型预测及精度评估

In [40]:
# Predict the class label of every held-out test sample with the trained model.
y_test_pred = model_rf.predict(X=X_test)


In [41]:
### Compute the confusion matrix on the test set (nothing is plotted here).
### `labels=[0, 1]` fixes the row/column order to class 0, then class 1.
### `confusion_matrix` is imported in the import cell at the top of the notebook.
confusion_matrix(y_true=y_test,
                 y_pred=y_test_pred,
                 labels=[0, 1])


array([[78,  0],
       [ 0, 66]], dtype=int64)

In [42]:
### Accuracy assessment on the test set.
### Precision corresponds to user's accuracy and recall to producer's accuracy;
### average=None returns one score per class instead of a single averaged value.
oa = accuracy_score(y_test, y_test_pred)  ### overall accuracy (test set)
precision = precision_score(y_test, y_test_pred, average=None)  ## precision: user's accuracy
recall = recall_score(y_test, y_test_pred, average=None)  ### recall: producer's accuracy
print('overall accuracy:', oa)
print('precision scores:', precision)
print('recall scores:', recall)


overall accuracy: 1.0
precision scores: [1. 1.]
recall scores: [1. 1.]


In [None]:
# ### Save the trained model (disabled; uncomment to write model_rf to disk)
# ### NOTE(review): a pickle file should only be loaded back from a trusted source.
# import pickle
# path_model = 'data/Section-7/model_rf.pickle'
# with open(path_model, 'wb') as file:
#   pickle.dump(model_rf, file)
