#### 机器学习模型训练

In [11]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# precision and recall correspond to user's accuracy and producer's accuracy, respectively.
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split


In [12]:
# Load the sample array from disk, then take columns 0-5 as features and column 6 as labels.
path_numpy = 'data/Section-7/sams_array.npy'
sams = np.load(path_numpy)
sams_fea = sams[:, :6]
sams_label = sams[:, 6]
sams.shape


(362, 7)

In [13]:
### Feature normalization
### Algorithms that are sensitive to feature scale (e.g. SVM, neural networks) need normalized feature values.
### The scaling parameters below must match the ones used when the trained model is later applied for classification.
FEA_MIN, FEA_MAX = 0, 10000
# Derive the scaled features from the raw `sams` columns rather than from `sams_fea` itself,
# so that re-running this cell does not divide the features a second time.
sams_fea = sams[:, 0:6] / (FEA_MAX - FEA_MIN)
sams_fea


array([[0.1465, 0.1751, 0.1458, 0.1242, 0.1132, 0.1094],
       [0.1456, 0.174 , 0.1459, 0.1243, 0.1127, 0.1114],
       [0.1484, 0.1753, 0.146 , 0.1248, 0.1119, 0.1097],
       ...,
       [0.2084, 0.2454, 0.248 , 0.3894, 0.2975, 0.2412],
       [0.205 , 0.236 , 0.2344, 0.3832, 0.2933, 0.2344],
       [0.2036, 0.2396, 0.2388, 0.468 , 0.2957, 0.2454]])

##### 数据集划分：训练集和测试集

In [14]:
### Randomly split the samples into a training set and a test set (30% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    sams_fea,
    sams_label,
    test_size=0.3,
    random_state=42,
)
# Print both shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')
X_train


(253, 6)
(109, 6)


array([[0.1225, 0.1264, 0.1183, 0.1067, 0.1108, 0.1084],
       [0.278 , 0.3186, 0.3316, 0.3504, 0.3345, 0.3005],
       [0.1216, 0.1265, 0.1186, 0.1095, 0.113 , 0.1092],
       ...,
       [0.1684, 0.182 , 0.196 , 0.2208, 0.2714, 0.2305],
       [0.138 , 0.1532, 0.1476, 0.2848, 0.2452, 0.1825],
       [0.1276, 0.1447, 0.1283, 0.1077, 0.1117, 0.1089]])

##### 模型构建及训练

In [15]:
## Build the model and set its parameters (optimal parameters should be found via grid search; this course uses the defaults throughout).
## Only random_state is set explicitly here.
model_rf = RandomForestClassifier(random_state=42)
# model_svc = SVC(random_state=42)


In [None]:
### Train the model on the training samples.
# The value returned by fit() is bound back to model_rf.
model_rf = model_rf.fit(X=X_train, y=y_train)


##### 模型预测及精度评估

In [17]:
# Predict labels for the test-set features with model_rf.
y_test_pred = model_rf.predict(X_test)


In [18]:
### Compute the confusion matrix on the test set for labels 0 and 1.
### Per scikit-learn's convention, rows are the true labels and columns are the predicted labels.
### `confusion_matrix` is imported in the notebook's import cell alongside the other sklearn.metrics names.
confusion_matrix(
    y_true=y_test,
    y_pred=y_test_pred,
    labels=[0, 1],
)


array([[67,  0],
       [ 0, 42]])

In [19]:
### Test-set accuracy metrics: precision corresponds to user's accuracy, recall to producer's accuracy.
oa = accuracy_score(y_test, y_test_pred)  # overall accuracy on the test set
precision = precision_score(y_test, y_test_pred, average=None)  # per-class precision (user's accuracy)
recall = recall_score(y_test, y_test_pred, average=None)  # per-class recall (producer's accuracy)
for metric_name, metric_value in (
    ('overall accuracy:', oa),
    ('precision scores:', precision),
    ('recall scores:', recall),
):
    print(metric_name, metric_value)


overall accuracy: 1.0
precision scores: [1. 1.]
recall scores: [1. 1.]


In [None]:
# ### Save the model (disabled: every line in this cell is commented out)
# import pickle
# path_model = 'data/Section-7/model_rf.pickle'
# with open(path_model, 'wb') as file:
#   pickle.dump(model_rf, file)
