#### 机器学习模型训练

In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np


In [13]:
# Load the sample array from disk. Per the slicing below, columns 0-5 are used
# as the feature values and column 6 as the label of each sample.
path_numpy = 'data/Section-7/sams_array.npy'
sams = np.load(path_numpy)
sams_fea = sams[:, :6]
sams_label = sams[:, 6]
sams.shape


(635, 7)

In [14]:
# Algorithms that are sensitive to feature scale (e.g. SVM, neural networks)
# need normalized feature values.
# Min-max normalization: subtract the minimum before dividing by the range so
# the result lies in [0, 1]. Re-running this cell on already-normalized data
# leaves it unchanged (min is 0 and the range is 1).
# One global min/max is shared by all feature columns, which keeps the relative
# magnitudes between columns.
# NOTE(review): the statistics are computed over all samples, before the
# train/test split - confirm that is acceptable for this exercise.
fea_min, fea_max = sams_fea.min(), sams_fea.max()
if fea_max == fea_min:
    # A zero range would otherwise turn every value into NaN/inf.
    raise ValueError('sams_fea is constant; min-max normalization is undefined')
sams_fea = (sams_fea - fea_min) / (fea_max - fea_min)


##### 数据集划分：训练集和测试集

In [15]:
# Hold out 30% of the samples as the test set; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sams_fea,
    sams_label,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep='\n')
X_train


(444, 6)
(191, 6)


array([[0.14086875, 0.16216721, 0.15413358, 0.2608127 , 0.3489958 ,
        0.30667912],
       [0.03484353, 0.05576833, 0.05782345, 0.29350771, 0.16029893,
        0.09173284],
       [0.0281177 , 0.04306399, 0.03232135, 0.04007473, 0.01905652,
        0.01728164],
       ...,
       [0.03914059, 0.06221392, 0.05081738, 0.22344699, 0.1483419 ,
        0.08080336],
       [0.07323681, 0.11564689, 0.16384867, 0.31929005, 0.25604858,
        0.18187763],
       [0.0287716 , 0.03876693, 0.03063989, 0.03783279, 0.02765063,
        0.02326016]])

##### 模型构建、评估及训练

In [16]:
## Build the model and set its parameters. Optimal parameters should be found
## with a grid search; this course uses the default parameters throughout.
model_rf = RandomForestClassifier(random_state=42)
# Alternative classifier that can be swapped in for the random forest:
# model_rf = SVC(random_state=42)


In [17]:
## Optional step: evaluate the model on the training set with 5-fold
## cross-validation and display the mean of the fold scores.
scores = cross_val_score(model_rf, X_train, y_train, cv=5)
scores.mean()


0.9527579162410623

In [18]:
### Train the model on every sample of the training split (X_train / y_train).
model_rf = model_rf.fit(X_train, y_train)
model_rf


##### 模型预测及精度评估

In [19]:
# Predict a label for every sample of the test split with the fitted model.
y_test_pred = model_rf.predict(X_test)


In [20]:
### Per-class scores (average=None): precision corresponds to user's accuracy,
### recall corresponds to producer's accuracy.
precision = precision_score(y_test, y_test_pred, average=None)
recall = recall_score(y_test, y_test_pred, average=None)
print('precision scores:', precision)
print('recall scores:', recall)


precision scores: [1.         1.         0.97777778 0.93478261]
recall scores: [1.         0.96226415 0.95652174 1.        ]


In [21]:
### Overall accuracy on the test set.
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy Score:", accuracy)


Accuracy Score: 0.9790575916230366


In [22]:
import pickle

# Serialize the fitted model to a pickle file.
path_model = 'data/Section-7/model_rf.pickle'
with open(path_model, mode='wb') as file:
    pickle.dump(model_rf, file)
