In [1]:
from src.data_processing import FeatureManager
from src.data_processing import Dataset
from src.optimization.model_selection import best_model_by_auc, threshold_by_mcc, best_threshold_by_mcc
from src.model import plot_mcc_variation, test_metrics
import os

**所有文件路径都可以直接改写为字符串（例如 `"data/processed/cleaned_data.csv"`），只要路径正确即可**

## 加载数据文件

In [2]:
# Location of the cleaned input table.
data_path = os.path.join("data", "processed", "cleaned_data.csv")

# Feature list handed to Dataset as `feature_names`.
# To change it, edit the self.features variable in
# src/data_processing/feature_manager.py.
fm = FeatureManager().get_all_features()

# Dataset arguments (as described by the notebook author):
#   file_path      - path of the data file
#   feature_names  - features to use (see the note on `fm` above)
#   target_col     - column holding the prediction target
#   target_mapping - converts target values to 1/0; here 'R' in the 'group'
#                    column becomes 1 and 'NR' becomes 0
#   split_col      - name of the column that marks an existing train/test
#                    split; its values must be exactly 'train' and 'test'
#   split_ratio    - only relevant when split_col is not given: Dataset then
#                    splits train/test randomly by this ratio (default 0.7)
dataset = Dataset(
    file_path=data_path,
    feature_names=fm,
    target_col='group',
    target_mapping={'R': 1, 'NR': 0},
    split_col='type',
)

# Short aliases for the two splits, used by the cells that follow.
X_train, y_train = dataset.X_train, dataset.y_train
X_test, y_test = dataset.X_test, dataset.y_test

## 如果你还没有运行过 main.py，请先取消下面这段代码的注释并运行（后续步骤需要 trained_models 文件夹中已保存的模型文件）

In [3]:
# This function runs n_split-fold validation and,
# within the hyperparameter ranges configured in src/optimization/search_config.py,
# looks for the hyperparameter combination with the best ROC-AUC for each model,
# then saves the model files to the trained_models folder.

#models = best_model_by_auc(dataset.X_train, dataset.y_train, n_split=5)

# This function searches for a threshold for the best models in the `models` result returned by the previous function:
# under n_split-fold validation it tries the thresholds in [min_thres, max_thres] spaced `interval` apart,
# computes how MCC varies with the threshold,
# and saves the curve to the mcc_threshold_curve folder.

#mcc_variation = threshold_by_mcc(models, dataset.X_train, dataset.y_train, n_split=5, min_thres=0.1, max_thres=0.9, interval=0.1)

## 基于mcc寻找模型最佳阈值

In [4]:
# Saved model file (.pkl) whose decision threshold is being tuned.
model_path = os.path.join("trained_models", "SVM_(RBF).pkl")

# Search settings.
n_split = 5      # number of validation folds; keep this small for small datasets
min_thres = 0.1  # smallest threshold to test
max_thres = 0.9  # largest threshold to test
interval = 0.1   # spacing between tested thresholds

# Evaluate MCC on the training split for each candidate threshold.
avg_mcc_scores = best_threshold_by_mcc(
    model_path,
    X_train,
    y_train,
    n_split=n_split,
    min_thres=min_thres,
    max_thres=max_thres,
    interval=interval,
)

In [5]:
# The return value is a dict of {threshold: corresponding MCC value}.
# Use this variable to draw whatever plots you need.
# NOTE(review): the output saved with this notebook lists only thresholds
# 0.4-0.8, while the search cell requests 0.1-0.9 in steps of 0.1; the saved
# output may be stale, or the helper may drop some thresholds. Re-run to confirm.
print(avg_mcc_scores)

{0.4: 0.10012523486435178, 0.5: 0.2521156440134337, 0.6: 0.4521391467207941, 0.7: 0.4034960864477526, 0.8: 0.143074484830962}


## 在测试集上评估模型

In [6]:
# Saved model file to evaluate and the threshold handed to test_metrics.
# NOTE(review): the recorded cross-validation output above shows the highest
# average MCC at threshold 0.6, not 0.5 - confirm that 0.5 is intended here.
model_to_test = os.path.join("trained_models", "SVM_(RBF).pkl")
threshold_to_test = 0.5

# Evaluate on the test split; returns the metric values and the confusion matrix.
metrics, confusion = test_metrics(
    model_to_test,
    dataset.X_test,
    dataset.y_test,
    threshold_to_test,
)

In [7]:
# One "name: value" line per metric, values rounded to four decimals.
print("Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

Metrics:
MCC: 0.3475
AUC: 0.8116
F1: 0.9362
Precision: 0.9167
Recall: 0.9565
Accuracy: 0.8846


In [8]:
# Heading and matrix in a single print call, separated by a newline.
# NOTE(review): judging from the recorded precision and recall, rows look like
# actual classes and columns like predicted classes, both ordered (0, 1) -
# confirm against test_metrics.
print("Confusion:", confusion, sep="\n")

Confusion:
[[ 2  4]
 [ 2 44]]
