In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
import xgboost as xgb

In [20]:
# Alternative input file (disabled): 'datasets/20220328-or-eng-full.csv'
ds_name = 'pre_processing/output2.csv'
test_size = 0.3  # fraction of rows held out for the test split

data = pd.read_csv(ds_name)

# Optional column pruning, currently disabled:
# removed_cols = ['Postoperative Olanzapine', 'Postoperative Fluphenazine', 'Postoperative Flupentixol']
# data = data.drop(removed_cols, axis=1)

# Separate the target column ('Label') from the remaining feature columns.
target = data['Label'].values
features = data.drop(columns='Label').values

# Count positive and negative targets: label 1 is positive, every other value
# is counted as negative.
pos = int((target == 1).sum())
neg = len(target) - pos
print('Positive target: ', pos)
print('Negative target: ', neg)

print(features.shape)
print(target.shape)

Positive target:  68
Negative target:  1134
(1202, 34)
(1202,)


In [21]:
# Split the data into a training set and a test set; `test_size` is the
# held-out fraction and the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified although the recorded label counts
# are heavily imbalanced (68 positive vs 1134 negative) — consider
# `stratify=target`; confirm before changing, since it alters every result below.
(train_features, test_features,
 train_target, test_target) = train_test_split(
    features, target, random_state=42, test_size=test_size)

In [22]:
def test_model(outputs, features, target):
    # Test in all data
    outputs = outputs > 0.5

    success = np.sum(outputs == target)
    print('Success: {}/{}'.format(success, len(target)))

    acc = success / len(target)
    print('Accuracy: {:.2f}'.format(acc))

    # Confusion matrix
    TP = np.sum((outputs == 1) & (target == 1))
    TN = np.sum((outputs == 0) & (target == 0))
    FP = np.sum((outputs == 1) & (target == 0))
    FN = np.sum((outputs == 0) & (target == 1))

    print('TP: {}, TN: {}, FP: {}, FN: {}'.format(TP, TN, FP, FN))

    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    print('Sensitivity: {:.2f}, Specificity: {:.2f}'.format(sensitivity, specificity))

In [23]:
# Create the SVM classifier (class 1 weighted 10x relative to class 0) and
# train it on the training split.
# NOTE(review): StandardScaler is imported at the top of the notebook but the
# features reach the SVC unscaled here — confirm whether scaling already
# happened in pre-processing.
svm_classifier = svm.SVC(class_weight={0: 1, 1: 10}).fit(train_features, train_target)

# Predict on the test split.
svm_predictions = svm_classifier.predict(test_features)

# Report accuracy and the confusion matrix for the test split.
test_model(outputs=svm_predictions, features=test_features, target=test_target)

Success: 322/361
Accuracy: 0.89
TP: 11, TN: 311, FP: 26, FN: 13
Sensitivity: 0.46, Specificity: 0.92


In [24]:
# XGBoost parameters.
params = {
    'max_depth': 5,
    'eta': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'scale_pos_weight': 10
}

# Convert the train/test splits to DMatrix format.
dtest = xgb.DMatrix(test_features, label=test_target)
dtrain = xgb.DMatrix(train_features, label=train_target)

# Train the model.
# NOTE(review): num_boost_round is not passed, so the library default applies
# (10 rounds per the xgboost docs — confirm); with eta=0.01 that is very little
# training and may explain the lower accuracy recorded for this model.
xgb_classifier = xgb.train(params, dtrain)

# Predict on the test split; the scores are thresholded inside test_model.
xgb_predictions = xgb_classifier.predict(dtest)

# Report accuracy and the confusion matrix for the test split.
test_model(outputs=xgb_predictions, features=test_features, target=test_target)

Success: 261/361
Accuracy: 0.72
TP: 17, TN: 244, FP: 93, FN: 7
Sensitivity: 0.71, Specificity: 0.72
