# 心脏病预测模型对比

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [28]:
# 1. 加载数据（假设数据集为heart.csv）
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
df = pd.read_csv(url, header=None, na_values='?')
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
           'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df.columns = columns
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)  # 转换为二分类问题
df = df.dropna()

数据预处理
缺失值处理：删除包含缺失值的样本

特征工程：

数值特征：标准化（StandardScler）

类别特征：独热编码（OneHotEncoder）

In [29]:
# 2. 数据预处理
# 分类变量和数值变量
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
num_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ])

X = df.drop('target', axis=1)
y = df['target']
X_processed = preprocessor.fit_transform(X)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, stratify=y, random_state=42)

In [30]:
# 3. 构建神经网络模型
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.2, callbacks=[early_stop], verbose=0)

# 神经网络评估
y_pred_nn = (model.predict(X_test) > 0.5).astype(int)
acc_nn = accuracy_score(y_test, y_pred_nn)
f1_nn = f1_score(y_test, y_pred_nn)
auc_nn = roc_auc_score(y_test, y_pred_nn)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


In [31]:
# 4. 训练传统模型
# 逻辑回归
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_pred_lr)

# SVM（使用线性核）
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
auc_svm = roc_auc_score(y_test, y_pred_svm)

In [33]:
# 5. 结果对比
results = pd.DataFrame({
    'Model': ['Neural Network', 'Logistic Regression', 'SVM'],
    'Accuracy': [acc_nn, acc_lr, acc_svm],
    'F1 Score': [f1_nn, f1_lr, f1_svm],
    'AUC': [auc_nn, auc_lr, auc_svm]
})

print(results)

                 Model  Accuracy  F1 Score       AUC
0       Neural Network  0.850000  0.830189  0.845982
1  Logistic Regression  0.816667  0.784314  0.810268
2                  SVM  0.833333  0.807692  0.828125
