In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
print('导入需要的包')

导入需要的包


In [2]:
# Column labels for the data file, which carries no header row of its own.
columns = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'num',
]
# Literal '?' cells in the file are parsed as NaN.
df = pd.read_csv(
    'data/processed.cleveland.data',
    names=columns,
    header=None,
    na_values='?',
)


In [3]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [4]:
# Impute the gaps in 'ca' and 'thal' with each column's most frequent value.
sparse_cols = ['ca', 'thal']
# DataFrame.mode() lists the modes row-wise; row 0 holds the first mode of each column.
most_common = df[sparse_cols].mode().iloc[0]
df[sparse_cols] = df[sparse_cols].fillna(most_common)

In [5]:
# Re-count missing entries per column to confirm the imputation left none.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [6]:

continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Discretise every continuous feature into 4 equal-width bins; labels=False
# keeps the integer bin index rather than an Interval label.
binned_features = {
    name: pd.cut(df[name], bins=4, labels=False, include_lowest=True)
    for name in continuous_cols
}

# Re-code every categorical feature as consecutive integers starting at 0,
# using a fresh encoder per column.
encoded_features = {
    name: LabelEncoder().fit_transform(df[name])
    for name in categorical_cols
}

# assign() works on a copy, so the raw frame `df` is left untouched.
df_processed = df.assign(**binned_features, **encoded_features)

In [9]:
# Separate the 'num' target from the features and force integer dtype on both.
y = df_processed['num'].astype(int)
X = df_processed.drop(columns='num').astype(int)

# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Fit a categorical naive Bayes classifier on the training split.
model = CategoricalNB()
model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy:.4f}")
print("分类报告:")
# zero_division=0: precision is undefined for a class that is never predicted.
# Report 0.0 for such classes explicitly instead of letting scikit-learn emit
# an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))
print("混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

模型准确率: 0.5410
分类报告:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        33
           1       0.27      0.36      0.31        11
           2       0.00      0.00      0.00         7
           3       0.17      0.14      0.15         7
           4       0.00      0.00      0.00         3

    accuracy                           0.54        61
   macro avg       0.26      0.27      0.26        61
weighted avg       0.54      0.54      0.54        61

混淆矩阵:
[[28  4  1  0  0]
 [ 4  4  3  0  0]
 [ 0  3  0  4  0]
 [ 0  2  4  1  0]
 [ 0  2  0  1  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# ---------------------- "Ultimate solution" code: start ----------------------
#
# Goal: make the training set contain every category that each feature takes
# anywhere in the full feature matrix X, so that the fitted CategoricalNB has
# a slot for any category code that shows up at prediction time.

# Work on copies so the original X_train / y_train stay untouched.
x_train_fixed = X_train.copy()
y_train_fixed = y_train.copy()

# Every category each feature takes in the FULL data set (train + test).
all_categories = {col: X[col].unique() for col in X.columns}

# Collect all dummy rows first and append them in a single concat afterwards;
# concatenating inside the loop would re-copy the whole frame on every row.
dummy_rows = []
for col in X.columns:
    # Categories present in X but absent from the training split.
    missing_categories = set(all_categories[col]) - set(X_train[col].unique())

    if missing_categories:
        print(f"在训练集中为特征 '{col}' 手动添加缺失的类别: {missing_categories}")

        # One dummy sample per missing category (sorted for a stable row order).
        for cat in sorted(missing_categories):
            # All features default to 0 ...
            dummy_row = dict.fromkeys(X.columns, 0)
            # ... except the current feature, which carries the missing category.
            dummy_row[col] = cat
            dummy_rows.append(dummy_row)

if dummy_rows:
    x_train_fixed = pd.concat(
        [x_train_fixed, pd.DataFrame(dummy_rows, columns=X.columns)],
        ignore_index=True,
    )
    # Every dummy sample is labelled class 0.
    # NOTE: these rows are real training samples as far as the model is
    # concerned, so they slightly shift the class-0 prior and feature counts.
    # CategoricalNB(min_categories=...) is the alternative that reserves the
    # category slots without adding samples.
    y_train_fixed = pd.concat(
        [y_train_fixed, pd.Series([0] * len(dummy_rows), dtype=y_train_fixed.dtype)],
        ignore_index=True,
    )


print("--- 使用修复后的训练集进行训练 ---")
# Train on the augmented training set. min_categories is not passed because
# the augmented set already contains every category found in X.
model = CategoricalNB()
model.fit(x_train_fixed, y_train_fixed)

# Predict on the original, unmodified test split.
y_pred = model.predict(X_test)

print("预测成功！")

# --- Evaluate the model ---
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"模型准确率: {accuracy:.4f}")
print("混淆矩阵:")
print(conf_matrix)

# ---------------------- "Ultimate solution" code: end ----------------------
