In [1]:
import pandas as pd
import numpy as np
print('导入需要的包')

导入需要的包


In [4]:
# 1. 下载数据并添加列名
# 假设数据文件 'processed.cleveland.data' 已下载到当前目录
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
data = pd.read_csv(r'D:\desktop\实验作业\数据挖掘\实验四\heart+disease\processed.cleveland.data', header=None, names=column_names)

print("原始数据信息")
print(data.info())
print("原始数据预览")
print(data.head())




原始数据信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB
None
原始数据预览
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0

In [7]:
def calculate_entropy(data):
    """计算数据集的信息熵"""
    labels = data['num']
    total_samples = len(labels)
    if total_samples == 0:
        return 0
    
    # 统计各个类别的数量
    label_counts = labels.value_counts()
    entropy = 0.0
    for count in label_counts:
        prob = count / total_samples
        entropy -= prob * np.log2(prob)
    return entropy

def calculate_information_gain(data, feature):
    """计算指定特征的信息增益"""
    # 1. 计算总信息熵
    total_entropy = calculate_entropy(data)
    
    # 2. 计算条件熵
    total_samples = len(data)
    feature_values = data[feature].unique()
    conditional_entropy = 0.0
    
    for value in feature_values:
        subset = data[data[feature] == value]
        subset_samples = len(subset)
        prob = subset_samples / total_samples
        conditional_entropy += prob * calculate_entropy(subset)
        
    # 3. 计算信息增益
    information_gain = total_entropy - conditional_entropy
    return information_gain

# 计算根节点总熵
root_entropy = calculate_entropy(data)
print(f"根节点的信息熵: {root_entropy:.4f}\n")

# 计算三个属性的信息增益
gain_sex = calculate_information_gain(data, 'sex')
gain_fbs = calculate_information_gain(data, 'fbs')
gain_exang = calculate_information_gain(data, 'exang')

print(f"属性 'sex' 的信息增益: {gain_sex:.4f}")
print(f"属性 'fbs' 的信息增益: {gain_fbs:.4f}")
print(f"属性 'exang' 的信息增益: {gain_exang:.4f}")


根节点的信息熵: 1.8459

属性 'sex' 的信息增益: 0.0580
属性 'fbs' 的信息增益: 0.0185
属性 'exang' 的信息增益: 0.1501


In [11]:


def get_majority_class(data):
    """获取数据集中样本数最多的类别"""
    labels = data['num']
    if len(labels) == 0:
        # 如果子集为空，返回一个默认值或其父节点的最多数类别。
        # 为简化，这里我们返回一个最常见的类别，比如 0 (无病)。
        # 在一个更鲁棒的实现中，这个值应该从父节点继承。
        return 0
    return labels.value_counts().idxmax()

def choose_best_feature(data, features):
    """在可用特征中选择信息增益最大的特征"""
    best_feature = None
    max_gain = -1
    for feature in features:
        gain = calculate_information_gain(data, feature)
        if gain > max_gain:
            max_gain = gain
            best_feature = feature
    return best_feature

def create_tree(data, features):
    """递归构建决策树"""
    # 终止条件1: 类别纯净
    if len(data['num'].unique()) == 1:
        return data['num'].iloc[0]
    
    # 终止条件2: 特征耗尽
    if len(features) == 0:
        return get_majority_class(data)
    
    # 选择最佳划分属性
    best_feature = choose_best_feature(data, features)
    
    # 如果信息增益为0，也停止分裂
    if best_feature is None or calculate_information_gain(data, best_feature) < 1e-9: # 增加一个阈值防止浮点误差
        return get_majority_class(data)

    my_tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]
    
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        my_tree[best_feature][value] = create_tree(subset, remaining_features)
        
    return my_tree

def map_leaf_values(tree):
    """将叶子节点的值从 0/1 映射到 '无病'/'有病'"""
    if not isinstance(tree, dict):
        return '有病' if tree == 1 else '无病'
    
    node = list(tree.keys())[0]
    sub_tree = tree[node]
    for key, value in sub_tree.items():
        sub_tree[key] = map_leaf_values(value)
    return tree

# ----------------------------------------------------------------
# 执行本步骤的核心代码
# ----------------------------------------------------------------

# 假设 `data` 已经是预处理好的DataFrame
# （这里可以重新加载或直接使用上一步的 `data`）
# column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
# data = pd.read_csv('processed.cleveland.data', header=None, names=column_names)
# data = data.replace('?', np.nan)
# data.dropna(inplace=True)
# data['ca'] = data['ca'].astype(float)
# data['thal'] = data['thal'].astype(float)
# data['num'] = data['num'].apply(lambda x: 1 if x > 0 else 0)
# continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# for feature in continuous_features:
#     mean_val = data[feature].mean()
#     data[feature] = data[feature].apply(lambda x: 0 if x <= mean_val else 1)


# **核心步骤：只选择 'thal', 'ca', 'cp' 三个特征**
selected_features = ['thal', 'ca', 'cp']

# 使用这三个特征构建决策树
simple_heart_tree = create_tree(data, selected_features)

# 映射叶子节点值为可读文本
mapped_simple_tree = map_leaf_values(simple_heart_tree)

# 使用 pprint 美观地打印决策树
print("使用 'thal', 'ca', 'cp' 构建的决策树结构")
pprint.pprint(mapped_simple_tree)


使用 'thal', 'ca', 'cp' 构建的决策树结构
{'thal': {'3.0': {'ca': {'0.0': {'cp': {np.float64(1.0): '无病',
                                        np.float64(2.0): '无病',
                                        np.float64(3.0): '无病',
                                        np.float64(4.0): '无病'}},
                         '1.0': {'cp': {np.float64(1.0): '无病',
                                        np.float64(2.0): '无病',
                                        np.float64(3.0): '无病',
                                        np.float64(4.0): '有病'}},
                         '2.0': {'cp': {np.float64(1.0): '无病',
                                        np.float64(2.0): '无病',
                                        np.float64(3.0): '无病',
                                        np.float64(4.0): '无病'}},
                         '3.0': {'cp': {np.float64(3.0): '无病',
                                        np.float64(4.0): '无病'}},
                         '?': '无病'}},
          '6.0': {'ca': {'0.0': {'cp': {n