## 数据集的加载

In [3]:
from ucimlrepo import fetch_ucirepo 

# Download dataset id 45 from the UCI ML repository.
heart_disease = fetch_ucirepo(id=45)

# The feature table and the target table (exposed as pandas DataFrames).
X, y = heart_disease.data.features, heart_disease.data.targets

# Dataset-level metadata and per-variable descriptions can be inspected via
# heart_disease.metadata and heart_disease.variables when needed.

# Convert to NumPy: the full feature matrix, and the first target column
# flattened into a 1-D label vector.
X_array = X.to_numpy()
y_array = y.to_numpy()[:, 0]

print(X_array)
print(y_array)

[[63.  1.  1. ...  3.  0.  6.]
 [67.  1.  4. ...  2.  3.  3.]
 [67.  1.  4. ...  2.  2.  7.]
 ...
 [57.  1.  4. ...  2.  1.  7.]
 [57.  0.  2. ...  2.  1.  3.]
 [38.  1.  3. ...  1. nan  3.]]
[0 2 1 0 0 0 3 0 2 1 0 0 2 0 0 0 1 0 0 0 0 0 1 3 4 0 0 0 0 3 0 2 1 0 0 0 3
 1 3 0 4 0 0 0 1 4 0 4 0 0 0 0 2 0 1 1 1 1 0 0 2 0 1 0 2 2 1 0 2 1 0 3 1 1
 1 0 1 0 0 3 0 0 0 3 0 0 0 0 0 0 0 3 0 0 0 1 2 3 0 0 0 0 0 0 3 0 2 1 2 3 1
 1 0 2 2 0 0 0 3 2 3 4 0 3 1 0 3 3 0 0 0 0 0 0 0 0 4 3 1 0 0 1 0 1 0 1 4 0
 0 0 0 0 0 4 3 1 1 1 2 0 0 4 0 0 0 0 0 0 1 0 3 0 1 0 4 1 0 1 0 0 3 2 0 0 1
 0 0 2 1 2 0 3 1 2 0 3 0 0 0 1 0 0 0 0 0 3 3 3 0 1 0 4 0 3 1 0 0 0 0 0 0 0
 0 3 1 0 0 0 3 2 0 2 1 0 0 3 2 1 0 0 0 0 0 2 0 2 2 1 3 0 0 1 0 0 0 0 0 0 0
 1 0 3 0 0 4 2 2 2 1 0 1 0 2 0 1 0 0 0 1 0 2 0 3 0 2 4 2 0 0 0 1 0 2 2 1 0
 3 1 1 2 3 1 0]


In [4]:
from array import array

def find_duplicates(arrays, threshold=0.85):
    """
    Find the index pairs of near-duplicate individuals.

    Each individual is treated as a sequence of 0/1 integers. Two individuals
    ``a`` and ``b`` are reported as duplicates when the number of positions
    where both hold a 1 is strictly greater than ``threshold`` as a fraction
    of the ones in ``a`` AND as a fraction of the ones in ``b``. Individuals
    that contain no ones are never reported.

    :param arrays: list of ``array.array`` objects holding 0/1 values
    :param threshold: overlap ratio that must be strictly exceeded on both sides
    :return: list of ``(i, j)`` index pairs with ``i < j``, ordered by ``i`` then ``j``
    """
    n = len(arrays)
    # Count the ones of every individual exactly once; the pairwise loop below
    # would otherwise re-sum the same arrays for every pair it visits.
    ones_counts = [sum(individual) for individual in arrays]
    duplicates = []  # collected (i, j) pairs

    for i in range(n):
        ones_a = ones_counts[i]
        # An individual without ones can never satisfy the ratio test.
        if ones_a == 0:
            continue
        a = arrays[i]

        for j in range(i + 1, n):
            ones_b = ones_counts[j]
            if ones_b == 0:
                continue

            # Number of positions where both individuals hold a 1.
            common_ones = sum(x & y for x, y in zip(a, arrays[j]))

            # The shared ones must dominate both individuals, not just one.
            if (common_ones / ones_a > threshold) and (common_ones / ones_b > threshold):
                duplicates.append((i, j))

    return duplicates

def remove_duplicates(arrays, duplicates):
    """
    Drop the individuals that were flagged as duplicates.

    For every ``(first, second)`` pair only the second index is discarded, so
    the earlier member of each pair is the one that stays.

    :param arrays: list of individuals (``array.array`` objects)
    :param duplicates: list of ``(i, j)`` index pairs marking duplicates
    :return: new list with the remaining individuals in their original order
    """
    # Indices to discard: the later member of each duplicate pair.
    dropped = {later for _, later in duplicates}

    survivors = []
    for position, individual in enumerate(arrays):
        if position in dropped:
            continue
        survivors.append(individual)
    return survivors

# Example input: four 5-element individuals; the last repeats the first.
arrays = [
    array('b', bits)
    for bits in (
        [1, 0, 1, 0, 1],
        [1, 1, 1, 0, 0],
        [0, 1, 0, 1, 1],
        [1, 0, 1, 0, 1],
    )
]

# Locate the index pairs that count as duplicates.
duplicates = find_duplicates(arrays)
print("重复对的索引:", duplicates)

# Drop the duplicated individuals and show what is left.
deduplicated_arrays = remove_duplicates(arrays, duplicates)
print("去重后的数组:")
for arr in deduplicated_arrays:
    print(list(arr))


重复对的索引: [(0, 3)]
去重后的数组:
[1, 0, 1, 0, 1]
[1, 1, 1, 0, 0]
[0, 1, 0, 1, 1]
