从总数据集中筛选样本，使各元素出现次数平衡

In [2]:
import csv
import random
import numpy as np
from collections import defaultdict

# Input/output configuration for this cell.
input_filename = '345.csv'
output_filename = 'filtered_output.csv'
target_rows = 180

# Columns whose presence (value > 0) should be balanced across the selection.
element_columns = ['Co', 'Cr', 'Cu', 'Fe', 'Mn', 'Ni', 'V']


def build_presence_matrix(rows, columns):
    """Return one 0/1 vector per row; 1 means the column's value is > 0.

    Args:
        rows: Iterable of mappings (e.g. ``csv.DictReader`` rows) whose
            values for ``columns`` can be parsed by ``float``.
        columns: Column names to inspect, in output order.

    Raises:
        KeyError: If a row lacks one of ``columns``.
        ValueError: If a value cannot be parsed as a float.
    """
    return [[1 if float(row[col]) > 0 else 0 for col in columns] for row in rows]


def imbalance_score(counts, presence, target, under_weight=3.0, over_weight=1.0):
    """Score the element counts that would result from adding one row.

    Each element contributes its absolute distance from ``target``; a count
    below the target is weighted by ``under_weight`` and a count at or above
    it by ``over_weight``. Lower is better.

    Args:
        counts: Current per-element counts (not modified).
        presence: 0/1 vector of the candidate row, aligned with ``counts``.
        target: Desired count per element.
        under_weight: Multiplier for shortfalls (3x by default).
        over_weight: Multiplier for surpluses (1x by default).
    """
    score = 0
    for count, present in zip(counts, presence):
        if present:
            count += 1
        if count < target:
            score += (target - count) * under_weight
        else:
            score += (count - target) * over_weight
    return score


def select_balanced_rows(element_matrix, num_elements, target_rows, target_per_element):
    """Greedily pick rows so that every element's count approaches the target.

    On each pass every not-yet-chosen row is scored with ``imbalance_score``
    and the lowest-scoring one is taken. Rows are scanned in ascending index
    order and only a strictly smaller score replaces the current best, so
    ties go to the lowest index and the result is deterministic.

    Args:
        element_matrix: List of 0/1 presence vectors, one per row.
        num_elements: Length of each presence vector.
        target_rows: Number of rows to select; capped at the rows available.
        target_per_element: Desired final count for each element.

    Returns:
        ``(selected_indices, element_counts)`` where ``selected_indices`` is
        in selection order and ``element_counts`` holds the per-element
        totals over the selected rows.
    """
    selected_indices = []
    chosen = set()  # O(1) membership test instead of scanning the list
    element_counts = [0] * num_elements

    for _ in range(min(target_rows, len(element_matrix))):
        best_idx = None
        best_score = float('inf')
        for idx, presence in enumerate(element_matrix):
            if idx in chosen:
                continue
            score = imbalance_score(element_counts, presence, target_per_element)
            if score < best_score:
                best_score = score
                best_idx = idx

        if best_idx is None:
            # No candidate produced a finite score; further passes cannot help.
            break

        chosen.add(best_idx)
        selected_indices.append(best_idx)
        for i, present in enumerate(element_matrix[best_idx]):
            if present:
                element_counts[i] += 1

    return selected_indices, element_counts


# A notebook cell runs with __name__ == "__main__", so this still executes
# when the cell is run; the guard only keeps the helpers importable without
# touching the filesystem.
if __name__ == "__main__":
    # Load every row; keep the header while the file is still open.
    with open(input_filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames
        data_rows = list(reader)

    if not data_rows:
        # Previously this surfaced as a bare ZeroDivisionError further down.
        raise ValueError(f"{input_filename} 中没有数据行")

    # Presence matrix (1 = element present) and per-row element counts.
    element_matrix = build_presence_matrix(data_rows, element_columns)
    element_count_per_row = [sum(vector) for vector in element_matrix]

    # Dataset-wide statistics.
    total_elements = sum(element_count_per_row)
    avg_per_row = total_elements / len(data_rows)
    print(f"数据集统计: 总行数={len(data_rows)}, 平均每行元素数={avg_per_row:.2f}")

    # Expected number of element occurrences in `target_rows` rows, split
    # evenly across the element columns.
    estimated_total = avg_per_row * target_rows
    target_per_element = estimated_total / len(element_columns)
    print(f"目标: 筛选{target_rows}行, 估计总元素出现次数={estimated_total:.1f}, 每种元素目标出现次数={target_per_element:.1f}")

    selected_indices, element_counts = select_balanced_rows(
        element_matrix, len(element_columns), target_rows, target_per_element
    )

    # Per-element totals and their distance from the target.
    print("\n最终元素出现次数统计:")
    for name, count in zip(element_columns, element_counts):
        diff = count - target_per_element
        print(f"{name}: {count}次 ({diff:+.1f} 差异)")

    # Overall totals and spread around the target.
    total_selected = sum(element_counts)
    variance = sum((count - target_per_element) ** 2 for count in element_counts)
    std_dev = (variance / len(element_columns)) ** 0.5

    # Average over the rows actually selected, which may be fewer than
    # target_rows when the input is small.
    rows_selected = len(selected_indices)
    avg_selected = total_selected / rows_selected if rows_selected else 0.0
    print(f"\n总结: 总元素出现次数={total_selected}, 平均每行元素={avg_selected:.2f}")
    print(f"元素分布标准差={std_dev:.2f}")

    # Write the selected rows, in selection order, with the original header.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for idx in selected_indices:
            writer.writerow(data_rows[idx])

    print(f"\n已筛选 {len(selected_indices)} 行数据保存到 {output_filename}")

数据集统计: 总行数=5589, 平均每行元素数=4.67
目标: 筛选180行, 估计总元素出现次数=840.6, 每种元素目标出现次数=120.1

最终元素出现次数统计:
Co: 126次 (+5.9 差异)
Cr: 120次 (-0.1 差异)
Cu: 159次 (+38.9 差异)
Fe: 122次 (+1.9 差异)
Mn: 120次 (-0.1 差异)
Ni: 139次 (+18.9 差异)
V: 86次 (-34.1 差异)

总结: 总元素出现次数=872, 平均每行元素=4.84
元素分布标准差=20.95

已筛选 180 行数据保存到 filtered_output.csv


在限定主元数量的条件下筛选样本，使各元素出现次数平衡

筛选3元

In [None]:
import csv
import random
import numpy as np
from collections import defaultdict

# Input/output configuration for this cell.
input_filename = '3元.csv'
output_filename = 'filtered_3.csv'
target_rows = 60
# Elements per row used to size the target (the original target was
# 60 rows * 3 elements/row / 7 elements).
elements_per_row = 3

# Columns whose presence (value > 0) should be balanced across the selection.
element_columns = ['Co', 'Cr', 'Cu', 'Fe', 'Mn', 'Ni', 'V']

# Desired number of selected rows containing each element; derived from the
# settings above so it cannot drift out of sync with them (= 180 / 7 here).
target_count = target_rows * elements_per_row / len(element_columns)


def build_presence_matrix(rows, columns):
    """Return one 0/1 vector per row; 1 means the column's value is > 0.

    Args:
        rows: Iterable of mappings (e.g. ``csv.DictReader`` rows) whose
            values for ``columns`` can be parsed by ``float``.
        columns: Column names to inspect, in output order.

    Raises:
        KeyError: If a row lacks one of ``columns``.
        ValueError: If a value cannot be parsed as a float.
    """
    return [[1 if float(row[col]) > 0 else 0 for col in columns] for row in rows]


def squared_imbalance(counts, presence, target):
    """Sum of squared distances from ``target`` after adding one row.

    Args:
        counts: Current per-element counts (not modified).
        presence: 0/1 vector of the candidate row, aligned with ``counts``.
        target: Desired count per element.
    """
    return sum(
        (count + (1 if present else 0) - target) ** 2
        for count, present in zip(counts, presence)
    )


def select_balanced_rows(element_matrix, num_elements, target_rows, target_count):
    """Greedily pick rows so that every element's count approaches the target.

    On each pass every not-yet-chosen row is scored with
    ``squared_imbalance`` and the lowest-scoring one is taken. Rows are
    scanned in ascending index order and only a strictly smaller score
    replaces the current best, so ties go to the lowest index.

    Args:
        element_matrix: List of 0/1 presence vectors, one per row.
        num_elements: Length of each presence vector.
        target_rows: Number of rows to select; capped at the rows available.
        target_count: Desired final count for each element.

    Returns:
        ``(selected_indices, element_counts)`` where ``selected_indices`` is
        in selection order and ``element_counts`` holds the per-element
        totals over the selected rows.
    """
    selected_indices = []
    chosen = set()  # O(1) membership test instead of scanning the list
    element_counts = [0] * num_elements

    for _ in range(min(target_rows, len(element_matrix))):
        best_idx = None
        best_score = float('inf')
        for idx, presence in enumerate(element_matrix):
            if idx in chosen:
                continue
            score = squared_imbalance(element_counts, presence, target_count)
            if score < best_score:
                best_score = score
                best_idx = idx

        if best_idx is None:
            # No candidate produced a finite score; further passes cannot help.
            break

        chosen.add(best_idx)
        selected_indices.append(best_idx)
        for i, present in enumerate(element_matrix[best_idx]):
            if present:
                element_counts[i] += 1

    return selected_indices, element_counts


# A notebook cell runs with __name__ == "__main__", so this still executes
# when the cell is run; the guard only keeps the helpers importable without
# touching the filesystem.
if __name__ == "__main__":
    # Load every row; keep the header while the file is still open.
    with open(input_filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames
        data_rows = list(reader)

    # Presence matrix (1 = element present).
    element_matrix = build_presence_matrix(data_rows, element_columns)

    selected_indices, element_counts = select_balanced_rows(
        element_matrix, len(element_columns), target_rows, target_count
    )

    # Per-element totals over the selected rows.
    print("最终元素出现次数统计:")
    for name, count in zip(element_columns, element_counts):
        print(f"{name}: {count}次")

    # Write the selected rows, in selection order, with the original header.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for idx in selected_indices:
            writer.writerow(data_rows[idx])

    print(f"\n已筛选 {len(selected_indices)} 行数据保存到 {output_filename}")

最终元素出现次数统计:
Co: 27次
Cr: 23次
Cu: 29次
Fe: 27次
Mn: 23次
Ni: 28次
V: 23次

已筛选 60 行数据保存到 filtered_3.csv


筛选4元

In [4]:
import csv
import random
import numpy as np
from collections import defaultdict

# Input/output configuration for this cell.
input_filename = '4元.csv'
output_filename = 'filtered_4.csv'
target_rows = 60
# Elements per row used to size the target (the original target was
# 60 rows * 4 elements/row / 7 elements).
elements_per_row = 4

# Columns whose presence (value > 0) should be balanced across the selection.
element_columns = ['Co', 'Cr', 'Cu', 'Fe', 'Mn', 'Ni', 'V']

# Desired number of selected rows containing each element; derived from the
# settings above so it cannot drift out of sync with them (= 240 / 7 here).
target_count = target_rows * elements_per_row / len(element_columns)


def build_presence_matrix(rows, columns):
    """Return one 0/1 vector per row; 1 means the column's value is > 0.

    Args:
        rows: Iterable of mappings (e.g. ``csv.DictReader`` rows) whose
            values for ``columns`` can be parsed by ``float``.
        columns: Column names to inspect, in output order.

    Raises:
        KeyError: If a row lacks one of ``columns``.
        ValueError: If a value cannot be parsed as a float.
    """
    matrix = []
    for row in rows:
        matrix.append([1 if float(row[col]) > 0 else 0 for col in columns])
    return matrix


def squared_imbalance(counts, presence, target):
    """Sum of squared distances from ``target`` after adding one row.

    Args:
        counts: Current per-element counts (not modified).
        presence: 0/1 vector of the candidate row, aligned with ``counts``.
        target: Desired count per element.
    """
    return sum(
        (count + (1 if present else 0) - target) ** 2
        for count, present in zip(counts, presence)
    )


def select_balanced_rows(element_matrix, num_elements, target_rows, target_count):
    """Greedily pick rows so that every element's count approaches the target.

    On each pass every not-yet-chosen row is scored with
    ``squared_imbalance`` and the lowest-scoring one is taken. Rows are
    scanned in ascending index order and only a strictly smaller score
    replaces the current best, so ties go to the lowest index.

    Args:
        element_matrix: List of 0/1 presence vectors, one per row.
        num_elements: Length of each presence vector.
        target_rows: Number of rows to select; capped at the rows available.
        target_count: Desired final count for each element.

    Returns:
        ``(selected_indices, element_counts)`` where ``selected_indices`` is
        in selection order and ``element_counts`` holds the per-element
        totals over the selected rows.
    """
    selected_indices = []
    chosen = set()  # O(1) membership test instead of scanning the list
    element_counts = [0] * num_elements

    for _ in range(min(target_rows, len(element_matrix))):
        best_idx = None
        best_score = float('inf')
        for idx, presence in enumerate(element_matrix):
            if idx in chosen:
                continue
            score = squared_imbalance(element_counts, presence, target_count)
            if score < best_score:
                best_score = score
                best_idx = idx

        if best_idx is None:
            # No candidate produced a finite score; further passes cannot help.
            break

        chosen.add(best_idx)
        selected_indices.append(best_idx)
        for i, present in enumerate(element_matrix[best_idx]):
            if present:
                element_counts[i] += 1

    return selected_indices, element_counts


# A notebook cell runs with __name__ == "__main__", so this still executes
# when the cell is run; the guard only keeps the helpers importable without
# touching the filesystem.
if __name__ == "__main__":
    # Load every row; keep the header while the file is still open.
    with open(input_filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames
        data_rows = list(reader)

    # Presence matrix (1 = element present).
    element_matrix = build_presence_matrix(data_rows, element_columns)

    selected_indices, element_counts = select_balanced_rows(
        element_matrix, len(element_columns), target_rows, target_count
    )

    # Per-element totals over the selected rows.
    print("最终元素出现次数统计:")
    for name, count in zip(element_columns, element_counts):
        print(f"{name}: {count}次")

    # Write the selected rows, in selection order, with the original header.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for idx in selected_indices:
            writer.writerow(data_rows[idx])

    print(f"\n已筛选 {len(selected_indices)} 行数据保存到 {output_filename}")

最终元素出现次数统计:
Co: 35次
Cr: 34次
Cu: 36次
Fe: 34次
Mn: 34次
Ni: 35次
V: 32次

已筛选 60 行数据保存到 filtered_4.csv


筛选5元

In [5]:
import csv
import random
import numpy as np
from collections import defaultdict

# Input/output configuration for this cell.
input_filename = '5元.csv'
output_filename = 'filtered_5.csv'
target_rows = 60
# Elements per row used to size the target (the original target was
# 60 rows * 5 elements/row / 7 elements).
elements_per_row = 5

# Columns whose presence (value > 0) should be balanced across the selection.
element_columns = ['Co', 'Cr', 'Cu', 'Fe', 'Mn', 'Ni', 'V']

# Desired number of selected rows containing each element; derived from the
# settings above so it cannot drift out of sync with them (= 300 / 7 here).
target_count = target_rows * elements_per_row / len(element_columns)


def build_presence_matrix(rows, columns):
    """Return one 0/1 vector per row; 1 means the column's value is > 0.

    Args:
        rows: Iterable of mappings (e.g. ``csv.DictReader`` rows) whose
            values for ``columns`` can be parsed by ``float``.
        columns: Column names to inspect, in output order.

    Raises:
        KeyError: If a row lacks one of ``columns``.
        ValueError: If a value cannot be parsed as a float.
    """
    matrix = []
    for row in rows:
        matrix.append([1 if float(row[col]) > 0 else 0 for col in columns])
    return matrix


def squared_imbalance(counts, presence, target):
    """Sum of squared distances from ``target`` after adding one row.

    Args:
        counts: Current per-element counts (not modified).
        presence: 0/1 vector of the candidate row, aligned with ``counts``.
        target: Desired count per element.
    """
    total = 0
    for count, present in zip(counts, presence):
        if present:
            count += 1
        total += (count - target) ** 2
    return total


def select_balanced_rows(element_matrix, num_elements, target_rows, target_count):
    """Greedily pick rows so that every element's count approaches the target.

    On each pass every not-yet-chosen row is scored with
    ``squared_imbalance`` and the lowest-scoring one is taken. Rows are
    scanned in ascending index order and only a strictly smaller score
    replaces the current best, so ties go to the lowest index.

    Args:
        element_matrix: List of 0/1 presence vectors, one per row.
        num_elements: Length of each presence vector.
        target_rows: Number of rows to select; capped at the rows available.
        target_count: Desired final count for each element.

    Returns:
        ``(selected_indices, element_counts)`` where ``selected_indices`` is
        in selection order and ``element_counts`` holds the per-element
        totals over the selected rows.
    """
    selected_indices = []
    chosen = set()  # O(1) membership test instead of scanning the list
    element_counts = [0] * num_elements

    for _ in range(min(target_rows, len(element_matrix))):
        best_idx = None
        best_score = float('inf')
        for idx, presence in enumerate(element_matrix):
            if idx in chosen:
                continue
            score = squared_imbalance(element_counts, presence, target_count)
            if score < best_score:
                best_score = score
                best_idx = idx

        if best_idx is None:
            # No candidate produced a finite score; further passes cannot help.
            break

        chosen.add(best_idx)
        selected_indices.append(best_idx)
        for i, present in enumerate(element_matrix[best_idx]):
            if present:
                element_counts[i] += 1

    return selected_indices, element_counts


# A notebook cell runs with __name__ == "__main__", so this still executes
# when the cell is run; the guard only keeps the helpers importable without
# touching the filesystem.
if __name__ == "__main__":
    # Load every row; keep the header while the file is still open.
    with open(input_filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames
        data_rows = list(reader)

    # Presence matrix (1 = element present).
    element_matrix = build_presence_matrix(data_rows, element_columns)

    selected_indices, element_counts = select_balanced_rows(
        element_matrix, len(element_columns), target_rows, target_count
    )

    # Per-element totals over the selected rows.
    print("最终元素出现次数统计:")
    for name, count in zip(element_columns, element_counts):
        print(f"{name}: {count}次")

    # Write the selected rows, in selection order, with the original header.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for idx in selected_indices:
            writer.writerow(data_rows[idx])

    print(f"\n已筛选 {len(selected_indices)} 行数据保存到 {output_filename}")

最终元素出现次数统计:
Co: 43次
Cr: 42次
Cu: 44次
Fe: 43次
Mn: 43次
Ni: 43次
V: 42次

已筛选 60 行数据保存到 filtered_5.csv
