In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

# 1. 处理成分数据

In [None]:
# def parse_elements(composition):
#     pattern = r"\(([A-Za-z\d.]+)\)([\d.]+)|([A-Z][a-z]*)([\d.]+)?"
#     matches = re.findall(pattern, composition)
#     elements = {}
#     for match in matches:
#         if match[0]:  # 如果有括号
#             sub_scale_factor = sum(float(x[1]) if x[1] else 1 for x in re.findall(r"([A-Z][a-z]*)([\d.]+)?", match[0]))
#             scale_factor = float(match[1]) / sub_scale_factor
#             sub_matches = re.findall(r"([A-Z][a-z]*)([\d.]+)?", match[0])
#             for sub_match in sub_matches:
#                 if sub_match[0] not in elements:
#                     elements[sub_match[0]] = float(sub_match[1]) * scale_factor if sub_match[1] else scale_factor
#         elif match[2]:  # 如果没有括号
#             if match[2] not in elements:
#                 elements[match[2]] = float(match[3]) if match[3] else 1
#     sorted_elements = {key: value for key, value in sorted(elements.items(), key=lambda item: item[1], reverse=True)}
#     element_strings = [f"{element}{round(value,2)}" for element, value in sorted_elements.items()]
#     chem = "".join(element_strings)
#     return elements, chem

def parse_elements(composition):
    """Parse an alloy composition string into per-element amounts.

    Three notations are recognised: ``[...]N`` square-bracket groups (which
    may contain ``(...)N`` groups), ``(...)N`` round-bracket groups, and bare
    ``ElN`` tokens. The amount written after an element is multiplied by the
    factor of every group enclosing it; a missing amount contributes only the
    group factors (or 1 for a bare element).

    Args:
        composition: Composition string, e.g. ``"(Fe0.5Co0.5)80B20"``.

    Returns:
        tuple: ``(elements, chem)``. ``elements`` maps element symbol to its
        accumulated amount in order of first appearance; ``chem`` is the
        concatenation of ``symbol + round(amount, 2)`` sorted by amount in
        descending order.
    """
    token_re = r"\[([A-Za-z\d.\(\)]+)]([\d.]+)|\(([A-Za-z\d.]+)\)([\d.]+)|([A-Z][a-z]*)([\d.]+)?"
    element_re = r"([A-Z][a-z]*)([\d.]+)?"
    elements = {}

    def accumulate(symbol, amount_text, factors):
        # Multiply strictly left to right: the written amount (if any) first,
        # then the group factors from innermost to outermost.
        terms = ([float(amount_text)] if amount_text else []) + list(factors)
        value = terms[0] if terms else 1
        for term in terms[1:]:
            value *= term
        elements[symbol] = elements[symbol] + value if symbol in elements else value

    def accumulate_group(content, factors):
        # Every element listed inside one round-bracket group shares `factors`.
        for symbol, amount_text in re.findall(element_re, content):
            accumulate(symbol, amount_text, factors)

    for sq_body, sq_factor, rd_body, rd_factor, symbol, amount_text in re.findall(token_re, composition):
        if sq_body:  # square-bracket group
            outer = float(sq_factor)
            for _, _, in_body, in_factor, in_symbol, in_amount in re.findall(token_re, sq_body):
                if in_body:  # round-bracket group nested in the square brackets
                    accumulate_group(in_body, (float(in_factor), outer))
                elif in_symbol:  # bare element inside the square brackets
                    accumulate(in_symbol, in_amount, (outer,))
        elif rd_body:  # round-bracket group at the top level
            accumulate_group(rd_body, (float(rd_factor),))
        elif symbol:  # bare element at the top level
            accumulate(symbol, amount_text, ())

    ranked = sorted(elements.items(), key=lambda pair: pair[1], reverse=True)
    chem = "".join(f"{name}{round(amount, 2)}" for name, amount in ranked)
    return elements, chem


def check_sum(elements):
    """Return the sum of all amounts in ``elements``, rounded to 2 decimals."""
    return round(sum(elements.values()), 2)

# 使用parse_elements和check_sum函数解析和检查成分
def parse_and_check(composition):
    """Parse a composition string and check that its amounts add up to ~100.

    Args:
        composition: Composition string passed to ``parse_elements``.

    Returns:
        tuple: ``(elements, chem, is_normalized)`` where ``is_normalized`` is
        True when the rounded total is less than 0.5 away from 100.

    A warning is printed whenever ``is_normalized`` is False, so the printed
    warning and the returned flag always agree (including a total that is
    exactly 0.5 away from 100, or NaN).
    """
    elements, chem = parse_elements(composition)
    total = check_sum(elements)
    is_normalized = abs(total - 100) < 0.5
    if not is_normalized:
        print(f"Warning: {composition} is not normalized. Sum is {total}")
    return elements, chem, is_normalized

In [None]:
def parse_elements(composition):
    """Parse an alloy composition string with nested ``{}``, ``[]``, ``()`` groups.

    ``{...}N`` and ``[...]N`` groups are parsed recursively, multiplying the
    running scale factor by ``N``. Inside a ``(...)N`` group every element
    amount is passed through ``adjust_weight`` and multiplied by ``N`` and the
    running scale factor. Bare elements default to an amount of 1.0 and are
    passed through ``adjust_weight`` only when they sit inside a group.

    Args:
        composition: Composition string, e.g. ``"[(Fe0.5Co0.5)0.5B0.5]96Nb4"``.

    Returns:
        tuple: ``(elements, chem)``. ``elements`` maps element symbol to its
        accumulated amount in order of first appearance; ``chem`` is the
        concatenation of ``symbol + round(amount, 2)`` sorted by amount in
        descending order.

    NOTE(review): elements inside ``(...)`` that carry no explicit amount are
    not matched by the inner regex and are therefore skipped, and group
    multipliers are never rescaled by ``adjust_weight`` -- confirm both are
    intended for the notations present in the data.
    """
    pattern = r"\{([^\{\}]+)\}([\d.]+)|\[([^\[\]]+)\]([\d.]+)|\(([A-Za-z\d.]+)\)([\d.]+)|([A-Z][a-z]*)([\d.]+)?"
    element_pattern = r"([A-Z][a-z]*)([\d.]+)"
    elements = {}

    def adjust_weight(wt):
        # Amounts of 1 or more are read as percentages and scaled to a
        # fraction; amounts below 1 are taken to be fractions already.
        return wt / 100 if wt >= 1 else wt

    def parse(match, scale_factor=1.0, is_outermost=True):
        tokens = re.findall(pattern, match)
        for brace, brace_mult, bracket, bracket_mult, paren, paren_mult, elem, amt in tokens:
            if brace:  # curly-brace group
                parse(brace, float(brace_mult) * scale_factor, False)
            elif bracket:  # square-bracket group
                parse(bracket, float(bracket_mult) * scale_factor, False)
            elif paren:  # round-bracket group
                for sub_elem, sub_amt in re.findall(element_pattern, paren):
                    weight = adjust_weight(float(sub_amt)) * float(paren_mult) * scale_factor
                    elements[sub_elem] = elements.get(sub_elem, 0) + weight
            else:  # bare element
                weight = float(amt) if amt else 1.0
                if not is_outermost:
                    weight = adjust_weight(weight)
                weight *= scale_factor
                elements[elem] = elements.get(elem, 0) + weight

    parse(composition)

    sorted_elements = sorted(elements.items(), key=lambda item: item[1], reverse=True)
    chem = "".join(f"{el}{round(val, 2)}" for el, val in sorted_elements)
    return elements, chem


def check_sum(elements):
    """Add up every amount in the ``elements`` mapping and round to 2 decimals."""
    amounts = elements.values()
    return round(sum(amounts), 2)

# 使用parse_elements和check_sum函数解析和检查成分
def parse_and_check(composition):
    """Parse a composition string and check that its amounts add up to ~100.

    Args:
        composition: Composition string passed to ``parse_elements``.

    Returns:
        tuple: ``(elements, chem, is_normalized)`` where ``is_normalized`` is
        True when the rounded total is less than 0.5 away from 100.

    When ``is_normalized`` is False the parsed elements and a warning are
    printed, so the printed diagnostics and the returned flag always agree
    (including a total that is exactly 0.5 away from 100, or NaN).
    """
    elements, chem = parse_elements(composition)
    total = check_sum(elements)
    is_normalized = abs(total - 100) < 0.5
    if not is_normalized:
        print(elements)
        print(f"Warning: {composition} is not normalized. Sum is {total}")
    return elements, chem, is_normalized

In [None]:
# Parse every composition in the 'BMGs' column and expand it into one numeric
# column per element plus a canonical 'Chemical composition' string.
data_o = pd.read_csv("./Bian's Data/collected experimental data.csv")
record_chem = {}
for index, row in data_o.iterrows():
    composition = row["BMGs"]
    # composition = row["GFA"]
    # Remember the first row index at which each raw composition string occurs.
    record_chem.setdefault(composition, index)
    elements, chem, is_sum_100 = parse_and_check(composition)

    if not is_sum_100:
        print(f"index {index} sum {is_sum_100}:\n{row}\n")
    data_o.loc[index, "Chemical composition"] = chem
    # Add the parsed element amounts as columns of the DataFrame.
    for element, value in elements.items():
        if element not in data_o.columns:
            # Create the column as float: the amounts written below are floats,
            # and writing them into an integer column forces a dtype upcast
            # that newer pandas versions warn about.
            data_o[element] = 0.0
        data_o.loc[index, element] = value

In [None]:
# Write the parsed table to an Excel file, without the index column.
data_o.to_excel("./Bian's Data/collected experimental data_parsed.xlsx", index=False)

In [None]:
# data_o.drop_duplicates(subset='Chemical composition', keep='first', inplace=True)
# Replace values equal to the en-dash placeholder "–" with NaN, in place.
data_o.replace("–", np.nan, inplace=True)
# data_o.describe().to_excel("./All/All_data_des.xlsx")
# data_o.to_excel("./ALL/ALL_data_processed.xlsx", index=False)

# Group 处理数据

In [8]:
def process_group(group):
    """Collapse all rows that share one chemical composition into a single row.

    Args:
        group: DataFrame holding the rows of one group. It must contain the
            columns 'BMGs', 'Chemical composition' and 'Dmax(mm)'.

    Returns:
        pandas.DataFrame: A one-row frame with the first 'BMGs' and
        'Chemical composition' values, the maximum 'Dmax(mm)', and the mean of
        every other float64/int64 column. Columns of any other dtype are
        dropped (apart from the two label columns).

    An alert is printed for every averaged column whose (max - min) / min
    exceeds 5% within the group.
    """
    first_composition = group['Chemical composition'].iloc[0]
    processed_data = {
        'BMGs': group['BMGs'].iloc[0],
        'Chemical composition': first_composition,
        # 'Dmax(mm)' is aggregated with max rather than mean.
        'Dmax(mm)': group['Dmax(mm)'].max(),
    }

    # Every column except 'Chemical composition' and 'Dmax(mm)'.
    for col in group.columns.drop(['Chemical composition', 'Dmax(mm)']):
        if group[col].dtype not in [np.float64, np.int64]:  # numeric columns only
            continue
        max_value = group[col].max()
        min_value = group[col].min()

        # Relative spread is measured against the minimum; skip when it is 0.
        if min_value != 0:
            relative_spread = np.abs((max_value - min_value) / min_value)
            if relative_spread > 0.05:
                print(f"警报：'{col}' 列{list(group[col])}在化学成分 '{first_composition}'（原成分: {list(group['BMGs'])}） 的值变化超过{round(100 * relative_spread, 2)}%。")
        processed_data[col] = group[col].mean()

    return pd.DataFrame([processed_data])

data_o = pd.read_excel("./ALL/ALL_data_processed.xlsx")
print(len(data_o))
# Group the rows by 'Chemical composition' and collapse each group with
# process_group. The groups are iterated explicitly instead of going through
# groupby.apply, because process_group reads the grouping column itself and
# newer pandas versions no longer pass that column to apply.
grouped = data_o.groupby('Chemical composition', group_keys=False)
group_frames = [process_group(group) for _, group in grouped]
result_df = pd.concat(group_frames, ignore_index=True) if group_frames else pd.DataFrame()
print(len(result_df))

1340
警报：'Modulus (GPa)' 列[nan, nan, nan, 105.0, 182.0, nan]在化学成分 'Al16.67Co16.67Cr16.67Cu16.67Fe16.67Ni16.67'（原成分: ['AlCoCrCuFeNI', 'AlCoCrCuFeNI', 'AlCoCrCuFeNI', 'AlCoCrCuFeNi', 'AlCoCrCuFeNi', 'AlCoCrCuFeNi']） 的值变化超过73.33%。
警报：'Ε(%)' 列[24.0, 9.9, 16.0, nan, nan, 34.0]在化学成分 'Al16.67Co16.67Cr16.67Cu16.67Fe16.67Ni16.67'（原成分: ['AlCoCrCuFeNI', 'AlCoCrCuFeNI', 'AlCoCrCuFeNI', 'AlCoCrCuFeNi', 'AlCoCrCuFeNi', 'AlCoCrCuFeNi']） 的值变化超过243.43%。
警报：'Modulus (GPa)' 列[147.6, 90.1]在化学成分 'Al16.67Co16.67Cr16.67Fe16.67Ni16.67Ti16.67'（原成分: ['AlCoCrFeNiTi', 'AlCoCrFeNiTi']） 的值变化超过63.82%。
警报：'Ε(%)' 列[6.4, 8.8]在化学成分 'Al16.67Co16.67Cr16.67Fe16.67Ni16.67Ti16.67'（原成分: ['AlCoCrFeNiTi', 'AlCoCrFeNiTi']） 的值变化超过37.5%。
警报：'yield(MPa)' 列[1138.0, nan, 1051.0, 1373.0, 1138.0, 1110.0, 1500.0]在化学成分 'Al20.0Co20.0Cr20.0Fe20.0Ni20.0'（原成分: ['AlCoCrFeNi', 'AlCoCrFeNi', 'AlCoCrFeNi', 'AlCoCrFeNi', 'AlCoCrFeNi', 'AlCoCrFeNi', 'AlCoCrFeNi']） 的值变化超过42.72%。
警报：'Ε(%)' 列[nan, 27.0, nan, 24.5, nan, nan, 26.9]在化学成分 'Al20.0Co20.0Cr2

In [9]:
# Save the grouped result to an Excel file.
# # Keep rows whose Dmax(mm) is missing, and remove rows whose Dmax(mm) is greater than 60
# result_df = result_df[(result_df['Dmax(mm)'].isna()) | (result_df['Dmax(mm)'] <= 60)]

# # Remove rows with a Be value > 0, then drop the Be column
# result_df = result_df[result_df['Be'] == 0]
# result_df.drop(columns=['Be'], inplace=True)

result_df.to_excel("./ALL/ALL_data_grouped_processed.xlsx", index=False)

# 汇总所有数据，统一特征列

In [None]:
data_0 = pd.read_excel("./ALL/ALL_data_grouped_processed_bk.xlsx")
data_1 = pd.read_excel("./Bian's Data/collected experimental data_parsed.xlsx")
# Column counts, then row counts, of the two inputs.
print(data_0.shape[1], data_1.shape[1])
print(data_0.shape[0], data_1.shape[0])
# Stack the two DataFrames on top of each other with a fresh index.
data = pd.concat([data_0, data_1], ignore_index=True)
print(data.shape[0])
print(data.shape[1])

In [None]:
data_2 = pd.read_excel("./Xiong's Data/HEA_GFA.xlsx")
# Copy 'AAAAlloys' -> 'BMGs' and 'GFA' -> 'Chemical composition', then drop
# the two source columns.
data_2 = data_2.assign(
    **{
        'BMGs': data_2['AAAAlloys'],
        'Chemical composition': data_2['GFA'],
    }
).drop(columns=['AAAAlloys', 'GFA'])

print(data_2.shape[1], data.shape[1])
print(data_2.shape[0], data.shape[0])

# Append the HEA rows to the combined table.
data = pd.concat([data, data_2], ignore_index=True)
print(data.shape[0])
print(data.shape[1])

In [None]:
# Element columns, in the order used for the exported tables.
chem_columns = (
    "Ni Cr Nb P B Si "
    "Fe C Mo Y Co Au Ge Pd Cu Zr Ti Al "
    "Mg Ag Gd La Ga Hf Sn In Ca Zn Nd Er "
    "Dy Pr Ho Ce Sc Ta Mn Tm Pt V W Tb "
    "Li Sm Lu Yb Pb Sr Ru Be Rh"
).split()
# Missing element amounts are filled with 0.
data[chem_columns] = data[chem_columns].fillna(0)

# These target columns must hold numbers.
target_columns = ['Dmax(mm)', 'Tg(K)', 'Tx(K)', 'Tl(K)', 'yield(MPa)', 'Modulus (GPa)', 'Ε(%)']

# Convert each target column to numeric; values that cannot be converted become NaN.
data[target_columns] = data[target_columns].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
data.to_excel("./ALL/ALL_data_processed.xlsx", index=False)

In [None]:
# Align data_3 with chem_columns: drop rows that use elements outside
# chem_columns, check the row sums, then reorder the columns.
data_3 = pd.read_excel('./cls_data/GFA_parsed.xlsx')
meta_columns = ['GFA', 'Class', 'Chemical composition']

# Columns of data_3 that are neither metadata nor listed in chem_columns.
not_exist_columns = [
    col for col in data_3.columns
    if col not in meta_columns and col not in chem_columns
]
for col in not_exist_columns:
    print(f"{col} not in chem_columns")

# Discard rows with a non-zero amount in any such column, then drop the column.
# A new frame is built each time instead of dropping in place on a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
print(len(data_3))
for col in not_exist_columns:
    data_3 = data_3[data_3[col] == 0].drop(columns=[col])
print(len(data_3))

# Report rows whose element amounts do not add up to 100.
# The total is stored in `row_total`, not `sum`, so the builtin `sum` (used by
# check_sum elsewhere in this notebook) is not shadowed.
for index, row in data_3.iterrows():
    row_total = row[chem_columns].sum()
    if abs(row_total - 100) > 0.1:
        print(f"index {index} sum {row_total}:\n{row}\n")

# Order the columns as metadata followed by chem_columns.
data_3 = data_3[meta_columns + chem_columns]
data_3.to_excel('./cls_data/GFA_cls.xlsx', index=False)

# 2. 绘制数据分析图

In [1]:
import pandas as pd
# Reload the grouped table from disk and export its describe() summary.
result_df = pd.read_excel("./ALL/ALL_data_grouped_processed.xlsx")
result_df.describe().to_excel("./ALL/ALL_data_grouped_processed_des.xlsx")

In [3]:
# Per-dataset plotting configuration:
#   save_path    - directory the figures are written to
#   data_path    - Excel file to load
#   drop_columns - non-feature columns removed before the element plots
#   target_names - target columns that get a box plot and a histogram
configs = {
    # 'NewCriterion': {
    #     "save_path": "./NewCriterion",
    #     "data_path": "./NewCriterion/NewCriterion_processed.xlsx",
    #     "drop_columns": ["BMGs", "Chemical composition"],
    #     "target_names": ["Tg(K)", "Tx(K)", "Tl(K)", "Dmax(mm)"]
    # },
    # "NCSupply": {
    #     "save_path": "./NCSupply",
    #     "data_path": "./NCSupply/NCSupply_processed.xlsx",
    #     "drop_columns": ["BMGs", "Chemical composition"],
    #     "target_names": ["Tg(K)", "Tl(K)", "Dmax(mm)"]
    # },
    # 'CTT Dataset': {
    #     "save_path": "./Xiong's Data",
    #     "data_path": "./Xiong's Data/CTT Dataset_processed.xlsx",
    #     "drop_columns": ["BMGs", "Chemical composition"],
    #     "target_names": ["Tg(K)", "Tx(K)", "Tl(K)", "Dmax(mm)"]
    # },
    # 'Wang': {
    #     "save_path": "./Wang's Data",
    #     "data_path": "./Wang's Data/Wang_processed.xlsx",
    #     "drop_columns": ["BMGs", "Chemical composition"],
    #     "target_names": ["E(GPa)", "Tg(K)", "Tm(K)", "ry(GPa)", "Tx(K)", "Tl(K)", "Dmax(mm)"]
    # },
    # 'Bian': {
    #     "save_path": "./Bian's Data",
    #     "data_path": "./Bian's Data/Bian_processed.xlsx",
    #     "drop_columns": ["BMGs", "Chemical composition"],
    #     "target_names": ["Dmax(mm)", "Tg(K)", "Tx(K)", "Tl(K)", "σy(MPa)", "Modulus (GPa)", "Ε(%)"]
    # },
    'ALL': {
        # The directory is spelled "./ALL", matching the path the grouped file
        # is written to elsewhere in this notebook; "./All" only resolves on
        # case-insensitive file systems.
        "save_path": "./ALL",
        "data_path": "./ALL/ALL_data_grouped_processed.xlsx",
        "drop_columns": ["BMGs", "Chemical composition"],
        "target_names": ["Tg(K)", "Tx(K)", "Tl(K)", "Dmax(mm)", "yield(MPa)", "Modulus (GPa)", "Ε(%)"]
    }

}

In [4]:
# 设置箱线图样式
# Styling shared by every box plot drawn with draw_box.
boxprops = {'linestyle': '-', 'linewidth': 2, 'color': 'black'}
whiskerprops = {'linestyle': '--', 'linewidth': 1.5, 'color': 'gray'}
flierprops = {'marker': 'o', 'markersize': 5, 'markerfacecolor': 'red', 'alpha': 0.7}
medianprops = {'linestyle': '-', 'linewidth': 2, 'color': 'blue'}

def draw_box(data, target_name, save_path):
    """Draw a horizontal box plot of ``data`` and save it as a PNG.

    The plot is drawn on the current pyplot axes (cleared first), written to
    ``<save_path>/<target_name>_box.png`` at 300 dpi, and the current figure is
    closed afterwards.

    Args:
        data: Values of a single target.
        target_name: Used as tick label, in the title and in the file name.
        save_path: Directory the image is written to.
    """
    plt.cla()

    # One data set with one label; the mean is drawn as a line.
    plt.boxplot(
        [data],
        vert=False,
        showmeans=True,
        meanline=True,
        boxprops=boxprops,
        whiskerprops=whiskerprops,
        flierprops=flierprops,
        medianprops=medianprops,
        labels=[target_name],
    )

    plt.title(f'{target_name} Target Distributions(Filter)')
    plt.xlabel('Range')

    plt.savefig(f"{save_path}/{target_name}_box.png", dpi=300)
    plt.close()

def element_hist(data_f, save_path):
    """Save a bar chart of how many rows have a non-zero value in each column.

    Bars are ordered by count, highest first. The image is written to
    ``<save_path>/element_statistics.png`` at 300 dpi.

    Args:
        data_f: DataFrame with one column per element.
        save_path: Directory the image is written to.
    """
    # Non-zero entries per column, most frequent first.
    counts = (data_f != 0).sum().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(20, 12))
    ax.bar(counts.index, counts.values)
    ax.set(
        xlabel="Elements",
        ylabel="Frequency",
        title="Element Frequency in Compositions",
    )

    fig.savefig(f"{save_path}/element_statistics.png", dpi=300)
    plt.close(fig)
    
def draw_target_hist(data_f, target_name, save_path):
    """Save a 20-bin histogram of one target as a PNG.

    The image is written to ``<save_path>/<target_name>_hist.png`` at 300 dpi.

    Args:
        data_f: Either the values of the target (Series or array-like), or a
            DataFrame that contains a column named ``target_name``, in which
            case only that column is plotted.
        target_name: Target name, used for the title, x label and file name.
        save_path: Directory the image is written to.
    """
    values = data_f
    # When handed a whole table, plot the target column rather than every
    # column of the table.
    if isinstance(values, pd.DataFrame) and target_name in values.columns:
        values = values[target_name]
    # Missing measurements cannot be binned.
    if isinstance(values, pd.Series):
        values = values.dropna()

    # A dedicated figure is created, saved and closed; no implicit pyplot
    # figure is touched, so nothing is left open after the call.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(values, bins=20, edgecolor='black')

    ax.set_title(f'{target_name} Distribution')
    ax.set_xlabel(target_name)
    ax.set_ylabel('Frequency')

    fig.savefig(f"{save_path}/{target_name}_hist.png", dpi=300)
    plt.close(fig)

def plot_element_proportions(data_f, save_path):
    """Save a donut chart of each column's share of the grand total.

    Column sums are converted to percentages of the overall sum; columns below
    3% are merged into a single 'Others' slice. The image is written to
    ``<save_path>/element_proportions.png`` at 300 dpi.

    Args:
        data_f: DataFrame with one column per element.
        save_path: Directory the image is written to.
    """
    # Percentage contributed by each column.
    totals = data_f.sum()
    share = (totals / totals.sum()) * 100

    # Fold everything below 3% into 'Others', then order the slices by size.
    minor = share[share < 3]
    share = share[share >= 3]
    share['Others'] = minor.sum()
    share = share.sort_values(ascending=False)

    palette = plt.get_cmap('tab20').colors

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(
        share,
        labels=share.index,
        startangle=140,
        colors=palette,
        explode=[0.01] * len(share),
        autopct='%1.1f%%',
    )

    # A white disc over the centre turns the pie into a donut.
    fig.gca().add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    # Equal aspect ratio keeps the pie circular.
    ax.axis('equal')

    plt.title("Element Proportions", weight='bold', size='x-large')

    plt.savefig(f"{save_path}/element_proportions.png", dpi=300, bbox_inches='tight')

    plt.close()

def plot_element_count(data_f, save_path):
    """Save a donut chart of how often each column holds a row's largest value.

    For every row the column with the maximum value is taken; the counts are
    converted to percentages of the number of rows and values below 3% are
    merged into 'Others'. The image is written to
    ``<save_path>/element_max_proportion_counts.png`` at 300 dpi.

    Args:
        data_f: DataFrame with one column per element.
        save_path: Directory the image is written to.
    """
    # Column holding the largest value in each row.
    dominant = data_f.idxmax(axis=1)
    # Percentage of rows in which each column is the dominant one.
    share = (dominant.value_counts() / len(dominant)) * 100

    # Fold everything below 3% into 'Others', then order the slices by size.
    minor = share[share < 3]
    share = share[share >= 3]
    share['Others'] = minor.sum()
    share = share.sort_values(ascending=False)

    palette = plt.get_cmap('tab20').colors

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(
        share,
        labels=share.index,
        startangle=140,
        colors=palette,
        explode=[0.01] * len(share),
        autopct='%1.1f%%',
    )

    # A white disc over the centre turns the pie into a donut.
    fig.gca().add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    ax.axis('equal')  # keep the pie circular

    plt.title("Element Max Proportion Counts")
    plt.savefig(f"{save_path}/element_max_proportion_counts.png", dpi=300, bbox_inches='tight')
    plt.close()


In [5]:
# Produce every figure for each configured dataset.
for dataset in configs.keys():
    data = pd.read_excel(configs[dataset]["data_path"])
    save_path = configs[dataset]["save_path"]
    target_names = configs[dataset]["target_names"]
    drop_columns = configs[dataset]["drop_columns"]
    # Feature table: everything except the targets and the unwanted columns.
    data_f = data.drop(target_names, axis=1, inplace=False)
    data_f = data_f.drop(drop_columns, axis=1, inplace=False)

    # Box plot and histogram per target. Both receive only that target's
    # non-missing values; the histogram is no longer given the whole table.
    for target in target_names:
        non_empty_values = data.dropna(subset=[target])[target]
        safe_name = target.replace('/', '_')  # '/' is not usable in a file name
        draw_box(non_empty_values, safe_name, save_path)
        draw_target_hist(non_empty_values, safe_name, save_path)

    # Element frequency bar chart.
    element_hist(data_f, save_path)
    # Element share donut chart.
    plot_element_proportions(data_f, save_path)
    # Dominant-element donut chart.
    plot_element_count(data_f, save_path)
plt.close()


# 将ALL data加到cls数据中

In [9]:
import pandas as pd
# Load the grouped table, the classification table and the HEA table.
all_data = pd.read_excel("./ALL/ALL_data_grouped_processed.xlsx")
cls_data = pd.read_excel("./cls_data/GFA_cls.xlsx")
HEA_data = pd.read_excel("./Xiong's Data/HEA_GFA.xlsx")
# Distinct values of the HEA table's 'GFA' column, kept as a set.
GFAs = set(HEA_data['GFA'].unique())

In [10]:
# 给all_data添加Class列，如果Chemical composition在GFAs中，则为CRA，否则为BMG
# Label each row 'CRA' when its 'Chemical composition' is in GFAs, else 'BMG'.
all_data['Class'] = all_data['Chemical composition'].map(
    lambda composition: 'CRA' if composition in GFAs else 'BMG'
)
print(all_data['Class'].value_counts())

BMG    1064
CRA     205
Name: Class, dtype: int64


In [11]:
all_data['GFA'] = all_data['BMGs']
print(all_data.shape)
# Remove the original name column and all target columns.
all_data = all_data.drop(
    columns=['BMGs', 'Dmax(mm)', 'yield(MPa)', 'Modulus (GPa)', 'Ε(%)', 'Tg(K)', 'Tx(K)', 'Tl(K)']
)
print(all_data.shape, cls_data.shape)
# Append cls_data, then keep the first row for each 'Chemical composition'.
all_data = pd.concat([all_data, cls_data], ignore_index=True)
print(all_data.shape)
all_data = all_data.drop_duplicates(subset='Chemical composition', keep='first')
print(all_data.shape)

(1269, 62)
(1269, 54) (6439, 54)
(7708, 54)
(6857, 54)


In [12]:
# all_data新加一个列，如果Class为BMG则为1，其余为0
# Binary label: 1 where 'Class' is 'BMG', 0 for everything else.
all_data['cls_label'] = all_data['Class'].map(lambda label: int(label == 'BMG'))
print(all_data['cls_label'].value_counts())

0    5431
1    1426
Name: cls_label, dtype: int64


In [13]:
# Write the labelled classification table to an Excel file, without the index column.
all_data.to_excel("./ALL/ALL_data_cls.xlsx", index=False)