In [None]:
import pandas as pd
import re
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
# Use the 'Microsoft YaHei' font for sans-serif text — presumably so Chinese labels render;
# TODO confirm the font is installed on the machine running the notebook.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Inclusive upper / lower limits used further down to filter rows by 'the thermal expansion'.
data_high_bound = 30
data_low_bound = -20
# When True, the model is trained on the standardized labels instead of the raw ones.
need_to_normalized = False
# When True, the final LATP predictions are written to a CSV file.
need_to_write_result_to_csv = True

# Load the training dataset and preview its first rows.
data = pd.read_excel('./dataset/rt_df_thermo1.xlsx')
data.head()

In [None]:
# Helper: parse a formula string into its element symbols and their amounts
def parse_normalized_formulas(formula):
    """
    Parse a normalized formula string into element symbols and their amounts.

    An element is matched as one uppercase letter followed by optional
    lowercase letters, and it must be immediately followed by a number
    (integer or decimal, e.g. ``Li1.3`` or ``O12``). Symbols without an
    explicit number are not matched and therefore do not appear in the result.

    If the same symbol occurs more than once in the string, its amounts are
    summed rather than the last occurrence overwriting the earlier ones.

    Args:
        formula: The formula text, e.g. ``"Li1.3Al0.3Ti1.7P3O12"``.

    Returns:
        dict mapping element symbol (str) to total amount (float), in order
        of first appearance. An input with no matches yields an empty dict.
    """
    composition = {}
    for element, ratio in re.findall(r'([A-Z][a-z]*)(\d*\.?\d+)', formula):
        # Accumulate so a repeated symbol contributes all of its amounts.
        composition[element] = composition.get(element, 0.0) + float(ratio)
    return composition

In [None]:
# Collect every element symbol that occurs in at least one formula of the dataset
all_elements = {
    symbol
    for formula_text in data['Normalized_Formulas']
    for symbol in parse_normalized_formulas(formula_text)
}

In [None]:
# Display the element symbols collected from the dataset.
all_elements

In [None]:
# Sort the symbols so the element columns are always created in the same order
all_elements = sorted(all_elements)

# Parse each formula exactly once; the per-element loop below only does dict
# lookups instead of re-running the regex for every (row, element) pair.
parsed_formulas = [
    parse_normalized_formulas(formula_text)
    for formula_text in data['Normalized_Formulas']
]

# One new column per element; an element absent from a formula is filled with 0
for element in all_elements:
    data[element] = [composition.get(element, 0) for composition in parsed_formulas]

In [None]:
# Preview the frame again, now including the per-element columns.
data.head()

In [None]:
# Build the modelling frame as a new object: everything in `data` except the
# identifier column and the two formula text columns.
df = data.drop(columns=['formula', 'Normalized_Formulas', 'ID'])
df.head()

In [None]:
import class_plotpicture as pl
# Plot the target column ('the thermal expansion') before any rows are filtered out.
# The original comment describes this as a bar chart; the plotting itself is done by the
# project-local helper, whose implementation is not visible here.
pl.plot_prediction_feature(df, 'the thermal expansion', 'ImageOfThermal')

# 脏数据清理

In [None]:
# Keep only rows whose 'the thermal expansion' lies within
# [data_low_bound, data_high_bound]; both limits are inclusive.
target_in_range = df['the thermal expansion'].between(data_low_bound, data_high_bound)
df_cleaned = df[target_in_range]

In [None]:
import class_plotpicture as pl

# Plot the target column again, this time on the range-filtered rows.
# NOTE(review): the third argument is the same 'ImageOfThermal' used for the unfiltered plot;
# if the helper uses it as an output name, the earlier image may be overwritten — confirm.
pl.plot_prediction_feature(df_cleaned, 'the thermal expansion', 'ImageOfThermal')

# 大致符合正态分布

In [None]:
# 绘制数据的相关性：热力图
# pl.plot_headmap(df_cleaned, 'the thermal expansion', 'ImageOfThermal')

# 划分数据，进行训练和测试

In [None]:
# Separate the regression target from the remaining (feature) columns
target_column = 'the thermal expansion'
all_labels = df_cleaned[target_column]
all_features = df_cleaned.drop(columns=[target_column])
print(f'全部的特征：{all_features.shape}')
print(f'全部的标签：{all_labels.shape}')

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Reshape the labels into a single-column 2-D array, the layout the scaler works on.
label_matrix = np.array(all_labels).reshape(-1, 1)

# Standardize the labels (not the features). The scaler is fitted on the full label
# set, i.e. before the train/test split, and is kept for the inverse transform later.
scaler = StandardScaler()
all_labels_scaler = scaler.fit_transform(label_matrix)

# Keep the unscaled labels in the same column layout.
all_labels = label_matrix

In [None]:
from sklearn.model_selection import train_test_split

# Pick the label array to train on: standardized labels when need_to_normalized
# is set, raw labels otherwise. The features are passed through unchanged.
labels_for_split = all_labels_scaler if need_to_normalized else all_labels

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    labels_for_split,
    test_size=0.2,
    random_state=42,
)
print(f'训练集的特征：{X_train.shape}, 标签：{y_train.shape}')
print(f'测试集的特征：{X_test.shape}, 标签：{y_test.shape}')

# 超参数优化

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
# Build the XGBoost regressor with fixed hyper-parameters and fit it on the training split
clf = xgb.XGBRegressor(random_state=42, n_estimators=223, learning_rate=0.4)
clf.fit(X_train, y_train)

# Predict on the training split first, then on the held-out test split
train_pred = clf.predict(X_train)
label_pred = clf.predict(X_test)

# The same three regression metrics are reported for both splits
metric_functions = (mean_squared_error, mean_absolute_error, r2_score)

# Test-split metrics: mean squared error, mean absolute error, R^2
mse, mae, r2 = [metric(y_test, label_pred) for metric in metric_functions]
print(f'MSE:{mse:.5f}')
print(f'MAE:{mae:.5f}')
print(f"R2: {r2:.5f}")

# Training-split metrics, printed after the test ones for comparison
mse_train, mae_train, r2_train = [metric(y_train, train_pred) for metric in metric_functions]
print(f'MSE_train:{mse_train:.5f}')
print(f'MAE_train:{mae_train:.5f}')
print(f"R2_train: {r2_train:.5f}")

# 预测LATP（带离子电导率预测）

In [None]:
from utils import normalize_chemical_formula

# Read the LATP candidate file with header=None, so columns are addressed by position
latp_with_conductivity_data_raw = pd.read_csv('./dataset/pred33Ti_with_conductivity_prediction.csv', header=None)

# Column 1 is used later as an exponent (10**value) for the conductivity;
# column 19 holds the formula text that is normalized below.
latp_conductivity_column = latp_with_conductivity_data_raw.iloc[:, 1].reset_index(drop=True)
latp_original_formula_column = latp_with_conductivity_data_raw.iloc[:, 19].reset_index(drop=True)

# Normalize the chemical formulas (the helper is passed directly; no wrapper lambda needed)
latp_formula_column = latp_original_formula_column.apply(normalize_chemical_formula)

latp_formula_column.name = 'pretty_formula'
latp_data = pd.DataFrame(latp_formula_column)
print(latp_data.head())

# Keep the element columns in the same sorted order as the training features
all_elements = sorted(all_elements)

# Parse each formula exactly once; the per-element loop below only does dict
# lookups instead of re-running the regex for every (row, element) pair.
latp_compositions = [
    parse_normalized_formulas(formula_text)
    for formula_text in latp_data['pretty_formula']
]

# One column per training element; an element absent from a formula is filled with 0.
# Elements that occur only in the LATP formulas are ignored, because the model has
# no corresponding feature column.
for element in all_elements:
    latp_data[element] = [composition.get(element, 0) for composition in latp_compositions]

latp_data = latp_data.drop('pretty_formula', axis=1)
print(latp_data.head())

latp_pred = clf.predict(latp_data)

print(f'latp_pred: {latp_pred}')

if need_to_normalized:
    # Undo the label standardization (the scaler was fitted with fit_transform on the labels)
    salered_latp_pred = scaler.inverse_transform(latp_pred.reshape(-1,1))
    print(f'salered_latp_pred: {salered_latp_pred}')

In [None]:
from pathlib import Path

# Scatter plot: x-axis is the predicted thermal expansion, y-axis is the Li-ion conductivity.
# When the model was trained on standardized labels, its raw output is standardized too,
# so map it back to the original scale before plotting or saving it.
if need_to_normalized:
    x = scaler.inverse_transform(latp_pred.reshape(-1, 1)).ravel()
else:
    x = latp_pred

# The stored value is used as a base-10 exponent and the result is scaled by 1000
# (presumably log10 conductivity converted to a 1000x smaller unit — TODO confirm units).
y = 10**latp_conductivity_column * 1000

plt.figure(figsize=(8, 6))  # figure size
plt.scatter(x, y, color='blue', marker='o')  # scatter points
# plt.axvline(x=7, color='red', linestyle='--', label='x = 7')  # red dashed reference line

plt.title('Scatter Plot of latp_pred vs latp_conductivity_column')  # figure title
plt.xlabel('latp_pred')  # x-axis label
plt.ylabel('latp_conductivity_column')  # y-axis label
plt.grid(True)  # show grid
plt.show()  # render the figure

if need_to_write_result_to_csv:
    output_path = Path('result/XGBoost-最终预测结果.csv')
    # Create the output directory if needed so to_csv does not fail on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Reuse x and y so the saved values are exactly the ones that were plotted.
    save_data_conductivity_TX = pd.DataFrame({'formula': latp_formula_column, 'original_formula': latp_original_formula_column, 'thermal_expansion': x, 'li ion conductivity': y})
    save_data_conductivity_TX.to_csv(output_path, index=False, header=True)