In [None]:
import pandas as pd
import re
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [None]:
# --- Configuration -----------------------------------------------------------
# Inclusive bounds on the target column; rows outside them are filtered out
# later in the notebook.
data_high_bound = 30
data_low_bound = -20
# When True, the model is trained on standardized labels.
need_to_normalized = False
# When True, the final prediction table is written to a CSV file.
need_to_write_result_to_csv = True

# --- Load the training data --------------------------------------------------
data = pd.read_excel('./dataset/rt_df_thermo1.xlsx')
print(data.head())

print(f"max = {data['the thermal expansion'].max()}")
print(f"min = {data['the thermal expansion'].min()}")

# Count the rows inside the configured bounds. The bounds come from the
# constants above (not repeated literals), so this ratio always describes the
# same window that the filtering cells apply.
in_range_count = int(
    data['the thermal expansion'].between(data_low_bound, data_high_bound).sum()
)

# Total number of rows
total_count = data.shape[0]

# Fraction of rows that fall inside the bounds
proportion = in_range_count / total_count
print(proportion)

In [None]:
# Helper: parse a formula string into its elements and their amounts
from collections import defaultdict
def parse_normalized_formulas(formula):
    """
    Parse a flat chemical formula into a mapping of element -> amount.

    An element symbol is one uppercase letter followed by any number of
    lowercase letters. It may be followed by an integer or decimal amount
    (e.g. ``Li0.25``, ``O4``, ``F.5``). A symbol with no amount counts as 1,
    so ``LiFePO4`` yields Li=1, Fe=1, P=1, O=4. Repeated symbols are summed.

    Parentheses / nested groups are not expanded; only ``Symbol[amount]``
    pairs are recognised.

    Parameters
    ----------
    formula : str
        Formula text such as ``'Li0.25O0.75'``.

    Returns
    -------
    dict
        Keys are element symbols, values are float amounts. Empty if no
        symbol is found.
    """
    element_count = defaultdict(float)
    # The amount group is optional: findall then yields '' for a bare symbol,
    # which is what lets the "default to 1" branch below actually fire.
    elements = re.findall(r'([A-Z][a-z]*)(\d+\.?\d*|\.\d+)?', formula)
    for element, ratio in elements:
        # A missing amount means one atom of that element
        count = float(ratio) if ratio else 1.0
        element_count[element] += count
    return dict(element_count)

In [None]:
# Gather every element symbol that occurs in any training formula
all_elements = {
    symbol
    for formula_text in data['Normalized_Formulas']
    for symbol in parse_normalized_formulas(formula_text)
}

In [None]:
# Inspect the collected element symbols (the notebook displays the last expression)
all_elements

In [None]:
# Fix the column order so every feature matrix built from all_elements lines up
all_elements = sorted(all_elements)

# Parse each formula exactly once (instead of once per element column), then
# add one column per element; elements absent from a formula get 0.
parsed_formulas = [
    parse_normalized_formulas(formula_text)
    for formula_text in data['Normalized_Formulas']
]
for element in all_elements:
    data[element] = [composition.get(element, 0) for composition in parsed_formulas]

In [None]:
# Preview the training frame, which now also carries one column per element
data.head()

In [None]:
# Build the modelling frame: everything from `data` except the identifier and
# raw-formula columns. `drop` returns a new frame, so `data` is left untouched.
columns_to_remove = ['formula', 'Normalized_Formulas', 'ID']
df = data.drop(columns=columns_to_remove)
df.head()

In [None]:
import class_plotpicture as pl
# Plot the target column of the unfiltered frame using the project-local helper.
# NOTE(review): the original comment calls this a bar chart, and the third
# argument looks like an output image name — confirm against class_plotpicture.
pl.plot_prediction_feature(df, 'the thermal expansion', 'ImageOfThermal')

# 可见需要把大于3000的视为异常值

In [None]:
# Drop rows whose 'the thermal expansion' value exceeds the configured upper
# bound (data_high_bound); rows equal to the bound are kept.
within_upper_bound = df['the thermal expansion'].le(data_high_bound)
df_cleaned = df.loc[within_upper_bound]

In [None]:
# Drop rows whose 'the thermal expansion' value is below the configured lower
# bound (data_low_bound); rows equal to the bound are kept.
within_lower_bound = df_cleaned['the thermal expansion'].ge(data_low_bound)
df_cleaned = df_cleaned.loc[within_lower_bound]

In [None]:
import class_plotpicture as pl

# Plot the target column again, this time for the bounded (cleaned) frame.
# NOTE(review): the same 'ImageOfThermal' name is passed as for the unfiltered
# plot; if the helper saves a file under that name, this call would overwrite
# the earlier image — confirm against class_plotpicture.
pl.plot_prediction_feature(df_cleaned, 'the thermal expansion', 'ImageOfThermal')

# 大致符合正态分布

# 划分数据，进行训练和测试

In [None]:
# Separate the regression target from the remaining (feature) columns
target_column = 'the thermal expansion'
all_labels = df_cleaned[target_column]
all_features = df_cleaned.drop(columns=target_column)
print(f'全部的特征：{all_features.shape}')
print(f'全部的标签：{all_labels.shape}')

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Standardize the labels (not the features). Both the raw and the standardized
# labels are kept as (n_samples, 1) column arrays; the split cell picks one of
# them depending on need_to_normalized.
all_labels = np.array(all_labels).reshape(-1, 1)
scaler = StandardScaler()
all_labels_scaler = scaler.fit_transform(all_labels)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The label array is the standardized
# one when need_to_normalized is set, otherwise the raw one.
labels_for_split = all_labels_scaler if need_to_normalized else all_labels
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    labels_for_split,
    test_size=0.2,
    random_state=42,
)
print(f'训练集的特征：{X_train.shape}, 标签：{y_train.shape}')
print(f'测试集的特征：{X_test.shape}, 标签：{y_test.shape}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Build the random-forest regressor.
# Earlier candidate settings that were tried, as
# (n_estimators, max_features, max_depth), all with min_samples_leaf=1:
#   (2, 38, 29), (66, 6, 41), (38, 20, 24), (15, 1, 43)
clf = RandomForestRegressor(random_state=42, n_estimators=14, max_features=1, min_samples_leaf=1, max_depth=43)
# Fit on the training split. y_train is an (n, 1) column array; flatten it to
# the 1-D shape the estimator expects, rather than relying on the implicit
# conversion (whose warning is silenced by the global warnings filter).
clf.fit(X_train, np.ravel(y_train))
train_pred = clf.predict(X_train)
# Predict on the held-out split
label_pred = clf.predict(X_test)
# Mean squared error and coefficient of determination (R^2) on the test split
mse = mean_squared_error(y_test, label_pred)
r2 = r2_score(y_test, label_pred)
# Report the evaluation metrics
print(f'MSE:{mse:.5f}')
print(f"R2: {r2:.5f}")

# 预测3

In [None]:
# Preview the workbook that will be scored.
# NOTE(review): this rebinds `data`, which until here held the training frame;
# re-running earlier cells after this one will operate on the wrong table.
data = pd.read_excel('./dataset/to_predict_3.xlsx')
print(data.head())

In [None]:
data_raw = pd.read_excel('./dataset/to_predict_3.xlsx')
# Work on an explicit copy so the column assignments below do not target a
# slice of data_raw (the resulting warning is hidden by the global filter).
data = data_raw[['pretty_formula']].copy()
print(data.head())

# Fix the column order so it matches the training feature order
all_elements = sorted(all_elements)

# Parse each formula once, then add one column per known element
# (0 where the element does not occur in the formula).
parsed_formulas = [
    parse_normalized_formulas(formula_text)
    for formula_text in data['pretty_formula']
]
for element in all_elements:
    data[element] = [composition.get(element, 0) for composition in parsed_formulas]

data = data.drop('pretty_formula', axis=1)
print(data.head())

pred = clf.predict(data)

print(f'pred: {pred}')
print(type(pred))

# When the model was trained on standardized labels, map the predictions back
# to the original units before persisting them.
if need_to_normalized:
    pred_to_save = scaler.inverse_transform(pred.reshape(-1, 1)).ravel()
else:
    pred_to_save = pred

# Save the input table with the prediction appended as a new column; sharing
# data_raw's index keeps the concat row-aligned.
pred_df = pd.DataFrame(pred_to_save, columns=['thermal expansion predict'], index=data_raw.index)

data_result = pd.concat([data_raw, pred_df], axis=1)

data_result.to_excel('./dataset/to_predict_3_result.xlsx', index=False)

# 预测

In [None]:
# Score a handful of hand-written compositions
lmo_formulas = [
    'Li0.25O0.75',
    'Li0.1667V0.1667F0.6667',
    f'Li{round(1/8, 4)}V{round(1/8, 4)}F{round(6/8, 4)}',
    f'Li{round(1/10, 4)}Fe{round(2/10, 4)}F{round(7/10, 4)}'
]

# Build the frame directly, so `lmo_data` is never bound to a plain dict first
lmo_data = pd.DataFrame({'Normalized_Formulas': lmo_formulas})

# Fix the column order so it matches the training feature order
all_elements = sorted(all_elements)

# Parse each formula once, then add one column per known element
# (0 where the element does not occur in the formula).
lmo_compositions = [
    parse_normalized_formulas(formula_text)
    for formula_text in lmo_data['Normalized_Formulas']
]
for element in all_elements:
    lmo_data[element] = [composition.get(element, 0) for composition in lmo_compositions]

print(lmo_data)

lmo_data = lmo_data.drop('Normalized_Formulas', axis=1)
print(lmo_data)

lmo_pred = clf.predict(lmo_data)

print(f'lmo_pred: {lmo_pred}')

if need_to_normalized:
    # Undo the label standardization (the scaler was fitted with fit_transform)
    salered_lmo_pred = scaler.inverse_transform(lmo_pred.reshape(-1,1))
    print(f'salered_lmo_pred: {salered_lmo_pred}')

# 预测LATP

In [None]:
import heapq


def find_closest_numbers(numbers, target, n):
    """Return the ``n`` entries of ``numbers`` that are closest to ``target``.

    This helper is called below but is not defined or imported anywhere in the
    visible notebook, so it is provided here to keep a fresh-kernel run working.

    Parameters
    ----------
    numbers : iterable of numbers
        Candidate values; their position is reported back as the index.
    target : number
        Value to compare against.
    n : int
        Maximum number of entries to return.

    Returns
    -------
    list of tuple
        ``(abs_diff, index, number)`` tuples, ordered by ascending absolute
        difference; ties are broken by position.
    """
    ranked = ((abs(number - target), index, number) for index, number in enumerate(numbers))
    return heapq.nsmallest(n, ranked)


latp_data_raw = pd.read_excel('./dataset/pred33Ti.xlsx')
# Explicit copy: the column assignments below must not target a slice
latp_data = latp_data_raw[['pretty_formula']].copy()
print(latp_data.head())

# Fix the column order so it matches the training feature order
all_elements = sorted(all_elements)

# Parse each formula once, then add one column per known element
# (0 where the element does not occur in the formula).
latp_compositions = [
    parse_normalized_formulas(formula_text)
    for formula_text in latp_data['pretty_formula']
]
for element in all_elements:
    latp_data[element] = [composition.get(element, 0) for composition in latp_compositions]

latp_data = latp_data.drop('pretty_formula', axis=1)
print(latp_data.head())

latp_pred = clf.predict(latp_data)

print(f'latp_pred: {latp_pred}')

# The target below is compared against predictions, so rank the values in
# original units: undo the label standardization first when it was applied.
if need_to_normalized:
    salered_latp_pred = scaler.inverse_transform(latp_pred.reshape(-1,1))
    ranking_values = salered_latp_pred.ravel()
else:
    ranking_values = latp_pred

target = 7.77845455
closest_numbers = find_closest_numbers(ranking_values, target, 50)

# Print the entries closest to the target together with their row position
for diff, index, number in closest_numbers:
    # `index` is a position from enumerate, so look the formula up positionally
    closest_formula = latp_data_raw['pretty_formula'].iloc[index]
    print(f"化学式: {closest_formula}, 预测结果: {number}, 与 {target} 的差: {diff}, 索引: {index}, ")

if need_to_normalized:
    print(f'salered_latp_pred: {salered_latp_pred}')
    sorted_salered_latp_pred = sorted(salered_latp_pred)
    print(f'sorted_salered_latp_pred: {sorted_salered_latp_pred}')


# 预测LATP（带离子电导率预测）

In [None]:
from utils import normalize_chemical_formula

# The CSV has no header row, so columns are addressed by position.
latp_with_conductivity_data_raw = pd.read_csv('./dataset/pred33Ti_with_conductivity_prediction.csv', header=None)

# Column 1 is used as the conductivity prediction and column 19 as the original
# formula text. NOTE(review): these positions are taken on trust from the
# variable names — confirm against the file layout.
latp_conductivity_column = latp_with_conductivity_data_raw.iloc[:, 1].reset_index(drop=True)
latp_original_formula_column = latp_with_conductivity_data_raw.iloc[:, 19].reset_index(drop=True)

# Normalize each formula with the project helper
latp_formula_column = latp_original_formula_column.map(normalize_chemical_formula)

latp_formula_column.name = 'pretty_formula'
latp_data = pd.DataFrame(latp_formula_column)
print(latp_data.head())

# Fix the column order so it matches the training feature order
all_elements = sorted(all_elements)

# Parse each formula once, then add one column per known element
# (0 where the element does not occur in the formula).
latp_compositions = [
    parse_normalized_formulas(formula_text)
    for formula_text in latp_data['pretty_formula']
]
for element in all_elements:
    latp_data[element] = [composition.get(element, 0) for composition in latp_compositions]

latp_data = latp_data.drop('pretty_formula', axis=1)
print(latp_data.head())

latp_pred = clf.predict(latp_data)

print(f'latp_pred: {latp_pred}')

if need_to_normalized:
    # Undo the label standardization (the scaler was fitted with fit_transform)
    salered_latp_pred = scaler.inverse_transform(latp_pred.reshape(-1,1))
    print(f'salered_latp_pred: {salered_latp_pred}')

In [None]:
import os

# Scatter plot: predicted thermal expansion on the x-axis, lithium-ion
# conductivity on the y-axis.
# Use predictions in original units: undo the label standardization when the
# model was trained on standardized labels.
if need_to_normalized:
    x = scaler.inverse_transform(latp_pred.reshape(-1, 1)).ravel()
else:
    x = latp_pred
# The stored conductivity is exponentiated (base 10) and scaled by 1000.
# NOTE(review): presumably a log10 value converted to a 1000x smaller unit — confirm.
y = 10**latp_conductivity_column * 1000
plt.figure(figsize=(8, 6))  # figure size
plt.scatter(x, y, color='blue', marker='o')  # scatter points

plt.title('Scatter Plot of latp_pred vs latp_conductivity_column')  # title
plt.xlabel('latp_pred')  # x-axis label
plt.ylabel('latp_conductivity_column')  # y-axis label
plt.grid(True)  # show grid
plt.show()  # render the figure

if need_to_write_result_to_csv:
    result_path = 'result/随机森林-最终预测结果.csv'
    # to_csv does not create missing directories, so make sure it exists
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    # Reuse the values that were plotted so the file matches the figure
    save_data_conductivity_TX = pd.DataFrame({'formula': latp_formula_column, 'original_formula': latp_original_formula_column, 'thermal_expansion': x, 'li ion conductivity': y})
    save_data_conductivity_TX.to_csv(result_path, index=False, header=True)

# 寻找 电导率 与 热膨胀系数 综合最优

In [None]:
# Use predictions in original units: undo the label standardization when the
# model was trained on standardized labels, since they are compared with a
# physical target value below.
if need_to_normalized:
    thermal_expansion_values = scaler.inverse_transform(latp_pred.reshape(-1, 1)).ravel()
else:
    thermal_expansion_values = latp_pred

data_conductivity_TX = pd.DataFrame({'thermal_expansion': thermal_expansion_values, 'formula': latp_formula_column, 'original_formula': latp_original_formula_column})
data_conductivity_TX['conductivity'] = latp_conductivity_column

# Desired thermal expansion and the half-width of the acceptance window
target_value = 7
tolerance = 0.5

# Absolute distance of each prediction from the target
data_conductivity_TX['abs_diff'] = (data_conductivity_TX['thermal_expansion'] - target_value).abs()

# For every distinct conductivity value keep the row(s) whose thermal expansion
# is closest to the target. A group-wise transform keeps all columns in place,
# so this does not depend on how groupby.apply treats the grouping column.
closest_per_conductivity = data_conductivity_TX.groupby('conductivity')['abs_diff'].transform('min')
result = (
    data_conductivity_TX[data_conductivity_TX['abs_diff'] == closest_per_conductivity]
    # Stable sort: ascending conductivity, original row order within ties
    .sort_values('conductivity', kind='stable')
    .reset_index(drop=True)
)

# Exponentiate (base 10) and scale by 1000, the same conversion used for the plot
result['conductivity'] = 10 ** result['conductivity'] * 1000

# Keep candidates strictly inside the window around the target and show the
# ten with the highest conductivity
inside_window = (
    (result['thermal_expansion'] > target_value - tolerance)
    & (result['thermal_expansion'] < target_value + tolerance)
)
filter_result = result[inside_window].sort_values(by='conductivity', ascending=False).head(10)

print(filter_result[['thermal_expansion', 'conductivity', 'abs_diff', 'formula', 'original_formula']])

In [None]:
# Take the 54 rows with the highest stored conductivity, then convert that
# column with the same 10**x * 1000 transform used elsewhere in the notebook.
result_54 = (
    data_conductivity_TX
    .sort_values(by='conductivity', ascending=False)
    .head(54)
    .assign(conductivity=lambda top_rows: 10 ** top_rows['conductivity'] * 1000)
)
result_54
