In [None]:
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# --- Run configuration ---
# Rows whose 'the thermal expansion' value lies outside
# [data_low_bound, data_high_bound] are removed in the cleaning step further down.
data_high_bound = 30
data_low_bound = -20
# When True, the model is trained on standardised labels instead of raw ones.
need_to_normalized = False
# When True, sweep curves and predictions are exported through utils.write_to_csv.
need_to_write_result_to_csv = True

data = pd.read_excel('./dataset/rt_df_thermo1.xlsx')
print(data.head())

print(f"max = {data['the thermal expansion'].max()}")
print(f"min = {data['the thermal expansion'].min()}")

# Number of rows inside the configured bounds (inclusive on both ends).
# The bounds come from the constants above rather than repeated literals, so
# this ratio always describes the same filter that the cleaning step applies.
in_range_count = data[
    (data['the thermal expansion'] >= data_low_bound)
    & (data['the thermal expansion'] <= data_high_bound)
].shape[0]

# Total number of rows
total_count = data.shape[0]

# Fraction of rows that fall inside the bounds
proportion = in_range_count / total_count
print(proportion)

In [None]:
# 函数：解析化学成分及其比例
def parse_normalized_formulas(formula):
    """
    Parse a formula string into a mapping of element symbol -> ratio.

    An element symbol is one uppercase letter followed by any number of
    lowercase letters. The ratio is the number written directly after the
    symbol: an integer, a decimal, or lowercase-``e`` scientific notation
    such as ``1e-05``.

    Edge cases:
      * A symbol with no number after it gets ratio 1.0 (the usual chemical
        convention) instead of being silently dropped.
      * A symbol that occurs more than once has its ratios summed instead of
        the last occurrence overwriting the earlier ones.

    Args:
        formula: Formula text, e.g. ``'Li0.1429Mn0.2857O0.5714'``.

    Returns:
        dict mapping each element symbol (str) to its ratio (float), in order
        of first appearance. An empty string yields an empty dict.
    """
    composition = {}
    # The numeric group is optional; findall yields '' for it when absent.
    for symbol, ratio in re.findall(r'([A-Z][a-z]*)(\d*\.?\d+(?:e[-+]?\d+)?)?', formula):
        amount = float(ratio) if ratio else 1.0
        composition[symbol] = composition.get(symbol, 0.0) + amount
    return composition

In [None]:
# Collect every distinct element symbol that appears in any formula
all_elements = {
    symbol
    for entry in data['Normalized_Formulas']
    for symbol in parse_normalized_formulas(entry)
}

In [None]:
# Show the collected element symbols as the cell output.
all_elements

In [None]:
# Sort the symbols so the generated columns always appear in the same order
all_elements = sorted(all_elements)

# Parse every formula exactly once. The per-element columns below read from
# these cached dicts instead of re-parsing the whole column for each element.
parsed_formulas = [parse_normalized_formulas(entry) for entry in data['Normalized_Formulas']]

# One new column per element; an element absent from a formula is filled with 0
for element in all_elements:
    data[element] = [composition.get(element, 0) for composition in parsed_formulas]

In [None]:
# Preview the first rows of the frame, which now carries one ratio column per element.
data.head()

In [None]:
# Build the modelling frame: a new DataFrame without the raw formula text,
# the normalised formula text and the ID column (`data` itself is left untouched).
df = data.drop(columns=['formula', 'Normalized_Formulas', 'ID'])
df.head()

# 脏数据清理

In [None]:
# Keep only the rows whose 'the thermal expansion' value lies inside
# [data_low_bound, data_high_bound] (both ends inclusive).
target_values = df['the thermal expansion']
within_bounds = (target_values <= data_high_bound) & (target_values >= data_low_bound)
df_cleaned = df[within_bounds]

# 划分数据，进行训练和测试

In [None]:
# Separate the cleaned frame into the regression target and the model inputs
label_column = 'the thermal expansion'
all_labels = df_cleaned[label_column]
all_features = df_cleaned.drop(columns=[label_column])
print(f'全部的特征：{all_features.shape}')
print(f'全部的标签：{all_labels.shape}')

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Prepare a standardised version of the labels (not the features). Whether the
# standardised or the raw labels are used is decided later by need_to_normalized.
labels_as_column = np.array(all_labels).reshape(-1, 1)
scaler = StandardScaler()
all_labels_scaler = scaler.fit_transform(labels_as_column)
# From here on the raw labels are kept as an (n, 1) array as well
all_labels = labels_as_column

In [None]:
from sklearn.model_selection import train_test_split
# Split the raw data first and only then, if requested, standardise the labels.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)
if need_to_normalized:
    # Re-fit the scaler on the training labels only, so that the mean/std of
    # the held-out test labels do not leak into the scaling. The same scaler
    # object is re-used further down to map predictions back to original units.
    y_train = scaler.fit_transform(y_train)
    y_test = scaler.transform(y_test)
print(f'训练集的特征：{X_train.shape}, 标签：{y_train.shape}')
print(f'测试集的特征：{X_test.shape}, 标签：{y_test.shape}')

# 超参数优化

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Sweep n_neighbors for the KNN regressor and record test-set MSE / R2 per value.
# NOTE(review): the best value is selected on the same test split that is later
# used to report the final score; consider a validation split or cross-validation.
cllo_mse = []
cllo_r2 = []
# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the labels (a fixed sentinel such as 100 could be
# lower than every real MSE and leave a meaningless "best").
best_idx, best_mse, best_r2 = 0, float('inf'), 0
# A KNN model cannot return more neighbours than it has training samples,
# so the upper end of the sweep is capped by the training-set size.
n_neighbors_values = range(1, min(274, len(X_train)) + 1)
for i in n_neighbors_values:
    # Progress message shows the n_neighbors value being evaluated
    print(f'第{i}个')
    clf = KNeighborsRegressor(n_neighbors=i)
    # Fit on the training split
    clf.fit(X_train,y_train)
    # Predict the test split
    label_pred = clf.predict(X_test)
    # Mean squared error and R2 on the test split
    mse = mean_squared_error(y_test, label_pred)
    r2 = r2_score(y_test, label_pred)
    cllo_mse.append(mse)
    cllo_r2.append(r2)
    if best_mse > mse:
        best_mse = mse
        best_idx = i
        best_r2 = r2
print(f'在K近邻调优 n_neighbors 过程中，最好的效果：mse:{best_mse:.5f}, r2:{best_r2:.5f}, n_neighbors:{best_idx}')

# Plot against the actual parameter values so the x-axis reads as n_neighbors
# (plotting the bare list would label the first point 0 instead of 1).
plt.figure(figsize=(10,6))
plt.subplot(121)
plt.plot(n_neighbors_values, cllo_mse)
plt.title('在调节 n_neighbors 参数时MSE的变化')
plt.subplot(122)
plt.plot(n_neighbors_values, cllo_r2)
plt.title('在调节 n_neighbors 参数时R2的变化')
plt.show()

if need_to_write_result_to_csv:
    import utils
    filename = 'KNN-超参数的不同取值下模型的mse变化曲线-n_neighbors'
    utils.write_to_csv(f'result/{filename}.csv', range(1, len(cllo_mse) + 1), cllo_mse, 'n_neighbors', 'mse')

In [None]:
# Sweep leaf_size (with n_neighbors fixed at 2) and record test-set MSE / R2.
# NOTE(review): the scikit-learn documentation describes leaf_size as a
# BallTree/KDTree setting that affects construction/query speed and memory,
# so it is not expected to change the predictions much — confirm this sweep is useful.
cllo_mse = []
cllo_r2 = []
# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the labels.
best_idx, best_mse, best_r2 = 0, float('inf'), 0
leaf_size_values = range(1, 301)
for i in leaf_size_values:
    # Progress message shows the leaf_size value being evaluated
    print(f'第{i}个')
    clf = KNeighborsRegressor(n_neighbors=2, leaf_size=i)
    # Fit on the training split
    clf.fit(X_train,y_train)
    # Predict the test split
    label_pred = clf.predict(X_test)
    # Mean squared error and R2 on the test split
    mse = mean_squared_error(y_test, label_pred)
    r2 = r2_score(y_test, label_pred)
    cllo_mse.append(mse)
    cllo_r2.append(r2)
    if best_mse > mse:
        best_mse = mse
        best_idx = i
        best_r2 = r2
print(f'在K近邻调优 leaf_size 过程中，最好的效果：mse:{best_mse:.5f}, r2:{best_r2:.5f}, leaf_size:{best_idx}')

# Plot against the actual parameter values so the x-axis reads as leaf_size
plt.figure(figsize=(10,6))
plt.subplot(121)
plt.plot(leaf_size_values, cllo_mse)
plt.title('在调节 leaf_size 参数时MSE的变化')
plt.subplot(122)
plt.plot(leaf_size_values, cllo_r2)
plt.title('在调节 leaf_size 参数时R2的变化')
plt.show()

if need_to_write_result_to_csv:
    import utils
    filename = 'KNN-超参数的不同取值下模型的mse变化曲线-leaf_size'
    utils.write_to_csv(f'result/{filename}.csv', range(1, len(cllo_mse) + 1), cllo_mse, 'leaf_size', 'mse')

In [None]:
# Sweep the Minkowski power parameter p (with n_neighbors=2, leaf_size=1)
# and record test-set MSE / R2 for each value.
cllo_mse = []
cllo_r2 = []
# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the labels.
best_idx, best_mse, best_r2 = 0, float('inf'), 0
p_values = range(1, 11)
for i in p_values:
    # Progress message shows the p value being evaluated
    print(f'第{i}个')
    clf = KNeighborsRegressor(n_neighbors=2, leaf_size=1, p=i)
    # Fit on the training split
    clf.fit(X_train,y_train)
    # Predict the test split
    label_pred = clf.predict(X_test)
    # Mean squared error and R2 on the test split
    mse = mean_squared_error(y_test, label_pred)
    r2 = r2_score(y_test, label_pred)
    cllo_mse.append(mse)
    cllo_r2.append(r2)
    if best_mse > mse:
        best_mse = mse
        best_idx = i
        best_r2 = r2
name = 'p'
print(f'在K近邻调优{name}过程中，最好的效果：mse:{best_mse:.5f}, r2:{best_r2:.5f}, {name}:{best_idx}')

# Plot against the actual parameter values so the x-axis reads as p
plt.figure(figsize=(10,6))
plt.subplot(121)
plt.plot(p_values, cllo_mse)
plt.title(f'在调节{name}参数时MSE的变化')
plt.subplot(122)
plt.plot(p_values, cllo_r2)
plt.title(f'在调节{name}参数时R2的变化')
plt.show()

if need_to_write_result_to_csv:
    import utils
    filename = 'KNN-超参数的不同取值下模型的mse变化曲线-p'
    utils.write_to_csv(f'result/{filename}.csv', range(1, len(cllo_mse) + 1), cllo_mse, 'p', 'mse')

In [None]:
# Build the final K-nearest-neighbours regressor with fixed hyperparameters
# and fit it on the training split (fit returns the estimator itself).
clf = KNeighborsRegressor(n_neighbors=2, leaf_size=1, p=5).fit(X_train, y_train)
# Predictions for both splits; the training-set ones are kept for later plotting/export
train_pred = clf.predict(X_train)
label_pred = clf.predict(X_test)
# Test-set mean squared error and R2
mse, r2 = mean_squared_error(y_test, label_pred), r2_score(y_test, label_pred)
# Report the evaluation metrics
print(f'MSE:{mse:.5f}')
print(f"R2: {r2:.5f}")

In [None]:
# Bring labels and predictions to original units as (n, 1) column arrays.
if need_to_normalized:
    # The model was trained on standardised labels: undo the scaling
    salered_train_label_pred = scaler.inverse_transform(train_pred.reshape(-1,1))
    salered_train_labels = scaler.inverse_transform(y_train)
    # Test split
    salered_test_labels = scaler.inverse_transform(y_test)
    salered_label_pred = scaler.inverse_transform(label_pred.reshape(-1,1))
else:
    # Labels were never scaled: only normalise the shape
    salered_train_label_pred = train_pred.reshape(-1, 1)
    salered_train_labels = y_train.reshape(-1, 1)
    salered_test_labels = y_test.reshape(-1, 1)
    salered_label_pred = label_pred.reshape(-1, 1)

# One predicted-vs-true scatter plot per split
for true_values, pred_values, plot_title in (
    (salered_train_labels, salered_train_label_pred, 'KNN的训练集预测结果'),
    (salered_test_labels, salered_label_pred, 'KNN的测试集预测结果'),
):
    plt.scatter(true_values, pred_values, color='blue', label='Predicted vs True')
    # Y=X reference line (perfect prediction). Its end points span the joint
    # range of true and predicted values, so the line covers every plotted point
    # (taking min from one array and max from the other could cut it short).
    line_start = min(true_values.min(), pred_values.min())
    line_end = max(true_values.max(), pred_values.max())
    plt.plot([line_start, line_end], [line_start, line_end], 'r--', label='Perfect Prediction (Y=X)')
    plt.legend()
    # Axis labels and title
    plt.xlabel('真确的值')
    plt.ylabel('预测的值')
    plt.title(plot_title)
    plt.show()

# Optionally export the true/predicted pairs of both splits
if need_to_write_result_to_csv:
    import utils
    filename = 'KNN-训练集-预测结果vs真实结果'
    utils.write_to_csv(f'result/{filename}.csv', salered_train_labels.reshape(-1), salered_train_label_pred.reshape(-1), 'trainset real', 'trainset predict')
    filename = 'KNN-测试集-预测结果vs真实结果'
    utils.write_to_csv(f'result/{filename}.csv', salered_test_labels.reshape(-1), salered_label_pred.reshape(-1), 'testset real', 'testset predict')

# 查看预测的偏差值

In [None]:

# Count test samples where the prediction is >= the true value; all remaining
# samples are counted as predictions below the true value.
big_num = sum(
    1
    for pred_row, true_row in zip(label_pred, y_test)
    if pred_row.mean() >= true_row.mean()
)
small_num = len(label_pred) - big_num
print(f'预测值大于原值的个数：{big_num}, 预测值小于原值的个数：{small_num}')

# Signed error per test sample: true value minus predicted value
value = [
    true_row - pred_row
    for true_row, pred_row in zip(salered_test_labels, salered_label_pred)
]

bins = np.arange(-70, 71, 10)  # bin edges from -70 to 70 in steps of 10
counts, _ = np.histogram(value, bins=bins)
print(counts)
bin_centers = (bins[:-1] + bins[1:]) / 2

# Bar chart of the error histogram
plt.figure(figsize=(10, 6))
plt.bar(bin_centers, counts, width=5, color='skyblue', edgecolor='black')  # bars are 5 wide, i.e. half of each 10-wide bin

# Grid, axis labels and title
plt.grid(True)
plt.xlabel('偏移量区间')
plt.ylabel('统计个数')
plt.title('误差偏移量')
plt.show()

# 预测LiMn2O4

In [None]:
# Predict the thermal expansion of LiMn2O4.
# The formula is written with atomic fractions (1/7, 2/7, 4/7) rounded to 4 decimals.
lmo_data = {
    'Normalized_Formulas': f'Li{round(1/7, 4)}Mn{round(2/7, 4)}O{round(4/7, 4)}'
}

# Single-row DataFrame holding the formula
lmo_data = pd.DataFrame(lmo_data, index=[0])

# Keep the element columns in the same sorted order used when building the training frame
all_elements = sorted(all_elements)

# Parse the single formula once, then add one column per element (0 when absent)
lmo_composition = parse_normalized_formulas(lmo_data.loc[0, 'Normalized_Formulas'])
for element in all_elements:
    lmo_data[element] = [lmo_composition.get(element, 0)]

print(lmo_data)

# The model only takes the element-ratio columns
lmo_data = lmo_data.drop('Normalized_Formulas', axis=1)
print(lmo_data)

lmo_pred = clf.predict(lmo_data)

print(f'lmo_pred: {lmo_pred}')
# Undo the label scaling only if the model was trained on standardised labels.
# With need_to_normalized False the prediction is already in original units,
# and applying inverse_transform to it would distort the value.
if need_to_normalized:
    salered_lmo_pred = scaler.inverse_transform(lmo_pred.reshape(-1,1))
else:
    salered_lmo_pred = lmo_pred.reshape(-1, 1)
print(f'salered_lmo_pred: {salered_lmo_pred}')