In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from mordred import Calculator, descriptors
import seaborn as sns
import matplotlib.pyplot as plt

# Load the formulation spreadsheet (absolute path on the author's machine).
df = pd.read_excel(
    r"D:\张雨林\研三工作\力学工站制备高通量_formulations_whole.xlsx"
)

# Distinct values of the third and fifth columns (positions 2 and 4).
# The rest of the script uses them as polymer names and ionic-liquid names.
polymers, ionic_liquids = (
    df.iloc[:, position].unique() for position in (2, 4)
)

# SMILES lookup tables, keyed by the names used in the spreadsheet.
# Strings carry no surrounding whitespace so they can be compared and
# passed to parsers verbatim.

# Polymer name -> SMILES of the corresponding vinyl monomer.
polymer_smiles = {
    'N-羟乙基丙烯酰胺': 'C=CC(=O)NCCO',
    '丙烯酸': 'C=CC(=O)O',
    '丙烯酸羟乙酯': 'C=CC(=O)OCCO',
    '丙烯酸苯甲酯': 'C=CC(=O)OCC1=CC=CC=C1',
    '丙烯酸丁酯': 'C=CC(=O)OCCCC',
    '4-羟基丁基丙烯酸酯': 'C=CC(=O)OCCCCO',
    '丙烯酸己酯': 'C=CC(=O)OCCCCCC',
    '4-丙烯酰吗啉': 'C=CC(=O)N1CCOCC1',
    '甲基丙烯酸甲酯': 'C=C(C)C(=O)OC',
    '甲基丙烯酸羟乙酯': 'C=C(C)C(=O)OCCO',
}

# Imidazolium cations (each carries a formal +1 charge).
cation_smiles = {
    'EMIM': 'CCN1C=C[N+](C)=C1',
    'BMIM': 'CCCCN1C=C[N+](C)=C1',
}

# Anions (each must carry a formal negative charge).
anion_smiles = {
    # Dicyanamide, [N(CN)2]-. The charged central nitrogen is required:
    # 'N#CC#N' would be neutral cyanogen.
    'DCA': 'N#C[N-]C#N',
    'BF4': 'F[B-](F)(F)F',
    # Bis(trifluoromethylsulfonyl)imide: two CF3SO2 groups on a
    # deprotonated nitrogen.
    'TFSI': 'FC(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F',
    'ES': 'CCOS(=O)(=O)[O-]',
    'OSF': 'O=S(=O)([O-])C(F)(F)F',
    # NOTE(review): this SMILES is dihydrogen phosphate (H2PO4-); confirm
    # that it is the species the 'DPO3' label refers to.
    'DPO3': 'O=P([O-])(O)O',
    'NO3': '[N+](=O)([O-])[O-]',
}





# Shared mordred calculator, created on first use. Building it registers the
# whole mordred descriptor set, so it is done once rather than per molecule.
_MORDRED_CALCULATOR = None


def _get_mordred_calculator():
    """Return the shared mordred ``Calculator``, creating it on first use."""
    global _MORDRED_CALCULATOR
    if _MORDRED_CALCULATOR is None:
        _MORDRED_CALCULATOR = Calculator(descriptors)
    return _MORDRED_CALCULATOR


def calculate_descriptors(smiles):
    """Compute RDKit and mordred descriptors for one molecule.

    Args:
        smiles: SMILES string of the molecule. Leading and trailing
            whitespace is ignored.

    Returns:
        dict mapping descriptor name to value. RDKit descriptors are added
        first and mordred descriptors second, so when both libraries use
        the same name the mordred value is the one kept.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles.strip())
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input instead of deep inside a descriptor call.
        raise ValueError(f"Invalid SMILES string: {smiles!r}")

    # RDKit descriptors: every (name, function) pair in Descriptors._descList.
    rdkit_desc = {name: func(mol) for name, func in Descriptors._descList}

    # Mordred descriptors; entries mordred could not compute are replaced by
    # fill_missing()'s default fill value.
    mordred_desc = _get_mordred_calculator()(mol).fill_missing()

    return {**rdkit_desc, **mordred_desc.asdict()}




# Compute one descriptor dict per polymer, cation and anion, keyed by name.
polymer_descriptors = dict(
    zip(polymer_smiles, map(calculate_descriptors, polymer_smiles.values()))
)
cation_descriptors = dict(
    zip(cation_smiles, map(calculate_descriptors, cation_smiles.values()))
)
anion_descriptors = dict(
    zip(anion_smiles, map(calculate_descriptors, anion_smiles.values()))
)

# Build the descriptor dict of every ionic liquid from its two ions.
# Each name is split on whitespace into exactly two tokens: the cation key
# followed by the anion key.
ionic_liquid_descriptors = {}
for il in ionic_liquids:
    cation, anion = il.split()
    # Prefix the keys so cation and anion descriptors cannot overwrite
    # each other.
    merged = {f"cation_{k}": v for k, v in cation_descriptors[cation].items()}
    merged.update(
        (f"anion_{k}", v) for k, v in anion_descriptors[anion].items()
    )
    ionic_liquid_descriptors[il] = merged

# Build the feature table: one row per spreadsheet row, combining the
# descriptors of that row's polymer (column position 2) and ionic liquid
# (column position 4).
# The row dicts are collected first and the DataFrame is constructed once;
# growing a frame with pd.concat inside the loop copies all accumulated rows
# on every iteration.
feature_rows = [
    {
        **polymer_descriptors[polymer],
        **ionic_liquid_descriptors[ionic_liquid],
    }
    for polymer, ionic_liquid in zip(df.iloc[:, 2], df.iloc[:, 4])
]
feature_df = pd.DataFrame(feature_rows)


# Report the feature counts: total number of columns, then the descriptor
# count of the first polymer and of the first ionic liquid.
print(f"总特征数量: {feature_df.shape[1]}")
print(f"聚合物特征数量: {len(polymer_descriptors[polymers[0]])}")
print(f"离子液体特征数量: {len(ionic_liquid_descriptors[ionic_liquids[0]])}")

# Draw one box per feature column and write the figure to disk.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=feature_df, ax=ax)
ax.set_title("特征分布")
plt.xticks(rotation=90)  # rotate the x tick labels by 90 degrees
fig.tight_layout()
fig.savefig("feature_distribution.png")

# Write the feature table to CSV without the index column.
feature_df.to_csv("material_features.csv", index=False)


for message in (
    "特征表格已保存为 material_features.csv",
    "特征分布图已保存为 feature_distribution.png",
):
    print(message)
