In [9]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Main competition tables: the labelled training set and the test set.
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')
# Keep only the structure column of the test set; a lowercase 'smiles' header
# (if present) is normalised to 'SMILES' first.
all_test = test.rename(columns={'smiles': 'SMILES'}).loc[:, ['SMILES']]
# Shared column layout for every training source; columns a source does not
# have are created empty (NaN) by reindex.
base_cols = ['SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']
train_base = train.reindex(base_cols, axis='columns')

In [6]:
# --- Load the supplementary ("additional") training data ---
# dataset1: provides Tc only, stored under the column name 'TC_mean'.
ds1 = pd.read_csv(
    '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv',
    sep=None,         # delimiter is auto-detected
    header=0,
    engine='python',  # name the engine explicitly to silence the parser warning
)
# Rename the label column, keep only structure + label, drop incomplete rows.
ds1 = (
    ds1.rename(columns={'TC_mean': 'Tc'})
       .loc[:, ['SMILES', 'Tc']]
       .dropna(subset=['SMILES', 'Tc'])
)
# Pad with empty columns for the targets this source does not provide, then
# arrange the columns in the shared base_cols order.
ds1_pad = ds1.assign(**dict.fromkeys(['Tg', 'FFV', 'Density', 'Rg'], np.nan))[base_cols]

# dataset3: Tg only (columns are expected to be SMILES and Tg; Tg_mean is
# recognised as well).
ds3_path = '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv'
# Delimiter guesses, tried in order until one of them yields both required
# columns.
_read_attempts = [
    dict(sep='\t', header=0),
    # sep=r'\s+' replaces the deprecated delim_whitespace=True, which emitted a
    # FutureWarning on every run.
    dict(sep=r'\s+', header=0),
    dict(sep=',', header=0, engine='python'),
]
for args in _read_attempts:
    try:
        _tmp = pd.read_csv(ds3_path, **args)
        # Strip surrounding whitespace and a possible BOM from the header names.
        _tmp.columns = _tmp.columns.str.strip().str.lstrip('\ufeff')
        # Case-insensitive lookup: lowercase name -> name as spelled in the file.
        cols = {c.lower(): c for c in _tmp.columns}
        smi = cols.get('smiles')
        tg  = cols.get('tg') or cols.get('tg_mean')
        if smi and tg:
            ds3 = _tmp.rename(columns={smi: 'SMILES', tg: 'Tg'})[['SMILES', 'Tg']].dropna(subset=['SMILES', 'Tg'])
            break
    except Exception:
        # Best effort: a failed parse only means this delimiter guess was
        # wrong, so move on to the next one.
        pass
else:
    # No attempt produced both columns: report what the header actually contains.
    cols_preview = pd.read_csv(ds3_path, sep='\t', header=0, nrows=0).columns.tolist()
    raise KeyError(f"dataset3 无法识别列名，实际列为：{cols_preview}")

# Pad with empty columns for the targets this source does not provide, then
# arrange the columns in the shared base_cols order.
ds3_pad = ds3.assign(FFV=np.nan, Tc=np.nan, Density=np.nan, Rg=np.nan)[base_cols]

# dataset4: FFV only (the label column may be called FFV or FFV_mean).
ds4_path = '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv'
# Delimiter guesses, tried in order until one of them yields both required
# columns.
_read_attempts = [
    {'sep': '\t', 'header': 0},
    {'sep': r'\s+', 'header': 0, 'engine': 'python'},  # used instead of delim_whitespace=True
    {'sep': ',', 'header': 0, 'engine': 'python'},
]
for args in _read_attempts:
    try:
        _tmp = pd.read_csv(ds4_path, **args)
        # Strip surrounding whitespace and a possible BOM from the header names.
        _tmp.columns = _tmp.columns.str.strip().str.lstrip('\ufeff')
        # Case-insensitive lookup: lowercase name -> name as spelled in the file.
        cols = {name.lower(): name for name in _tmp.columns}
        smi = cols.get('smiles')
        # First accepted spelling of the FFV column that is present, else None.
        ffv = next(
            (cols[key]
             for key in ('ffv', 'ffv_mean',
                         'free volume fraction', 'fractional free volume')
             if cols.get(key)),
            None,
        )
        if not (smi and ffv):
            continue
        ds4 = (_tmp.rename(columns={smi: 'SMILES', ffv: 'FFV'})
                   .loc[:, ['SMILES', 'FFV']]
                   .dropna(subset=['SMILES', 'FFV']))
        break
    except Exception:
        # Best effort: a failed parse only means this delimiter guess was wrong.
        continue
else:
    # No attempt produced both columns: report what the header actually contains.
    cols_preview = pd.read_csv(ds4_path, sep='\t', header=0, nrows=0).columns.tolist()
    raise KeyError(f"dataset4 无法识别列名，实际列为：{cols_preview}")

# Pad with empty columns for the targets this source does not provide, then
# arrange the columns in the shared base_cols order.
ds4_pad = ds4.assign(**dict.fromkeys(('Tg', 'Tc', 'Density', 'Rg'), np.nan))[base_cols]


# --- Merge: de-duplicate per molecule; on conflict the main training set wins ---
def _canonical_smiles(smi):
    """Return RDKit's canonical SMILES for `smi`.

    Values that are not strings, or that RDKit cannot parse, are returned
    unchanged so they keep grouping exactly as they did on the raw string.
    """
    if not isinstance(smi, str):
        return smi
    mol = Chem.MolFromSmiles(smi)
    return smi if mol is None else Chem.MolToSmiles(mol)


all_train = pd.concat([train_base, ds1_pad, ds3_pad, ds4_pad], ignore_index=True)
# Group on the canonical form rather than the raw string, so the same molecule
# written with different SMILES spellings in different sources collapses into
# one row. 'first' takes the first non-null value per column in concat order;
# train_base is concatenated first, so its values (and its SMILES spelling)
# take precedence whenever it has them.
all_train = (all_train
             .assign(_canon=all_train['SMILES'].map(_canonical_smiles))
             .groupby('_canon', as_index=False)
             .agg({'SMILES':'first','Tg':'first','FFV':'first','Tc':'first','Density':'first','Rg':'first'})
             .drop(columns='_canon'))


  _tmp = pd.read_csv(ds3_path, **args)


In [24]:
# Stronger ECFP-style fingerprint: chirality + feature invariants, radius 3, 4096 bits
def smiles_to_fp(smi, radius=3, nBits=4096):
    """Convert a SMILES string into a Morgan bit-vector fingerprint.

    Args:
        smi: SMILES string to parse.
        radius: Morgan radius passed to RDKit.
        nBits: length of the bit vector.

    Returns:
        A 1-D ``np.uint8`` array of length ``nBits``. If RDKit cannot parse
        ``smi`` the array is all zeros.
    """
    bits = np.zeros((nBits,), dtype=np.uint8)
    molecule = Chem.MolFromSmiles(smi)
    if molecule is not None:
        bitvect = AllChem.GetMorganFingerprintAsBitVect(
            molecule, radius, nBits, useChirality=True, useFeatures=True
        )
        # Copy the RDKit bit vector into the numpy buffer.
        DataStructs.ConvertToNumpyArray(bitvect, bits)
    return bits

# Simple random-SMILES generator (used only for inference-time augmentation)
def randomize_smiles_once(smi):
    """Return one randomized SMILES spelling of the molecule in ``smi``.

    If RDKit cannot parse ``smi``, the input string is returned unchanged.
    """
    mol = Chem.MolFromSmiles(smi)
    return smi if mol is None else Chem.MolToSmiles(mol, doRandom=True)

# Build several augmented fingerprints per test molecule (test-time augmentation).
# NOTE(review): the fingerprint is computed from the parsed molecule, so
# randomized spellings of the same molecule presumably give identical
# fingerprints — verify that this augmentation changes any prediction.
K = 5  # maximum number of distinct SMILES spellings per test molecule
X_test_list, test_map = [], []
for i, smi in enumerate(all_test["SMILES"].tolist()):
    # Original spelling plus up to K-1 randomized ones; the set removes repeats.
    variants = {smi}
    variants.update(randomize_smiles_once(smi) for _ in range(K - 1))
    X_test_list.extend(smiles_to_fp(v) for v in variants)
    # Remember which test row each augmented fingerprint belongs to.
    test_map.extend([i] * len(variants))
X_test_aug = np.vstack(X_test_list)
test_map = np.array(test_map, dtype=int)

# Prediction targets and the submission skeleton (ids only at this point).
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]
submission = pd.DataFrame(dict(id=test["id"].values))

# Fingerprint matrix for the training set: one row per training SMILES.
X = np.vstack([smiles_to_fp(s) for s in all_train["SMILES"]])

from sklearn.ensemble import ExtraTreesRegressor
# === Key point: fit / predict / write-back all happen inside the per-target loop ===
# Number of augmented rows each test molecule contributed. This is the same for
# every target, so it is computed once instead of once per target.
n_test = len(all_test)
counts = np.bincount(test_map, minlength=n_test)

for tgt in targets:
    # Train only on the rows that actually carry a label for this target.
    mask = all_train[tgt].notna()
    if not mask.any():
        raise ValueError(f"no labelled training rows for target {tgt!r}")
    X_train = X[mask.to_numpy()]
    y_train = all_train.loc[mask, tgt].values

    model = ExtraTreesRegressor(
        n_estimators=1600,
        max_depth=None,
        max_features=0.5,
        min_samples_leaf=2,
        min_samples_split=5,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # TTA: predict every augmented row, then average per original test row.
    # bincount with weights sums the predictions that share a test index.
    preds_all = model.predict(X_test_aug)
    preds_tta = np.bincount(test_map, weights=preds_all, minlength=n_test) / counts

    submission[tgt] = preds_tta  # one column is written per target

# Optional sanity check: all five target columns must be present.
assert all(col in submission.columns for col in targets), submission.columns




In [31]:
# Fix the column order (id first, then the targets) and write the submission file.
submission = submission.loc[:, ["id", *targets]]
submission.to_csv("/kaggle/working/submission.csv", index=False)