In [13]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.ensemble import RandomForestRegressor

# Primary competition training set and the test set to predict on.
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

# Common column layout shared by every training source. Reindexing creates any
# column that `train` lacks and leaves it empty (NaN).
base_cols = ['SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']
train_base = train.reindex(base_cols, axis='columns')


In [25]:
# --- Load the additional (supplementary) datasets ---
# dataset1: contributes Tc only, stored in the file under the name TC_mean.
# sep=None lets pandas sniff the delimiter; the python engine is named
# explicitly so pandas does not warn about falling back to it.
ds1_raw = pd.read_csv(
    '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv',
    sep=None,
    header=0,
    engine='python',
)
ds1 = (
    ds1_raw
    .rename(columns={'TC_mean': 'Tc'})
    .loc[:, ['SMILES', 'Tc']]
    .dropna(subset=['SMILES', 'Tc'])
)
# Pad with the properties this file does not provide, then apply the shared column order.
ds1_pad = ds1.assign(Tg=np.nan, FFV=np.nan, Density=np.nan, Rg=np.nan)[base_cols]

# dataset3 contributes only Tg and dataset4 contributes only FFV. Both files are
# read with the same delimiter-probing logic, so that logic lives in one helper.

# Parser settings tried in order until one yields the expected columns.
_read_attempts = [
    dict(sep='\t', header=0),
    dict(sep=r'\s+', header=0, engine='python'),  # replaces the deprecated delim_whitespace=True
    dict(sep=',', header=0, engine='python'),
]


def _load_supplement(path, target, aliases, label):
    """Read a supplementary file and return a two-column ['SMILES', target] frame.

    Each entry of ``_read_attempts`` is tried in turn. An attempt is accepted
    when, after stripping whitespace and a leading BOM from the header names,
    the file has a ``smiles`` column and one of ``aliases`` (both matched
    case-insensitively).

    Parameters
    ----------
    path : str
        Location of the delimited text file.
    target : str
        Name given to the property column in the returned frame.
    aliases : sequence of str
        Lower-case header names accepted for the property, in priority order.
    label : str
        Dataset name used in the error message.

    Returns
    -------
    pandas.DataFrame
        Columns ['SMILES', target]; rows with a missing value in either are dropped.

    Raises
    ------
    KeyError
        If no attempt produced the expected columns. The last parsing error
        (if any) is attached as the exception's cause instead of being
        silently discarded.
    """
    last_error = None
    for read_kwargs in _read_attempts:
        try:
            frame = pd.read_csv(path, **read_kwargs)
            frame.columns = frame.columns.str.strip().str.lstrip('\ufeff')
            by_lower = {name.lower(): name for name in frame.columns}
            smiles_col = by_lower.get('smiles')
            target_col = next((by_lower[a] for a in aliases if a in by_lower), None)
            if smiles_col and target_col:
                return (frame.rename(columns={smiles_col: 'SMILES', target_col: target})
                             [['SMILES', target]]
                             .dropna(subset=['SMILES', target]))
        except Exception as exc:
            # Best effort: remember why this attempt failed and try the next one.
            last_error = exc

    # Nothing matched: report the header as seen with the first parser setting.
    cols_preview = pd.read_csv(path, sep='\t', header=0, nrows=0).columns.tolist()
    raise KeyError(f"{label} 无法识别列名，实际列为：{cols_preview}") from last_error


# dataset3: Tg only (header may be Tg or Tg_mean).
ds3_path = '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv'
ds3 = _load_supplement(ds3_path, 'Tg', ('tg', 'tg_mean'), 'dataset3')
ds3_pad = ds3.assign(FFV=np.nan, Tc=np.nan, Density=np.nan, Rg=np.nan)[base_cols]

# dataset4: FFV only (header may be FFV, FFV_mean, or a spelled-out name).
ds4_path = '/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv'
ds4 = _load_supplement(
    ds4_path,
    'FFV',
    ('ffv', 'ffv_mean', 'free volume fraction', 'fractional free volume'),
    'dataset4',
)
ds4_pad = ds4.assign(Tg=np.nan, Tc=np.nan, Density=np.nan, Rg=np.nan)[base_cols]


# --- Merge: one row per SMILES; on conflict the main training set wins ---
# train_base is concatenated first and 'first' takes the first non-null value
# in each group, so a supplementary value is used only where the main set has none.
# NOTE(review): SMILES are matched as raw strings, so the same molecule written
# two different ways is not merged.
stacked = pd.concat([train_base, ds1_pad, ds3_pad, ds4_pad], ignore_index=True)
first_non_null = dict.fromkeys(['Tg', 'FFV', 'Tc', 'Density', 'Rg'], 'first')
all_train = stacked.groupby('SMILES', as_index=False).agg(first_non_null)


  _tmp = pd.read_csv(ds3_path, **args)


In [27]:


# --- Fingerprint function ---
def smiles_to_fp(smi, radius=2, nBits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Parameters
    ----------
    smi : str
        SMILES string. Any non-string value (for example NaN from a missing
        CSV cell) is treated the same as an unparseable SMILES.
    radius : int
        Morgan radius passed to RDKit.
    nBits : int
        Length of the fingerprint.

    Returns
    -------
    numpy.ndarray
        Array of shape (nBits,) and dtype uint8. All zeros when no molecule
        could be built from ``smi``.
    """
    # Guard before calling RDKit: MolFromSmiles is only given real strings, so
    # one missing value cannot abort fingerprinting of a whole column.
    if not isinstance(smi, str):
        return np.zeros((nBits,), dtype=np.uint8)
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return np.zeros((nBits,), dtype=np.uint8)
    # NOTE(review): newer RDKit releases appear to favour rdFingerprintGenerator
    # over this call; confirm before upgrading.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    arr = np.zeros((nBits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# --- Build the feature matrices: one fingerprint row per SMILES ---
X = np.vstack([smiles_to_fp(smi) for smi in all_train["SMILES"]])
X_test = np.vstack([smiles_to_fp(smi) for smi in test["SMILES"]])

# --- Target list (in the required column order) ---
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]

# --- Start the submission frame with the test ids; predictions are added per target ---
submission = pd.DataFrame({"id": test["id"].to_numpy()})

# --- Same recipe for every target: select labelled rows -> fit -> predict -> add column ---
# The hyperparameters are identical for every target, so they are named once.
rf_params = dict(
    n_estimators=1000,
    max_depth=18,
    max_features="sqrt",
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

for tgt in targets:
    # Train only on rows where this target is actually present.
    labeled = all_train[tgt].notna().to_numpy()
    X_train = X[labeled]
    y_train = all_train.loc[labeled, tgt].to_numpy()

    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    submission[tgt] = model.predict(X_test)



In [32]:
# Fix the column order (id first, then the targets) and write the submission file.
submission = submission.loc[:, ["id", *targets]]
submission.to_csv("/kaggle/working/submission.csv", index=False)