In [None]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
from tqdm import tqdm

In [None]:
# Register tqdm's pandas integration so that `.progress_apply` (used further
# down in this notebook) is available and renders a progress bar.
tqdm.pandas() 

In [3]:
# Input CSV locations (relative paths, resolved against the working directory).
# pretrain_path: pre-training corpus whose molecules must not appear in the test set.
# test_path: candidate test set that is filtered against the corpus below.
pretrain_path = "../../data/processed/npatlas.csv"
test_path = "../../data/processed/CMNPD2.0_test_set.csv"

In [4]:
# Read the pre-training corpus and the candidate test set (in that order)
# into DataFrames; later cells read the "SMILES" column from each.
pretrain_df, test_df = map(pd.read_csv, (pretrain_path, test_path))

In [6]:
def canonicalize_no_stereo(smiles):
    """Return a canonical SMILES string with all stereochemistry removed.

    Parameters
    ----------
    smiles : str
        Input SMILES string. Anything that is not a ``str`` (for example
        ``None`` or the float ``NaN`` pandas uses for empty CSV cells) is
        treated as unparseable.

    Returns
    -------
    str or None
        Canonical, non-isomeric SMILES, or ``None`` when the input is not a
        string or RDKit cannot parse it.
    """
    # Missing cells arrive as float NaN; RDKit's parser only accepts strings,
    # so bail out here rather than letting one bad row abort a whole `apply`.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Clear stereo information on the molecule itself and also request
    # non-isomeric output, so stereoisomers map to the same string.
    Chem.RemoveStereochemistry(mol)
    return Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)

# Add a stereo-free canonical SMILES column to both frames (pre-training
# corpus first, then the test set) so identical structures compare equal
# regardless of how they were originally written.
for frame in (pretrain_df, test_df):
    frame["std_smiles"] = frame["SMILES"].progress_apply(canonicalize_no_stereo)

# Molecules that could not be standardised come back as None and are left
# out of the lookup set.
pretrain_smiles_set = set(pretrain_df["std_smiles"].dropna())

# Keep only test rows whose standardised SMILES is absent from the
# pre-training set. Rows with a missing `std_smiles` never match and are
# therefore kept. The helper column is removed again before saving.
in_pretrain = test_df["std_smiles"].isin(pretrain_smiles_set)
filtered_test_df = test_df.loc[~in_pretrain].drop(columns=["std_smiles"])

filtered_test_df.to_csv("../../data/processed/final_testset.csv", index=False)

# Report how many test molecules were removed as overlaps.
n_original = len(test_df)
n_kept = len(filtered_test_df)
print(f"original testset：{n_original}")
print(f"new testset：{n_kept}")
print(f"remove：{n_original - n_kept}")

100%|██████████| 36454/36454 [00:09<00:00, 3881.38it/s]
100%|██████████| 5922/5922 [00:01<00:00, 3781.77it/s]

original testset：5922
new testset：3659
remove：2263



