In [1]:
import os, time
from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed
from ogb.utils.url import download_url, extract_zip

from molecules.physics import generate_physics_dict_structure
from molecules.parse_sdf import sdf_to_mols

In [2]:
# --- Dataset selection ---------------------------------------------------
# False -> work on the FULL dataset; True -> use the small "_micro" sample instead.
USE_MICRO_DATASET = False
# Base directory under which the archive, extracted folder and CSV paths are built.
ROOT = "."
# Suffix spliced into the SDF archive/file names to pick the micro variant.
if USE_MICRO_DATASET:
    micro_name = "_micro"
else:
    micro_name = ""

# --- SDF archive ---------------------------------------------------------
SDF_URL = f"https://datasets-public-research.s3.us-east-2.amazonaws.com/PCQM4M/pcqm4m-v2_sdf{micro_name}.zip"
SDF_LOCAL_ZIP = f"{ROOT}/pcqm4m-v2_sdf{micro_name}.zip"
# Extracted folder path = local archive path with its ".zip" extension stripped.
SDF_LOCAL = os.path.splitext(SDF_LOCAL_ZIP)[0]

# --- Raw table (read later with pandas) ----------------------------------
RAW_URL = "http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2.zip"
RAW_CSV = f"{ROOT}/pcqm4m-v2/raw/data.csv.gz"

# True -> skip the download/unzip cells (files already present locally);
# False -> download and unzip both archives.
SKIP_DOWNLOAD = True

In [3]:
# Fetch and unpack the SDF archive; skipped entirely when SKIP_DOWNLOAD is True.
# download_url / extract_zip are the helpers imported from ogb.utils.url.
if not SKIP_DOWNLOAD:
    # This can take many minutes
    # `path` is whatever download_url returns; it is passed straight to extract_zip.
    path = download_url(SDF_URL, ROOT)
    extract_zip(path, ROOT)

In [4]:
# Fetch and unpack the raw archive from RAW_URL; skipped when SKIP_DOWNLOAD is True.
# NOTE(review): RAW_CSV is presumably produced by this extraction — confirm the archive layout.
if not SKIP_DOWNLOAD:
    # `raw_path` is whatever download_url returns; it is passed straight to extract_zip.
    raw_path = download_url(RAW_URL, ROOT)
    extract_zip(raw_path, ROOT)

In [5]:
def get_files_from_dir(dir, ext):
    """Recursively list files under ``dir`` whose names end with ``ext``.

    Args:
        dir: Directory to walk (all nested sub-directories are visited).
        ext: Suffix a file name must end with to be kept, e.g. ``".sdf"``.

    Returns:
        A list of paths, each built by joining the directory that contains
        the file with the file name, in the order ``os.walk`` yields them.
        The list is empty when nothing matches.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir)
        for name in names
        if name.endswith(ext)
    ]


In [6]:
# Collect every path ending in ".sdf" found anywhere under the extracted SDF folder.
sdf_files = get_files_from_dir(SDF_LOCAL, ".sdf")
# NOTE(review): the purpose of this one-second pause is not evident from the code — confirm before removing.
time.sleep(1)
# Preview the first ten paths as a quick sanity check.
print(sdf_files[:10])

['./pcqm4m-v2_sdf/00250000_00259999/252506.sdf', './pcqm4m-v2_sdf/00250000_00259999/253520.sdf', './pcqm4m-v2_sdf/00250000_00259999/253300.sdf', './pcqm4m-v2_sdf/00250000_00259999/259104.sdf', './pcqm4m-v2_sdf/00250000_00259999/256366.sdf', './pcqm4m-v2_sdf/00250000_00259999/254181.sdf', './pcqm4m-v2_sdf/00250000_00259999/256379.sdf', './pcqm4m-v2_sdf/00250000_00259999/252575.sdf', './pcqm4m-v2_sdf/00250000_00259999/255187.sdf', './pcqm4m-v2_sdf/00250000_00259999/257670.sdf']


In [7]:
# Parse each SDF file and store the result in `mols`, keyed by the integer
# encoded in the file name (e.g. ".../252506.sdf" -> 252506).
mols = {}
for file in tqdm(sdf_files):
    try:
        # File name without directory or extension must be an integer index.
        idx = int(os.path.basename(os.path.splitext(file)[0]))
        # Keep only the first entry returned by sdf_to_mols for this file.
        mols[idx] = sdf_to_mols(file)[0]
    except Exception as e:
        # Best-effort load: a bad file is skipped rather than aborting the whole
        # run, but the cause is reported so the failure can be diagnosed.
        print(f"Error loading file {file}: {type(e).__name__}: {e}")

[20:54:19] Conflicting single bond directions around double bond at index 4.
[20:54:19]   BondStereo set to STEREONONE and single bond directions set to NONE.
100%|██████████| 3378606/3378606 [20:53<00:00, 2695.41it/s]


In [8]:
# Run generate_physics_dict_structure on every (mol, idx) pair via joblib;
# n_jobs=-1 asks joblib to use all available CPUs.
# Serial equivalent, useful when debugging a single failure:
#   list_struct = [generate_physics_dict_structure(mol, idx) for idx, mol in tqdm(mols.items())]
list_struct = Parallel(n_jobs=-1)(
    delayed(generate_physics_dict_structure)(mol, idx) for idx, mol in tqdm(mols.items())
)
# Results can be None (presumably the molecules logged as "Failed for molecule ...").
# Drop them BEFORE reading the column names: taking the keys from list_struct[0]
# would raise TypeError whenever the very first result happens to be None.
valid_structs = [dic for dic in list_struct if dic is not None]
if not valid_structs:
    raise ValueError(
        "generate_physics_dict_structure produced no valid result; cannot build the DataFrame"
    )
# Transpose the list of per-molecule dicts into one list per column, using the
# keys of the first valid result as the column set.
dict_struct = {k: [dic[k] for dic in valid_structs] for k in valid_structs[0]}
df = pd.DataFrame(dict_struct)
df

  1%|          | 32704/3378606 [00:11<03:47, 14696.78it/s]

Failed for molecule idx=2774781 SMILES=C


  3%|▎         | 98240/3378606 [00:15<03:01, 18043.55it/s]

Failed for molecule idx=3342150 SMILES=N
Failed for molecule idx=3347837 SMILES=O
Failed for molecule idx=3349913 SMILES=O
Failed for molecule idx=3348987 SMILES=P


  3%|▎         | 106432/3378606 [00:16<03:03, 17823.61it/s]

Failed for molecule idx=3342147 SMILES=Br
Failed for molecule idx=3347108 SMILES=O
Failed for molecule idx=3346961 SMILES=C
Failed for molecule idx=3341898 SMILES=Cl
Failed for molecule idx=3346957 SMILES=Br


  5%|▍         | 159680/3378606 [00:19<03:03, 17569.17it/s]

Failed for molecule idx=1743406 SMILES=C


  5%|▍         | 163776/3378606 [00:19<02:58, 17988.87it/s]

Failed for molecule idx=1749315 SMILES=P


  6%|▌         | 192448/3378606 [00:20<02:48, 18897.95it/s]

Failed for molecule idx=1646655 SMILES=S


  6%|▌         | 196544/3378606 [00:21<02:55, 18137.11it/s]

Failed for molecule idx=2278265 SMILES=Cl


  6%|▌         | 200640/3378606 [00:21<02:58, 17837.02it/s]

Failed for molecule idx=2278189 SMILES=N


  9%|▉         | 303040/3378606 [00:26<02:48, 18253.10it/s]

Failed for molecule idx=1711137 SMILES=N


  9%|▉         | 307136/3378606 [00:27<02:48, 18200.41it/s]

Failed for molecule idx=1711136 SMILES=N


 12%|█▏        | 389056/3378606 [00:32<02:49, 17604.87it/s]

Failed for molecule idx=1857468 SMILES=F
Failed for molecule idx=1853781 SMILES=P
Failed for molecule idx=1855419 SMILES=P
Failed for molecule idx=1858155 SMILES=C


 12%|█▏        | 393152/3378606 [00:32<02:46, 17974.53it/s]

Failed for molecule idx=1857471 SMILES=N
Failed for molecule idx=1857470 SMILES=F


 12%|█▏        | 397248/3378606 [00:32<02:36, 19042.59it/s]

Failed for molecule idx=1857469 SMILES=F


 12%|█▏        | 417728/3378606 [00:33<02:43, 18142.35it/s]

Failed for molecule idx=3366711 SMILES=S


 12%|█▏        | 421824/3378606 [00:34<02:49, 17406.71it/s]

Failed for molecule idx=3362848 SMILES=S
Failed for molecule idx=3364037 SMILES=F


 15%|█▍        | 491456/3378606 [00:38<02:44, 17503.86it/s]

Failed for molecule idx=1752280 SMILES=C
Failed for molecule idx=1755906 SMILES=Br


 16%|█▌        | 540608/3378606 [00:40<02:39, 17827.55it/s]

Failed for molecule idx=3355884 SMILES=F


 18%|█▊        | 597952/3378606 [00:44<02:35, 17862.47it/s]

Failed for molecule idx=3144495 SMILES=F


 23%|██▎       | 774080/3378606 [00:54<02:27, 17656.45it/s]

Failed for molecule idx=2904023 SMILES=S


 24%|██▍       | 815040/3378606 [00:56<02:21, 18070.92it/s]

Failed for molecule idx=85975 SMILES=O


 26%|██▌       | 876480/3378606 [01:00<02:24, 17264.20it/s]

Failed for molecule idx=3286940 SMILES=C


 26%|██▌       | 880576/3378606 [01:00<02:24, 17321.76it/s]

Failed for molecule idx=3286931 SMILES=C


 27%|██▋       | 913344/3378606 [01:02<02:21, 17450.70it/s]

Failed for molecule idx=2932345 SMILES=Cl


 28%|██▊       | 937920/3378606 [01:03<02:18, 17671.06it/s]

Failed for molecule idx=3159107 SMILES=[SiH4]


 37%|███▋      | 1261504/3378606 [01:22<01:52, 18857.97it/s]

Failed for molecule idx=1818416 SMILES=N
Failed for molecule idx=1812920 SMILES=C


 38%|███▊      | 1298368/3378606 [01:24<01:56, 17911.52it/s]

Failed for molecule idx=2980688 SMILES=N


 39%|███▊      | 1306560/3378606 [01:24<01:52, 18357.14it/s]

Failed for molecule idx=2980855 SMILES=F
Failed for molecule idx=2984530 SMILES=C
Failed for molecule idx=2985105 SMILES=S
Failed for molecule idx=2985102 SMILES=O


 41%|████      | 1376192/3378606 [01:28<01:53, 17705.27it/s]

Failed for molecule idx=2265330 SMILES=C


 41%|████▏     | 1396672/3378606 [01:29<01:51, 17811.62it/s]

Failed for molecule idx=1736514 SMILES=P
Failed for molecule idx=1730492 SMILES=P


 42%|████▏     | 1404864/3378606 [01:30<01:48, 18109.16it/s]

Failed for molecule idx=1736513 SMILES=P
Failed for molecule idx=1734210 SMILES=Cl
Failed for molecule idx=1734544 SMILES=[NH]


 46%|████▌     | 1560512/3378606 [01:39<01:49, 16625.65it/s]

Failed for molecule idx=3117773 SMILES=C


 57%|█████▋    | 1941440/3378606 [02:01<02:14, 10660.95it/s]

Failed for molecule idx=1827685 SMILES=Cl


 63%|██████▎   | 2138048/3378606 [02:12<01:09, 17880.13it/s]

Failed for molecule idx=3175805 SMILES=N
Failed for molecule idx=3176384 SMILES=C
Failed for molecule idx=3176385 SMILES=C


 63%|██████▎   | 2142144/3378606 [02:12<01:08, 18141.98it/s]

Failed for molecule idx=3176386 SMILES=C
Failed for molecule idx=3176383 SMILES=C


 65%|██████▍   | 2191296/3378606 [02:15<01:02, 18859.25it/s]

Failed for molecule idx=137779 SMILES=P


 65%|██████▌   | 2211776/3378606 [02:16<01:00, 19136.32it/s]

Failed for molecule idx=3254373 SMILES=[BH]


 70%|███████   | 2379712/3378606 [02:26<00:55, 18099.20it/s]

Failed for molecule idx=2991141 SMILES=Cl


 77%|███████▋  | 2592704/3378606 [02:38<00:47, 16497.95it/s]

Failed for molecule idx=3208582 SMILES=C


 78%|███████▊  | 2645952/3378606 [02:41<00:39, 18756.99it/s]

Failed for molecule idx=3274341 SMILES=C
Failed for molecule idx=3274651 SMILES=C


 80%|███████▉  | 2691008/3378606 [02:43<00:36, 19010.78it/s]

Failed for molecule idx=1768820 SMILES=P


 84%|████████▍ | 2846656/3378606 [02:53<00:31, 16902.85it/s]

Failed for molecule idx=1662286 SMILES=C
Failed for molecule idx=1662363 SMILES=P


 84%|████████▍ | 2854848/3378606 [02:53<00:29, 17834.56it/s]

Failed for molecule idx=1665176 SMILES=S
Failed for molecule idx=1664201 SMILES=C
Failed for molecule idx=1664472 SMILES=C


 85%|████████▍ | 2858944/3378606 [02:53<00:28, 18168.65it/s]

Failed for molecule idx=1665177 SMILES=S


 87%|████████▋ | 2944960/3378606 [02:58<00:22, 19224.93it/s]

Failed for molecule idx=3138019 SMILES=O


 88%|████████▊ | 2961344/3378606 [02:59<00:22, 18372.98it/s]

Failed for molecule idx=1773339 SMILES=[BH]
Failed for molecule idx=1773342 SMILES=[NH]


 92%|█████████▏| 3116992/3378606 [03:08<00:15, 17243.38it/s]

Failed for molecule idx=2627608 SMILES=O


 93%|█████████▎| 3155694/3378606 [03:10<00:22, 9743.05it/s] 

Failed for molecule idx=2633315 SMILES=Br


 95%|█████████▌| 3219392/3378606 [03:14<00:08, 18201.75it/s]

Failed for molecule idx=1693988 SMILES=N


 97%|█████████▋| 3276736/3378606 [03:17<00:05, 18202.14it/s]

Failed for molecule idx=1838508 SMILES=N
Failed for molecule idx=1831559 SMILES=P


 97%|█████████▋| 3289024/3378606 [03:18<00:04, 18092.59it/s]

Failed for molecule idx=1841135 SMILES=S
Failed for molecule idx=1849751 SMILES=[NH]


100%|██████████| 3378606/3378606 [03:23<00:00, 16637.40it/s]


Unnamed: 0,I_a,I_b,I_c,IH_a,IH_b,IH_c,len_a,len_b,len_c,idx
0,1.801022,1.205763,0.703477,3.332717,2.230740,1.277720,0.785694,0.564729,0.180997,252506
1,0.848249,0.668207,0.180042,1.054542,0.906995,0.147547,0.781616,0.382377,0.000281,253520
2,1.723672,1.429719,0.451579,2.734154,2.413071,0.522670,0.860926,0.523788,0.239813,253300
3,1.479145,1.422294,0.184963,2.109702,2.044756,0.159232,0.976637,0.314264,0.220982,259104
4,0.799458,0.749364,0.148133,0.601163,0.524731,0.162983,0.673110,0.281897,0.213451,256366
...,...,...,...,...,...,...,...,...,...,...
3378521,2.039659,2.003402,0.355868,1.830906,1.700403,0.312575,1.006012,0.353471,0.361731,1175534
3378522,2.090909,1.976536,0.304126,4.400455,4.142253,0.713854,1.070125,0.503605,0.283449,1173024
3378523,1.033674,0.909330,0.271496,1.491752,1.324474,0.221520,0.769898,0.373006,0.267498,1177412
3378524,1.218129,1.101899,0.249071,1.317298,1.287553,0.284384,0.800118,0.361007,0.249172,1178625


In [9]:
# Load the raw table from RAW_CSV into a DataFrame.
raw_df = pd.read_csv(RAW_CSV)
# Bare expression: displays the frame as the cell output.
raw_df

Unnamed: 0,idx,smiles,homolumogap
0,0,O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C,3.047675
1,1,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966
2,2,C=CCN(C(=O)C)/C=C/c1ccccc1C,4.639541
3,3,C=CCN(C(=O)C)/C=C/c1ccccc1F,4.492600
4,4,C=CCN(C(=O)C)/C=C/c1ccccc1Cl,4.612330
...,...,...,...
3746615,3746615,CCn1cnc2c1ncnc2N,
3746616,3746616,O=N(=O)c1ccc(c(c1)N(=O)=O)Cl,
3746617,3746617,NCC(=O)COP(=O)(O)O,
3746618,3746618,C[C@@H](CN)O,


In [10]:
# Inner join on "idx": only rows whose idx appears in BOTH raw_df and df are kept,
# so raw rows without a matching entry in df are dropped from the result.
merged_df = raw_df.merge(df, how="inner", on=["idx"])
# Bare expression: displays the merged frame as the cell output.
merged_df

Unnamed: 0,idx,smiles,homolumogap,I_a,I_b,I_c,IH_a,IH_b,IH_c,len_a,len_b,len_c
0,0,O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C,3.047675,1.813462,1.601682,0.401664,1.510950,1.436966,0.173702,0.961845,0.452937,0.254786
1,1,COc1cc(OC)ccc1/C=C/N(C(=O)C)C,4.410966,2.308566,1.932109,0.403518,2.688305,1.975483,0.773145,1.053264,0.561198,0.126349
2,2,C=CCN(C(=O)C)/C=C/c1ccccc1C,4.639541,1.691103,1.338676,0.398564,1.848389,1.220168,0.714471,0.869384,0.583278,0.161675
3,3,C=CCN(C(=O)C)/C=C/c1ccccc1F,4.492600,1.753497,1.367217,0.411020,1.552199,1.053055,0.540122,0.875695,0.588322,0.126418
4,4,C=CCN(C(=O)C)/C=C/c1ccccc1Cl,4.612330,1.872057,1.467153,0.527033,1.661891,1.365148,0.350127,0.891042,0.562013,0.210463
...,...,...,...,...,...,...,...,...,...,...,...,...
3378521,3378601,Cc1ccc(c(c1)C)N[C@H](/C(=N\C1CC1)/O)C,5.347037,1.348275,1.288381,0.529860,2.074233,1.926117,0.849037,0.828846,0.462148,0.401456
3378522,3378602,C[C@@H](/C(=N\C1CC1)/O)Nc1cccc(c1C)C,5.809631,1.667726,1.532551,0.408586,2.339997,1.954943,0.658524,0.886473,0.473849,0.327503
3378523,3378603,C[C@H](/C(=N\C(=N)O)/O)Nc1cccc(c1C)C,5.064039,1.450947,1.335399,0.385102,1.659290,1.385724,0.449171,0.919293,0.435652,0.331783
3378524,3378604,C[C@@H](/C(=N\C(=N)O)/O)Nc1cccc(c1C)C,5.336153,1.459332,1.360086,0.370713,1.663606,1.398784,0.437598,0.925686,0.434268,0.347544


In [11]:
# Order rows by ascending "idx" before writing.
merged_df = merged_df.sort_values(by="idx")
# Output file lives under ROOT and carries the same "_micro" suffix as the inputs.
csv_path = os.path.join(ROOT, f"pcqm4m-v2-physics{micro_name}.csv.gz")
# index=False keeps the DataFrame index out of the file; with pandas' default
# compression="infer" the ".gz" suffix selects gzip output.
merged_df.to_csv(csv_path, index=False)