In [1]:
from openbabel import openbabel
import os
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from ogb.utils.url import decide_download, download_url, extract_zip
from shutil import make_archive, rmtree

In [2]:
# Configuration for this notebook: remote archive locations and local paths.

# URL handed to `download_url` by the download cell when SKIP_DOWNLOAD is False.
XYZ_URL = "http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2_xyz.zip"
# Directory the archive is downloaded to and extracted in.
ROOT = "."
# Path of the archive inside ROOT (same file name as in XYZ_URL).
XYZ_LOCAL_ZIP = f"{ROOT}/pcqm4m-v2_xyz.zip"
# XYZ_LOCAL_ZIP without its ".zip" extension; searched for .xyz files further down.
XYZ_LOCAL = os.path.splitext(XYZ_LOCAL_ZIP)[0]
# Name of the final SDF archive; it is split into folder name and archive format before the conversion.
SDF_ZIP = "pcqm4m-v2_sdf.zip"
# NOTE(review): not referenced by any of the visible cells; presumably the raw dataset archive.
RAW_URL = "http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2.zip"
SKIP_DOWNLOAD = True # Set to False to download and unzip the xyz archive

In [3]:
def get_files_from_dir(dir, ext):
    """
    Recursively list the files below `dir` whose name ends with `ext`.

    Parameters:
        dir: Directory to walk (all sub-directories are visited).
        ext: Suffix the file name must end with, e.g. ".xyz".

    Returns:
        A list of paths, each built by joining the directory in which the file
        was found with the file name. The order is the one produced by
        `os.walk`. The list is empty when nothing matches.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dir)
        for name in names
        if name.endswith(ext)
    ]

def convert_subfolders_mol_type(xyz_files, in_type, out_type, out_folder, to_zip_folder, delete_unzipped_folder=True):
    """
    Convert molecular files directory by directory, optionally zipping each result.

    The input files are grouped by their parent directory. Every group is
    converted in parallel with `convert_molecular_files` and written to
    `<out_folder>/<name of the input directory>`.

    Parameters:
        xyz_files: Paths of the files to convert.
        in_type: Input format identifier, forwarded to `convert_molecular_files`.
        out_type: Output format identifier, forwarded to `convert_molecular_files`;
            also used there as the extension of the written files.
        out_folder: Directory under which one sub-folder per input directory is created.
        to_zip_folder: If truthy, every output sub-folder whose conversion results
            sum to more than zero is archived as `<sub-folder>.zip` next to it.
        delete_unzipped_folder: If truthy (and `to_zip_folder` is truthy), the
            sub-folder is removed once its archive has been written.

    Returns:
        A list with one entry per input file (the value returned by
        `convert_molecular_files`), ordered by sorted input directory and, inside
        a directory, by the order of `xyz_files`. Empty when `xyz_files` is empty.

    Note:
        Input directories sharing the same base name map to the same output
        sub-folder.
    """
    # Group the inputs by parent directory in a single pass, keeping the input
    # order inside each group.
    files_by_dir = {}
    for file in xyz_files:
        files_by_dir.setdefault(os.path.dirname(file), []).append(file)

    all_results = []
    # Sorted iteration keeps the processing order deterministic.
    for this_dir in sorted(files_by_dir):
        print(this_dir)
        new_dir = os.path.join(out_folder, os.path.basename(this_dir))
        these_files = files_by_dir[this_dir]
        out = Parallel(n_jobs=-1)(delayed(convert_molecular_files)(file, new_dir, in_type, out_type) for file in tqdm(these_files))
        all_results.extend(out)

        # Only archive folders for which at least one conversion reported success.
        if to_zip_folder and sum(out) > 0:
            # Archive the *converted* folder (not the input directory), stored
            # relative to its parent so the zip contains `<sub-folder>/...`.
            archive_root = os.path.dirname(new_dir) or "."
            make_archive(base_name=new_dir, format="zip", root_dir=archive_root, base_dir=os.path.basename(new_dir))
            if delete_unzipped_folder:
                rmtree(new_dir)
    return all_results

def convert_molecular_files(filename, new_dir, in_type, out_type):
    """
    Convert one molecular file to another format with Open Babel.

    The output is written to `new_dir` under the input's base name with its
    extension replaced by `.{out_type}`. `new_dir` is created if missing.

    Parameters:
        filename: Path of the file to convert.
        new_dir: Directory that receives the converted file.
        in_type: Input format identifier given to `SetInAndOutFormats`.
        out_type: Output format identifier given to `SetInAndOutFormats`; also
            used as the output file extension.

    Returns:
        The value returned by `OBConversion.Convert()` (presumably the number of
        molecules converted; confirm against the Open Babel documentation).
    """
    stem, _ = os.path.splitext(os.path.basename(filename))
    target = os.path.join(new_dir, stem + f".{out_type}")
    # exist_ok lets repeated (and concurrent) calls share the same directory.
    os.makedirs(new_dir, exist_ok=True)

    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats(in_type, out_type)
    converter.OpenInAndOutFiles(filename, target)
    n_converted = converter.Convert()
    return n_converted

In [4]:
# Fetch the xyz archive into ROOT and unpack it there. Nothing happens while
# SKIP_DOWNLOAD is True, in which case XYZ_LOCAL must already exist on disk.
if not SKIP_DOWNLOAD:
    # presumably `download_url` returns the local path of the downloaded file; verify against ogb
    path = download_url(XYZ_URL, ROOT)
    extract_zip(path, ROOT)

In [5]:
# Collect the paths of all .xyz files below XYZ_LOCAL (the files themselves are
# not read here) and preview the first ten paths.
xyz_files = get_files_from_dir(XYZ_LOCAL, ext=".xyz")
print(xyz_files[:10])

['./pcqm4m-v2_xyz/00250000_00259999/254739.xyz', './pcqm4m-v2_xyz/00250000_00259999/251510.xyz', './pcqm4m-v2_xyz/00250000_00259999/255885.xyz', './pcqm4m-v2_xyz/00250000_00259999/250663.xyz', './pcqm4m-v2_xyz/00250000_00259999/259108.xyz', './pcqm4m-v2_xyz/00250000_00259999/254636.xyz', './pcqm4m-v2_xyz/00250000_00259999/259202.xyz', './pcqm4m-v2_xyz/00250000_00259999/252969.xyz', './pcqm4m-v2_xyz/00250000_00259999/253559.xyz', './pcqm4m-v2_xyz/00250000_00259999/259909.xyz']


In [6]:
# Convert every .xyz file to .sdf, writing into a folder named after SDF_ZIP
# without its extension. Per-folder zipping is disabled (to_zip_folder=False).
print("This will take many minutes to complete!")
# Split "name.zip" into the output folder name and ".zip".
# NOTE(review): `format` shadows the builtin of the same name for the rest of
# the notebook; it is read again by the final make_archive cell.
base_name, format = os.path.splitext(SDF_ZIP)
out = convert_subfolders_mol_type(xyz_files, 
        in_type="xyz", out_type="sdf", out_folder=base_name, to_zip_folder=False)

This will take many minutes to complete!


In [None]:
# Pack the converted folder into a single archive in the current directory.
# `format` is the extension taken from SDF_ZIP, with its leading dot stripped here.
make_archive(base_name=base_name, format=format[1:], root_dir=".", base_dir=base_name)