In [1]:

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import pyarrow as pa
import multiprocessing

In [2]:
# Load the PubChem table from an Arrow IPC file through a memory map.
# (Disabled alternative kept for reference: read the stream-format file
# '/home/lyg/data/pubchem/arrow/pubchem_sorted.arrow' with pa.input_stream(...)
# followed by pa.ipc.open_stream(...).read_all().)
table = pa.ipc.RecordBatchFileReader(
    pa.memory_map('/home/lyg/data/pubchem/arrow/pubchem_best.arrow')
).read_all()

# Materialise the SMILES column as a plain Python list; it is the input of
# the fingerprinting worker pool further down.
smiles = table['smiles'].to_pylist()

# Peek at the first entry as a quick sanity check.
smiles[0]

'N'

In [3]:
# Morgan fingerprint settings: neighbourhood radius and fingerprint length in bits.
radius = 2
nBits = 2048

# A single generator instance, built once at module level and reused by
# smiles_to_fp for every molecule.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=radius,
    fpSize=nBits,
)

def smiles_to_fp(smiles):
  """
  Turn one SMILES string into a fingerprint (multiprocessing worker).

  The string is parsed with Chem.MolFromSmiles. When parsing succeeds the
  fingerprint comes from the module-level generator `mfpgen`; when it fails
  (MolFromSmiles returns None) an empty ExplicitBitVect of length `nBits`
  is returned instead, so the result list stays aligned with the input list.
  """
  parsed = Chem.MolFromSmiles(smiles)
  if parsed is not None:
    return mfpgen.GetFingerprint(parsed)
  # Invalid SMILES: fall back to an empty fingerprint of the same length.
  return DataStructs.cDataStructs.ExplicitBitVect(nBits)

# Fingerprint every SMILES string in parallel using a pool with the default
# number of worker processes. The result list `arr` is consumed positionally
# by later cells (arr[i] is looked up with the same index as smiles[i]).
with multiprocessing.Pool() as pool:
    arr = pool.map(smiles_to_fp, smiles)

[09:47:33] Explicit valence for atom # 0 B, 4, is greater than permitted
[09:47:34] Explicit valence for atom # 2 O, 2, is greater than permitted
[09:47:35] Explicit valence for atom # 0 B, 4, is greater than permitted
[09:47:35] Explicit valence for atom # 0 B, 4, is greater than permitted
[09:47:37] Conflicting single bond directions around double bond at index 6.
[09:47:37]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:47:37] Explicit valence for atom # 10 O, 2, is greater than permitted
[09:47:37] Conflicting single bond directions around double bond at index 3.
[09:47:37]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:47:38] Conflicting single bond directions around double bond at index 8.
[09:47:38]   BondStereo set to STEREONONE and single bond directions set to NONE.
[09:47:38] Conflicting single bond directions around double bond at index 8.
[09:47:38]   BondStereo set to STEREONONE and single bond directions set to NONE

In [4]:
# Sanity check: exactly one fingerprint per SMILES string and per table row.
assert len(arr) == len(smiles) == table.num_rows

In [6]:
# Read the cluster picks from pubchem_best_cluster_picks_raw.txt into a list.
# The file holds one integer per line; the values are used below as row
# indices into `arr` / `table`.
with open('/home/lyg/data/pubchem/arrow/pubchem_best_cluster_picks_raw.txt', 'r') as f:
    # Skip blank lines (e.g. a trailing empty line) instead of failing in int('').
    # int() itself tolerates the surrounding whitespace / newline.
    picks = [int(line) for line in f if line.strip()]


In [17]:
def find_similar_index(args, min_atom_ratio=0.83, max_atom_ratio=1.17,
                       min_similarity=0.45, max_similarity=0.65):
    """
    Find the first molecule that is moderately similar to a given pick.

    Takes a single tuple so it can be used directly with ``pool.map``; the
    thresholds are keyword arguments (bind them with ``functools.partial``
    when different values are needed in a pool).

    Args:
        args: 5-tuple ``(pick_idx, pick_fp, pick_num_atoms, arr, num_atoms_arr)``
            where ``pick_idx`` is the index of the pick inside ``arr``,
            ``pick_fp`` its fingerprint, ``pick_num_atoms`` its atom count,
            ``arr`` the sequence of all fingerprints and ``num_atoms_arr``
            the atom counts, indexed like ``arr``.
        min_atom_ratio: candidates need at least
            ``pick_num_atoms * min_atom_ratio`` atoms (inclusive).
        max_atom_ratio: candidates need at most
            ``pick_num_atoms * max_atom_ratio`` atoms (inclusive).
        min_similarity: inclusive lower bound on the Tanimoto similarity.
        max_similarity: exclusive upper bound on the Tanimoto similarity.

    Returns:
        The index of the first entry of ``arr`` (in iteration order, the pick
        itself excluded) that passes both filters, or ``None`` if there is none.
    """
    pick_idx, pick_fp, pick_num_atoms, arr, num_atoms_arr = args
    lower = pick_num_atoms * min_atom_ratio
    upper = pick_num_atoms * max_atom_ratio
    for idx, fp in enumerate(arr):
        # Never pair a pick with itself.
        if idx == pick_idx:
            continue
        # Cheap atom-count filter first, so the similarity is only computed
        # for candidates of comparable size.
        if not (lower <= num_atoms_arr[idx] <= upper):
            continue
        sim = DataStructs.TanimotoSimilarity(pick_fp, fp)
        if min_similarity <= sim < max_similarity:
            # The first hit wins; the scan stops here.
            return idx
    return None

# Atom counts as a Python list, indexed by table row like `arr`.
num_atoms_arr = table['num_atoms'].to_pylist()

# One task tuple per pick, in the layout find_similar_index unpacks.
# NOTE(review): every tuple references the full `arr` and `num_atoms_arr`, so
# these are pickled and sent to the workers together with the tasks; consider
# sharing them through worker globals / a Pool initializer if this step is slow.
args_list = [
    (pick_idx, arr[pick_idx], num_atoms_arr[pick_idx], arr, num_atoms_arr)
    for pick_idx in picks
]

with multiprocessing.Pool(processes=3) as pool:
    similar_indices = pool.map(find_similar_index, args_list)

# Picks without a partner come back as None; keep only the found indices.
similar_indices = [idx for idx in similar_indices if idx is not None]

# Union of picks and partners without duplicates. Sorted so the row order of
# everything derived from `merged` is deterministic and follows the table
# order instead of the arbitrary iteration order of a set.
merged = sorted(set(picks).union(similar_indices))

In [14]:
# Display the names of all columns in the loaded table.
table.column_names

['cid',
 'complexity',
 'hba',
 'hbd',
 'rotatable_bonds',
 'tpsa',
 'logp',
 'monoisotopic_mass',
 'exact_mass',
 'formula',
 'molecular_weight',
 'charge',
 'num_atoms',
 'num_def_stereo',
 'num_undef_stereo',
 'num_def_double',
 'num_undef_double',
 'num_isotopic_atoms',
 'fragments',
 'num_tautomers',
 'num_complexity',
 'iupac_openeye',
 'iupac_cas',
 'iupac',
 'iupac_systematic',
 'iupac_traditional',
 'smiles',
 'split']

In [None]:
# with open('cluster_labels.txt', 'w') as f:
#     for i in picks:
#         smiles = table['smiles'][i].as_py()
#         iupac = table['iupac'][i].as_py()
#         cid = table['cid'][i].as_py()
#         formula = table['formula'][i].as_py()
#         num_atoms = table['num_atoms'][i].as_py()
#         # f.write(f"{smiles}\t{iupac}\t{labels[i]}\n")
#         f.write(f"{cid}\t{smiles}\t{iupac}\t{formula}\t{num_atoms}\n")

def convert_string_view_to_string(table):
    """
    Return a copy of ``table`` in which every string_view column is a string column.

    Columns are addressed by position rather than by name, so tables with
    duplicate column names are handled too. Each replaced column keeps its
    original field (name, nullability, metadata) with only the type swapped
    to ``pa.string()``. Columns of any other type are left untouched.

    Args:
        table: a pyarrow Table.

    Returns:
        A pyarrow Table with the same columns in the same order; the input
        table object itself is not modified (set_column returns a new table).
    """
    # table.schema is evaluated once here, so rebinding `table` inside the
    # loop does not disturb the iteration; positions stay valid because
    # set_column replaces a column in place.
    for position, field in enumerate(table.schema):
        if not pa.types.is_string_view(field.type):
            continue
        # NOTE(review): the values take a round trip through Python objects;
        # a direct cast to pa.string() may be cheaper if the installed
        # pyarrow supports casting string_view - confirm before switching.
        values = table.column(position).to_pylist()
        table = table.set_column(
            position,
            field.with_type(pa.string()),
            pa.array(values, type=pa.string()),
        )
    return table

# Pick the merged rows out of the full table and convert string_view columns
# in the same step: huggingface only supports string, not string_view.
table2 = convert_string_view_to_string(table.take(pa.array(merged)))

# Write the selection to disk in the Arrow file format.
with pa.OSFile('/home/lyg/data/pubchem/arrow/pubchem_best_cluster_similar.arrow', 'wb') as sink, \
        pa.RecordBatchFileWriter(sink, table2.schema) as writer:
    writer.write_table(table2)
print(f"Finished writing {len(table2)} picks to pubchem_best_cluster_similar.arrow")

Finished writing 474603 picks to pubchem_best_cluster_similar.arrow


In [19]:
# Save the indices held in `merged` as text, one index per line.
with open('/home/lyg/data/pubchem/arrow/pubchem_best_cluster_similar_picks.txt', 'w') as f:
    f.writelines(f"{i}\n" for i in merged)