# Uni-Mol Molecular Representation

This is the older version of the molecular representation generation method. It is kept here because it is easy to customize.

In [1]:
import os
import numpy as np
import pandas as pd
import lmdb
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import pickle
import glob

### Load peptide library and SMILES list

In [2]:
# In-silico peptide library: every sequence is built from five amino acids
# (Val, Leu, Gly, Ala, Pro).
# Alternative library file: './peplib_5aa_aliphatic.smi'
smi_list = []
pep_lib = Chem.SmilesMolSupplier('./Pep_lib_VPGLA.smi', delimiter='\t')

# RDKit-generated SMILES, one per library entry.
pep_smi = list(map(Chem.MolToSmiles, pep_lib))
# Peptide sequence of each entry, read from its 'sequence' property.
pep_seq = [entry.GetProp('sequence') for entry in pep_lib]

print(len(pep_smi))

625


In [3]:
# In-silico peptide library: every sequence is built from six amino acids
# (Val, Leu, Gly, Ala, Pro, D-Pro).
smi_list2 = []
pep_lib2 = Chem.SmilesMolSupplier('./Pep_lib_VPpGLA.smi', delimiter='\t')

# RDKit-generated SMILES, one per library entry.
pep_smi2 = list(map(Chem.MolToSmiles, pep_lib2))
# Peptide sequence of each entry, read from its 'sequence' property.
pep_seq2 = [entry.GetProp('sequence') for entry in pep_lib2]

print(len(pep_smi2))

1296


### Generate conformations from SMILES and save to .lmdb

In [4]:
def get_structure(mol, n_confs, seed=-1):
    """Embed several 3D conformers and keep the lowest-energy one after MMFF.

    Args:
        mol: RDKit molecule; explicit hydrogens are added here.
        n_confs: number of conformers requested from the embedder.
        seed: random seed forwarded to ``EmbedMultipleConfs``. The default
            ``-1`` is RDKit's own default (unseeded), so callers that do not
            pass a seed keep the previous non-deterministic behaviour.

    Returns:
        A copy of the hydrogen-added molecule carrying a single conformer:
        the one with the lowest energy reported by the MMFF optimisation.

    Raises:
        ValueError: if ``mol`` is None or no conformer could be embedded.
    """
    if mol is None:
        raise ValueError("get_structure received None instead of an RDKit molecule")

    mol = Chem.AddHs(mol)
    # Copy taken before embedding, so it only ever receives the winning conformer.
    new_mol = Chem.Mol(mol)

    # EmbedMultipleConfs returns the IDs of the conformers it managed to build.
    conf_ids = list(
        AllChem.EmbedMultipleConfs(
            mol,
            numConfs=n_confs,
            randomSeed=seed,
            useExpTorsionAnglePrefs=True,
            useBasicKnowledge=True,
            numThreads=0,
        )
    )
    if not conf_ids:
        raise ValueError("RDKit could not embed any conformer for this molecule")

    # One (not_converged, energy) pair per conformer, in conformer order.
    results = AllChem.MMFFOptimizeMoleculeConfs(
        mol, maxIters=2000, nonBondedThresh=100.0, numThreads=0
    )
    energies = [energy for _, energy in results]
    # First index holding the minimum energy (same tie-break as list.index(min(...))).
    best = min(range(len(energies)), key=energies.__getitem__)

    # Fetch by real conformer ID instead of assuming ID == list position.
    new_mol.AddConformer(mol.GetConformer(conf_ids[best]))

    return new_mol


def smi2coords(smi, seed, n_confs=50):
    """Build a 3D structure for ``smi`` and serialise it as one LMDB record.

    Args:
        smi: SMILES string of the molecule.
        seed: random seed passed to the conformer embedder so that repeated
            runs start from the same embedded conformers.
        n_confs: number of conformers to generate before keeping the
            lowest-energy one. Larger values are slower.

    Returns:
        bytes: pickle (highest protocol) of a dict with keys ``'atoms'``
        (element symbols, hydrogens included), ``'coordinates'`` (a
        one-element list holding the float32 atom positions) and ``'smi'``.

    Raises:
        ValueError: if the SMILES cannot be parsed or no conformer can be
            embedded for it.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError("RDKit could not parse SMILES: {}".format(smi))

    new_mol = get_structure(mol, n_confs, seed=seed)
    atoms = [atom.GetSymbol() for atom in new_mol.GetAtoms()]
    coordinates = new_mol.GetConformer().GetPositions()

    assert len(atoms) == len(coordinates), "coordinates shape is not align with {}".format(smi)
    coordinate_list = [coordinates.astype(np.float32)]
    return pickle.dumps({'atoms': atoms, 'coordinates': coordinate_list, 'smi': smi}, protocol=-1)

def write_lmdb(smiles_list, job_name, seed=42, outpath='./results'):
    """Generate one conformer record per SMILES and store them in an LMDB file.

    The database is written to ``<outpath>/<job_name>.lmdb``. Any existing
    file at that path is deleted first. Keys are the ASCII-encoded position
    of each SMILES in ``smiles_list``; values are the bytes returned by
    ``smi2coords``. All records go into a single write transaction, so if any
    molecule fails nothing is committed.

    Args:
        smiles_list: iterable of SMILES strings.
        job_name: file stem of the LMDB file.
        seed: seed forwarded to ``smi2coords`` for every molecule.
        outpath: output directory; created if missing.
    """
    os.makedirs(outpath, exist_ok=True)
    output_name = os.path.join(outpath, '{}.lmdb'.format(job_name))

    # Start from a clean file. A missing file is the only expected failure;
    # anything else (e.g. permissions) should surface instead of being hidden.
    try:
        os.remove(output_name)
    except FileNotFoundError:
        pass

    env_new = lmdb.open(
        output_name,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=1,
        map_size=int(100e9),
    )
    try:
        # The transaction context manager commits on normal exit and aborts
        # if an exception escapes the loop.
        with env_new.begin(write=True) as txn_write:
            # Wrap the list itself (not enumerate) so tqdm can see its length
            # and show a real progress bar with an ETA.
            for i, smiles in enumerate(tqdm(smiles_list)):
                inner_output = smi2coords(smiles, seed=seed)
                txn_write.put(f"{i}".encode("ascii"), inner_output)
    finally:
        # Always release the environment, even when a molecule fails.
        env_new.close()

In [5]:
# Run configuration (these names are also expanded as $name in the shell cell below)

seed = 42
job_name = 'VPGLA_repr'  # replace with your custom name
data_path = './Reprs'  # replace with your data path
weight_path = './Model/mol_pre_no_h_220816.pt'  # replace with your checkpoint path
# NOTE(review): the Uni-Mol README documents only_polar as "-1 all h; 0 no h",
# i.e. 0 removes hydrogens, which matches the "no_h" checkpoint above.
# TODO confirm against the Uni-Mol version in ./Model.
only_polar = 0
dict_name = 'dict.txt'
batch_size = 16
results_path = data_path  # replace with your save path

# Generate conformers for the 5-amino-acid library and write <data_path>/<job_name>.lmdb
write_lmdb(pep_smi, job_name=job_name, seed=seed, outpath=data_path)

625it [05:48,  1.79it/s]


### Infer from ckpt

In [6]:
# NOTE: inference currently supports a single GPU only; the CUDA_VISIBLE_DEVICES="0"
# prefix on the command below restricts it to GPU 0.
# The $name placeholders are the Python variables set in the configuration cell
# (dict_name, data_path, job_name, results_path, batch_size, weight_path, only_polar).
# Step 1: copy the dictionary file into the data directory.
!cp ./Model/molecule/$dict_name $data_path
# Step 2: run the Uni-Mol inference script on the $job_name subset found in $data_path.
!CUDA_VISIBLE_DEVICES="0" python ./Model/unimol/infer.py --user-dir ./Model/unimol $data_path --valid-subset $job_name \
       --results-path $results_path \
       --num-workers 8 --ddp-backend=c10d --batch-size $batch_size \
       --task unimol --loss unimol_infer --arch unimol_base \
       --path $weight_path \
       --fp16 --fp16-init-scale 4 --fp16-scale-window 256 \
       --only-polar $only_polar --dict-name $dict_name \
       --log-interval 50 --log-format simple --random-token-prob 0 --leave-unmasked-prob 1.0 --mode infer

fused_multi_tensor is not installed corrected
fused_rounding is not installed corrected
fused_layer_norm is not installed corrected
fused_softmax is not installed corrected
2024-03-20 15:22:04 | INFO | numexpr.utils | Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-03-20 15:22:04 | INFO | numexpr.utils | NumExpr defaulting to 8 threads.
2024-03-20 15:22:04 | INFO | unimol.inference | loading model(s) from ./Model/mol_pre_no_h_220816.pt
2024-03-20 15:22:04 | INFO | unimol.tasks.unimol | dictionary: 30 types
2024-03-20 15:22:06 | INFO | unimol.inference | Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.999)', adam_eps=1e-08, all_gather_list_size=16384, allreduce_fp32_grad=False, arch='unimol_base', attention_dropout=0.1, batch_size=16, batch_size_valid=16, bf16=False, bf16_sr=False, broadcast_buffers=False, bucket_cap_mb=25, cpu=False, curriculum=0, data='./Reprs', data_buffer_size=10, ddp_backend='c10d', d

### Read .pkl and save results to .csv

In [8]:
def get_csv_results(predict_path, results_path, verbose=True):
    """Flatten batched inference output into ``<results_path>/mol_repr.csv``.

    Args:
        predict_path: path to the pickle produced by the inference step. It is
            read as a sequence of batch dicts with the keys ``"bsz"``,
            ``"smi_name"``, ``"mol_repr_cls"`` and ``"pair_repr"``.
        results_path: directory that receives ``mol_repr.csv``.
        verbose: when True (default, the previous behaviour) print the first
            batch as a preview. Pass False to keep notebook output short.

    The CSV has one row per molecule with the columns ``SMILES``,
    ``mol_repr`` and ``pair_repr``. An empty prediction file yields a CSV
    containing only the header.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # produced by your own inference run.
    predict = pd.read_pickle(predict_path)
    # Guard the preview so an empty prediction list no longer raises IndexError.
    if verbose and len(predict) > 0:
        print(predict[0])

    smi_list, mol_repr_list, pair_repr_list = [], [], []
    for batch in predict:
        rows = range(batch["bsz"])
        smi_list.extend(batch["smi_name"][i] for i in rows)
        mol_repr_list.extend(batch["mol_repr_cls"][i].tolist() for i in rows)
        # NOTE(review): pair_repr entries stay as arrays, so the CSV stores
        # their string form, which NumPy abbreviates with "..." for large
        # arrays. If pair_repr is needed downstream, save it in a binary
        # format instead. TODO confirm whether anyone reads this column.
        pair_repr_list.extend(batch["pair_repr"][i] for i in rows)

    predict_df = pd.DataFrame({"SMILES": smi_list, "mol_repr": mol_repr_list, "pair_repr": pair_repr_list})
    predict_df.to_csv(os.path.join(results_path, 'mol_repr.csv'), index=False)

# Locate the prediction pickle for this job in the results directory.
# Sorting makes the choice deterministic if several files match.
pkl_matches = sorted(glob.glob(f'{results_path}/*_{job_name}.out.pkl'))
if not pkl_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No '*_{job_name}.out.pkl' file found in {results_path}; run the inference cell first."
    )
pkl_path = pkl_matches[0]
get_csv_results(pkl_path, results_path)

{'mol_repr_cls': array([[-1.179   , -0.0422  , -0.3052  , ..., -0.4775  ,  0.86    ,
        -0.0231  ],
       [-1.384   , -0.01617 , -0.1293  , ..., -0.654   ,  0.825   ,
        -0.09863 ],
       [-1.283   , -0.1232  , -0.2266  , ..., -0.5356  ,  0.84    ,
         0.002018],
       ...,
       [-1.094   , -0.0784  , -0.314   , ..., -0.6196  ,  0.7954  ,
        -0.12213 ],
       [-1.126   , -0.06256 , -0.5347  , ..., -0.2234  ,  0.843   ,
        -0.07367 ],
       [-1.183   ,  0.02124 , -0.2712  , ..., -0.549   ,  0.763   ,
        -0.1212  ]], dtype=float16), 'pair_repr': [array([[[ 16.7    ,   4.535  ,   3.828  , ...,   7.58   ,   3.059  ,
          -6.973  ],
        [ -0.4077 ,   0.5576 ,   2.656  , ...,   2.453  ,  -1.161  ,
          -0.4326 ],
        [ -0.1196 ,   1.033  ,   3.37   , ...,   1.601  ,   0.1242 ,
          -1.202  ],
        ...,
        [ -0.05078,   0.3035 ,   3.346  , ...,   1.355  ,  -0.738  ,
          -0.2642 ],
        [ -0.706  ,   0.557  ,  -4.01  