In [9]:
import os
import cv2
import json
#import rdkit
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem.Draw.MolDrawing import DrawingOptions
from rdkit.Chem import Draw

import cairosvg
import subprocess

ModuleNotFoundError: No module named 'rdkit'

In [None]:
def save_comp_imgs_from_smiles(tar_id, comp_id, smiles, rot=0, SIZE=200, rot_size=300):
    """Render a SMILES string to a white-padded, optionally rotated PNG.

    The molecule is drawn as a SIZE x SIZE SVG, converted to PNG, pasted into
    the centre of a white rot_size x rot_size canvas (so that rotating does not
    cut off the drawing) and finally rotated by ``rot`` degrees around the
    canvas centre. The result is written to
    ``<prediction_files_path>/target_prediction_dataset/<tar_id>/imgs/<comp_id>.png``.
    ``prediction_files_path`` is read from module scope.

    Parameters
    ----------
    tar_id : str
        Name of the target sub-directory that contains the ``imgs`` folder.
    comp_id : str
        File stem of the image to write.
    smiles : str
        SMILES string of the compound to draw.
    rot : int or float, default 0
        Rotation angle in degrees passed to ``cv2.getRotationMatrix2D``;
        0 skips the rotation step.
    SIZE : int, default 200
        Edge length in pixels of the molecule drawing.
    rot_size : int, default 300
        Edge length in pixels of the padded output canvas; must be >= SIZE.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed or ``rot_size`` is smaller than ``SIZE``.
    IOError
        If the intermediate PNG cannot be read back or the result cannot be
        written.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None, not by raising.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    if rot_size < SIZE:
        raise ValueError("rot_size ({}) must be >= SIZE ({})".format(rot_size, SIZE))

    # NOTE(review): these are class-level (global) settings on DrawingOptions;
    # confirm the installed RDKit's SVG backend of MolToFile honours them.
    DrawingOptions.atomLabelFontSize = 55
    DrawingOptions.dotsPerAngstrom = 100
    DrawingOptions.bondLineWidth = 1.5

    img_dir = os.path.join(prediction_files_path, "target_prediction_dataset", tar_id, "imgs")
    path_to_give_svg = os.path.join(img_dir, "{}.svg".format(comp_id))
    path_to_give_png = os.path.join(img_dir, "{}.png".format(comp_id))

    # Draw as SVG first and rasterise with cairosvg for a higher-quality image.
    Draw.MolToFile(mol, path_to_give_svg, size=(SIZE, SIZE))
    try:
        cairosvg.svg2png(url=path_to_give_svg, write_to=path_to_give_png)
    finally:
        # The SVG is only an intermediate; remove it even if the conversion failed.
        if os.path.exists(path_to_give_svg):
            os.remove(path_to_give_svg)

    image = cv2.imread(path_to_give_png)
    if image is None:
        # cv2.imread returns None instead of raising when the file is unreadable.
        raise IOError("Could not read rendered image: {}".format(path_to_give_png))

    # Paste the drawing into the centre of a larger white canvas so that no
    # part of it is lost when the canvas is rotated.
    white_color = (255, 255, 255)
    full_image = np.full((rot_size, rot_size, 3), white_color, dtype=np.uint8)
    offset = (rot_size - SIZE) // 2
    full_image[offset:offset + SIZE, offset:offset + SIZE] = image

    if rot != 0:
        center = (rot_size // 2, rot_size // 2)
        M = cv2.getRotationMatrix2D(center, rot, 1.0)
        # INTER_LINEAR is an interpolation flag, not a border mode; a constant
        # border is required for borderValue (white) to take effect.
        full_image = cv2.warpAffine(full_image, M, (rot_size, rot_size),
                                    flags=cv2.INTER_LINEAR,
                                    borderMode=cv2.BORDER_CONSTANT,
                                    borderValue=white_color)

    # Overwrite the intermediate PNG with the padded/rotated result.
    if not cv2.imwrite(path_to_give_png, full_image):
        raise IOError("Could not write image: {}".format(path_to_give_png))

In [2]:
# Target identifier: names the per-target sub-directory under
# target_prediction_dataset/ and is passed as `tar_id` when rendering images.
protein_name = "CHEMBL4282_all"

In [3]:
# IPython automagic (%pwd): displays the kernel's current working directory.
pwd

'/home/hayriye/DEEPScreen2.0/chembl_31'

In [4]:
# Absolute, machine-specific root directory under which the
# target_prediction_dataset/ and molecule_smiles_dataset/ folders live.
# save_comp_imgs_from_smiles also reads this name as a global.
prediction_files_path = "/home/hayriye/DEEPScreen2.0/prediction_files/"

In [5]:
# Directory that holds one sub-directory per target. Built with os.path.join
# so it is correct whether or not prediction_files_path ends with a separator;
# the trailing "" keeps the final separator of the original value, so code that
# appends names by string concatenation still works.
target_prediction_dataset_path = os.path.join(prediction_files_path, "target_prediction_dataset", "")

In [6]:
# Directory containing the SMILES input files. Built with os.path.join so it is
# correct whether or not prediction_files_path ends with a separator; the
# trailing "" keeps the final separator of the original value, so code that
# appends file names by string concatenation still works.
smiles_path = os.path.join(prediction_files_path, "molecule_smiles_dataset", "")

In [12]:
# Full path of the molecule file to process. os.path.join gives a valid path
# whether or not smiles_path ends with a separator.
smiles_file = os.path.join(smiles_path, "filtered_all_generated_molecules.csv")

In [14]:
# Load the molecule table from the ';'-separated file, report its row count and
# preview the first rows.
# NOTE(review): the recorded preview here shows columns MOL_ID / SMILES and
# 43340 rows, while later recorded outputs of all_df show SMILES_x and 14737
# rows -- all_df appears to have been reassigned by a cell that is not in the
# notebook; confirm before relying on a fresh run.
all_df = pd.read_csv(smiles_file, sep=";")
print(len(all_df))
all_df.head()

43340


Unnamed: 0,MOL_ID,SMILES
0,MOL_01_000001,O=C(O)C(C=1)=Nc(c12)ncc3c2=NN(C3=O)c4ccccc4
1,MOL_01_000002,N1=CC(C)=c(c12)cc3c(c2C)=NC(=O)C(=C3C)CC(=O)NC...
2,MOL_01_000003,CC1CCC(CC1)NC(=O)C(C)N(N=C2C)C(=O)C(C23C)=NN3c...
3,MOL_01_000004,c1ccccc1C2C(=N\c(c3)ccc(c34)O4)\N(C(=O)CC2C(=O...
4,MOL_01_000005,n1cccc(c12)cccc2C(=O)N(C3)CCC(C34)CCc5c(O4)c6c...


In [39]:
# Inspect the row index of the frame currently bound to all_df (shows its length).
all_df.index

RangeIndex(start=0, stop=14737, step=1)

In [45]:
# Peek at the first row to see the column names and one example record.
all_df.iloc[0]

MOL_ID                                      MOL_01_000779
SMILES_x    CC1(C)CN(C(O)Cc2nc(O)cc(N3CCOCC3)n2)c2ccccc21
Name: 0, dtype: object

In [38]:
# Prepare the output location for this target:
#   <target_prediction_dataset_path>/<protein_name>/imgs/            (image folder)
#   <target_prediction_dataset_path>/<protein_name>/prediction_dict.json
target_dir = os.path.join(target_prediction_dataset_path, protein_name)

# exist_ok=True replaces the check-then-create pattern and makes the cell
# safely re-runnable.
os.makedirs(os.path.join(target_dir, "imgs"), exist_ok=True)

# (Re)initialise the prediction file with an empty "prediction" list.
json_dict = {"prediction": list()}
json_object = json.dumps(json_dict)

# The context manager closes the file even if the write fails.
with open(os.path.join(target_dir, "prediction_dict.json"), "w") as f:
    f.write(json_object)

In [46]:
from tqdm import tqdm

# Render every compound in all_df: one unrotated image plus one image per
# rotation angle (10..350 degrees in 10-degree steps, i.e. 36 images/compound).
total_image_count = 0
angle_list = [str(angle) for angle in range(10, 360, 10)]

# The SMILES column is named "SMILES" in the CSV as loaded, but "SMILES_x" in
# the frame this loop was last run on. Accept either, so that the cell does not
# fail with a KeyError on a fresh kernel.
smiles_column = "SMILES_x" if "SMILES_x" in all_df.columns else "SMILES"

# Iterate over the two columns directly instead of two .iloc row look-ups per
# iteration; total= keeps the progress bar length.
compound_rows = zip(all_df["MOL_ID"], all_df[smiles_column])

for i, (current_compound_id, current_smiles) in enumerate(
        tqdm(compound_rows, total=len(all_df), desc="Predictions…", ascii=False, ncols=120)):

    try:
        save_comp_imgs_from_smiles(protein_name, current_compound_id, current_smiles)
        total_image_count += 1

        for angle in angle_list:
            save_comp_imgs_from_smiles(protein_name, current_compound_id + "_" + angle,
                                       current_smiles, int(angle))
            total_image_count += 1

    except Exception as e:
        # Best effort: report the failing compound and continue with the next
        # one. Images already written for this compound are left in place.
        print(e, current_compound_id, current_smiles)

    # Periodic progress marker showing which compound is being processed.
    if i % 1000 == 0:
        print(current_compound_id, current_smiles)

print(total_image_count)

Predictions…:   0%|                                                                 | 1/14737 [00:01<6:39:16,  1.63s/it]

MOL_01_000779 CC1(C)CN(C(O)Cc2nc(O)cc(N3CCOCC3)n2)c2ccccc21


Predictions…:   7%|████▏                                                         | 1001/14737 [26:32<5:53:37,  1.54s/it]

MOL_01_025057 Nc1c2cc(C(N)Cc3ccc(F)cc3F)cc-2nc2ccccn12


Predictions…:  14%|████████▍                                                     | 2001/14737 [56:22<6:10:48,  1.75s/it]

MOL_01_026065 Cc1cccc(N2CCN(C(=O)CN3C(=O)N=C4C=CC=C4C3=O)CC2C)c1


Predictions…:  20%|████████████▏                                               | 3001/14737 [1:23:58<4:41:35,  1.44s/it]

MOL_01_027075 NC(=O)CCCCC(=O)c1ccc(CN2C(=O)N=C3C=CC=C3C2=O)cc1


Predictions…:  27%|████████████████▎                                           | 4001/14737 [1:49:20<4:30:48,  1.51s/it]

MOL_01_028080 CC(C)N1C(=N)N(c2ccc(CCc3ccccc3)cn2)CC1(C)C


Predictions…:  34%|████████████████████▎                                       | 5001/14737 [2:14:16<4:16:30,  1.58s/it]

MOL_01_029085 Cc1nc(CCC(N)Nc2ccccc2Cl)cc(-c2ccccc2)n1


Predictions…:  41%|████████████████████████▍                                   | 6001/14737 [2:39:42<4:06:06,  1.69s/it]

MOL_01_030086 Cc1cc(Cl)cc(NC(=N)Cc2ccc(NC3(C(=N)N)CCC(C)C3)cc2)c1


Predictions…:  48%|████████████████████████████▌                               | 7001/14737 [3:08:48<4:13:44,  1.97s/it]

MOL_01_031090 N=C(NCc1ccc(Cl)cc1)NNC(=N)C1CN(N)Cc2ccccc21


Predictions…:  54%|████████████████████████████████▌                           | 8001/14737 [3:37:22<2:59:40,  1.60s/it]

MOL_01_032090 N=C(Nc1cccnc1Oc1ccc(ON)cc1)N1CCC(N)C1


Predictions…:  61%|████████████████████████████████████▋                       | 9001/14737 [4:01:13<2:47:45,  1.75s/it]

MOL_01_037543 CCC1(NC)*c2ccc(Cl)cc2C(CC(=O)Nc2cnc3c(c2)CC(=O)C(OCO)C3)C1


Predictions…:  68%|████████████████████████████████████████                   | 10001/14737 [4:29:37<2:18:27,  1.75s/it]

MOL_01_038705 Nc1cccc(C(=O)CC2CC(c3ncncc3F)c3ccccc3N(N)C2=O)c1


Predictions…:  75%|████████████████████████████████████████████               | 11001/14737 [4:58:03<1:45:11,  1.69s/it]

MOL_01_039763 NC(CCNC(=O)c1cnc2ccccc2c1)CN1CCN(c2ccnc3ccccc23)CN1


Predictions…:  81%|████████████████████████████████████████████████           | 12001/14737 [5:24:41<1:05:49,  1.44s/it]

MOL_01_040821 C#CCC(O)NN1C=CN(NCC2CCN(c3ccnc4c3OCC4)CC2)CN1


Predictions…:  88%|█████████████████████████████████████████████████████▊       | 13001/14737 [5:49:18<41:26,  1.43s/it]

MOL_01_041882 COc1cc(-c2ccc3cccc(Nc4ccc(C)c(N)c4F)c3c2)ccc1C


Predictions…:  95%|█████████████████████████████████████████████████████████▉   | 14001/14737 [6:12:21<14:27,  1.18s/it]

MOL_01_042966 *=S(N)(=O)c1ccccc1CC=C


Predictions…: 100%|█████████████████████████████████████████████████████████████| 14737/14737 [6:28:39<00:00,  1.58s/it]

530532





In [10]:
# removing the new line characters
with open(smiles_file) as f:
    smiles_list = [line.rstrip() for line in f]

    
print(len(smiles_list))
print(smiles_list[:10])

38820
['CCOc(cc1)ccc1CCNC(=O)Cn(cc2CC)c(=O)c(c2C3=4)C=C3N=C(C4)CC', 'CCC1CCCCN1Cc(c2)n(c(c23)c(=O)n(C)c(=O)n3C)Cc4c(N)cccc4O', 'O=C(O)C(C1)=Nc2ncc(c=3c=12)C(=O)N(N3)c4ccccc4', 'CC(C1)=NC2=Cc(c3C=12)c(=O)n(cc3CC)CC(=O)NCCCN(C)Cc4ccccc4', 'CC(C)CC(C(=O)O)NC(=O)CC(C(N1)=O)=C(C)c(c2)c=1c(C)c(c23)N=CC=3C', 'c1cc(N)ccc1C(=O)Nc(c2)ccc(O)c2C34C(CCO4)CCC(=N3)N', 'CC1CCC(CC1)NC(=O)C(C)N(N=C2C)C(=O)C(C23C)=NN3c4ccccc4', 'c12c(O2)ccc(c1)\\N=C3\\C(c4ccccc4)C(C(=O)NCC)CC(=O)N3Cc(c5)ccc(c56)OCO6', 'NC(=O)CC(C)NC(C12CC1)c3c(O2)c(c(N)cc3)C(=O)c4ccncc4', 'C1CCCN1Cc2c(O)cc(cc2)C(c3c(O4)cc(N)cc3)Nc5ccnc(c6)c5cc(O7)c6C47O']


In [11]:
# Spot-check a single entry of the loaded list.
smiles_list[67]

'c1cccc(c12)NC(\\C2)=C(\\CC)c3ccnc(n3)N4CCOCC4'

In [12]:
from tqdm import tqdm

# Compounds in smiles_list carry no identifier, so each one is named
# <compound_prefix><running number>, numbered from 0 in list order.
compound_prefix = "GANt"
angle_list = [str(angle) for angle in range(10, 360, 10)]
total_image_count = 0
GAN_name_count = 0

for current_smiles in tqdm(smiles_list, desc="Predictions…", ascii=False, ncols=120):
    compound_id = f"{compound_prefix}{GAN_name_count}"

    try:
        # Unrotated image first, then one image per rotation angle.
        save_comp_imgs_from_smiles(protein_name, compound_id, current_smiles)
        total_image_count += 1

        for angle in angle_list:
            save_comp_imgs_from_smiles(protein_name, f"{compound_id}_{angle}",
                                       current_smiles, int(angle))
            total_image_count += 1

    except Exception as e:
        # Report the failing compound and carry on with the next one.
        print(e, GAN_name_count, compound_id, current_smiles)

    # Advance the running number whether or not rendering succeeded, so that
    # identifiers stay aligned with positions in smiles_list.
    GAN_name_count += 1

0 GANt0 CCOc(cc1)ccc1CCNC(=O)Cn(cc2CC)c(=O)c(c2C3=4)C=C3N=C(C4)CC
100 GANt100 c1cc(C)ccc1CCC(N2CC(N3)=O)=Nc(c4C2=3)cccc4
200 GANt200 Cc1c(C)cn(c12)ccnc2Nc(cc3)cc(O)c3C


KeyboardInterrupt: 