# Imports and stationary stats

In [1]:
import os
import time
import pickle

from tqdm import tqdm

dir1 = '/rds-d2/user/wjm41/hpc-work/datasets/ZINC/real/'
dir2 = '/rds-d7/project/rds-ZNFRY9wKoeE/EnamineREAL/'
data_dir = dir2 + 'data/'

# Read the tranche list and reduce every entry to its bare name: the last
# path component, cut at the first '.'.  The handle is closed as soon as the
# lines are in memory.
with open(dir1+'all_tranches.txt', 'r') as file:
    all_tranches = file.read().splitlines()
all_tranches = [line.split('/')[-1].split('.')[0] for line in all_tranches]

def convert_bytes(num):
    """
    Convert a size in bytes to a human-readable string such as "1.5 KB".

    The value is divided by 1024 until it drops below 1024, then formatted
    with one decimal place and the matching unit.  Sizes of 1024 TB or more
    are reported in PB instead of running off the end of the unit list and
    returning None.
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0
    # Still >= 1024 after the TB step: report in PB rather than returning None.
    return "%3.1f %s" % (num, 'PB')

def file_size(file_path):
    """
    Return the size of ``file_path`` as a human-readable string.

    The result is None when ``file_path`` is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        return None
    size_in_bytes = os.stat(file_path).st_size
    return convert_bytes(size_in_bytes)
        


In [2]:
def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is 0."""
    return 100*(part/whole) if whole else 0.0


# Every sub-directory of data_dir counts as one downloaded tranche.
existing_dirs = [data_dir+x for x in os.listdir(data_dir) if os.path.isdir(data_dir+x)]

print('{:.2f}% ({}/{}) Downloaded!'.format(
    _pct(len(set(existing_dirs)), len(all_tranches)), len(existing_dirs), len(all_tranches)))

time.sleep(1)
empty_dirs = []
pickle_dirs = []
smi_dirs = []
pickle_and_smi = []
scored_dirs = []
dirs_to_score = []
mpi_dirs_to_score = []

for folder in tqdm(existing_dirs, smoothing=0):
    sdf_path = folder+'/mols.sdf'
    pickle_path = folder+'/pairs.pickle'
    smi_path = folder+'/mols.smi'

    # A folder is "empty" when mols.sdf exists but holds 0 bytes.
    if os.path.isfile(sdf_path) and os.stat(sdf_path).st_size == 0:
        empty_dirs.append(folder)

    has_pickle = os.path.isfile(pickle_path)
    has_smi = os.path.isfile(smi_path)
    if has_pickle:
        pickle_dirs.append(folder)
    if has_smi:
        smi_dirs.append(folder)

    if has_pickle and has_smi:
        # NOTE: pickle.load executes whatever the file encodes; only run this
        # on files produced by this pipeline.
        with open(pickle_path, 'rb') as handle:
            real_pairs = pickle.load(handle)
        with open(smi_path, 'r') as handle:
            real_smi = handle.read().splitlines()
        if len(real_smi) == len(real_pairs):
            pickle_and_smi.append(folder)

    # One directory listing serves both filters.
    entries = os.listdir(folder)
    pairs = [name for name in entries if "pairs_mpi_" in name]
    scores = [name for name in entries if "_mac" in name and ".csv" in name]

    if os.path.isfile(folder+'/scores_mac.csv'):
        scored_dirs.append(folder)
    elif pairs and len(pairs) <= len(scores):  # some dirs had MPI processes without pharmacophores
        scored_dirs.append(folder)
    elif len(pairs) > len(scores):
        mpi_dirs_to_score.append(folder)
    else:
        dirs_to_score.append(folder)

empty_lookup = set(empty_dirs)
nonempty_dirs = [x for x in existing_dirs if x not in empty_lookup]

n_existing = len(existing_dirs)
print('{:.2f}% ({}/{}) NonEmpty!'.format(
    _pct(len(set(nonempty_dirs)), n_existing), len(nonempty_dirs), n_existing))
# Denominator is the number of existing dirs (it used to be len(pickle_dirs),
# which always printed 100% or divided by zero).
print('{:.2f}% ({}/{}) Has Pickles!'.format(
    _pct(len(set(pickle_dirs)), n_existing), len(pickle_dirs), n_existing))
print('{:.2f}% ({}/{}) Has Smiles!'.format(
    _pct(len(set(smi_dirs)), n_existing), len(smi_dirs), n_existing))
print('{:.2f}% ({}/{}) Has Pickle lengths matching Smiles!'.format(
    _pct(len(set(pickle_and_smi)), n_existing), len(pickle_and_smi), n_existing))
print('{:.2f}% ({}/{}) Scored!'.format(
    _pct(len(set(scored_dirs)), n_existing), len(scored_dirs), n_existing))
print('len(dirs_to_score): {}'.format(len(dirs_to_score)))
print('len(mpi_dirs_to_score): {}'.format(len(mpi_dirs_to_score)))


100.00% (46570/46570) Downloaded!


 28%|██▊       | 12906/46570 [33:56<1:28:33,  6.34it/s]

In [4]:
import time
### SCORING ###

def count_scoring(path, write=False):
    """
    Report how many non-empty folders under ``path`` have been scored.

    ``path`` must be the module-level ``dir1`` or ``dir2``; the folder list
    is taken from ``nonempty_dirs1`` / ``nonempty_dirs2`` accordingly.

    A folder counts as scored when it contains ``scores_mac.csv``, or when it
    has at least one ``pairs_mpi_*`` file and at least as many ``*_mac*.csv``
    files.  Unscored folders with more ``pairs_mpi_*`` files than score files
    go to the MPI list; the rest go to the plain to-score list.

    When ``write`` is true the three lists are written to
    ``scored_dirs.txt``, ``to_score_mac.txt`` and ``to_score_mpi_mac.txt``
    under ``path``.

    Raises:
        Exception: if ``path`` is neither ``dir1`` nor ``dir2``.
    """
    if path==dir1:
        print('rds-d2')
        dirs = nonempty_dirs1
    elif path==dir2:
        print('rds-d7')
        dirs = nonempty_dirs2
    else:
        raise Exception('Wrong directory supplied!')
    time.sleep(1)
    scored_dirs = []
    dirs_to_score = []
    mpi_dirs_to_score = []
    for folder in tqdm(dirs):
        # One listing serves both filters.
        entries = os.listdir(path+folder)
        pairs = [name for name in entries if "pairs_mpi_" in name]
        scores = [name for name in entries if "_mac" in name and ".csv" in name]

        if os.path.isfile(path+folder+'/scores_mac.csv'):
            scored_dirs.append(folder)
        elif pairs and len(pairs) <= len(scores):  # some dirs had MPI processes without pharmacophores
            scored_dirs.append(folder)
        elif len(pairs) > len(scores):
            mpi_dirs_to_score.append(folder)
        else:
            dirs_to_score.append(folder)

    # Guard against an empty folder list instead of dividing by zero.
    scored_pct = 100*(len(set(scored_dirs))/len(dirs)) if dirs else 0.0
    print('{:.2f}% Scored!'.format(scored_pct))

    print('len(dirs_to_score): {}'.format(len(dirs_to_score)))
    print('len(mpi_dirs_to_score): {}'.format(len(mpi_dirs_to_score)))

    if write:
        outputs = (
            ('scored_dirs.txt', scored_dirs),
            ('to_score_mac.txt', dirs_to_score),
            ('to_score_mpi_mac.txt', mpi_dirs_to_score),
        )
        for filename, lines in outputs:
            # "with" closes the file even if a write fails part-way.
            with open(path+filename, 'w') as f:
                f.writelines(line+'\n' for line in lines)

count_scoring(dir1, write=True)
count_scoring(dir2, write=True)

rds-d2


100%|██████████| 15143/15143 [02:39<00:00, 94.92it/s] 


92.23% Scored!
len(dirs_to_score): 532
len(mpi_dirs_to_score): 645
rds-d7


100%|██████████| 29941/29941 [05:39<00:00, 88.28it/s] 

94.09% Scored!
len(dirs_to_score): 194
len(mpi_dirs_to_score): 1575





In [3]:
def find_nonempty(path):
    """
    Return the folders under ``path`` whose ``mols.sdf`` is not a 0-byte file.

    ``path`` must be the module-level ``dir1`` or ``dir2``; the candidate
    folders come from ``dirs1`` / ``dirs2`` accordingly.  Folders with no
    ``mols.sdf`` at all are kept, as are folders with a non-empty one.  The
    input order is preserved.

    Raises:
        Exception: if ``path`` is neither ``dir1`` nor ``dir2``.
    """
    if path==dir1:
        print('rds-d2')
        dirs = dirs1
    elif path==dir2:
        print('rds-d7')
        dirs = dirs2
    else:
        raise Exception('Wrong directory supplied!')

    def _has_empty_sdf(folder):
        sdf = path+folder+'/mols.sdf'
        return os.path.isfile(sdf) and os.stat(sdf).st_size==0

    # Single pass; avoids the quadratic "x not in empty_dirs" list lookup.
    nonempty_dirs = [folder for folder in dirs if not _has_empty_sdf(folder)]

    nonempty_pct = 100*(len(set(nonempty_dirs))/len(dirs)) if dirs else 0.0
    print('{:.2f}% NonEmpty!'.format(nonempty_pct))

    return nonempty_dirs

nonempty_dirs1 = find_nonempty(dir1)
nonempty_dirs2 = find_nonempty(dir2)

rds-d2
97.33% NonEmpty!
rds-d7
96.43% NonEmpty!


In [8]:
import time

def count_empty(path, write=False):
    """
    Classify the still-unscored folders under ``path`` by their ``mols.sdf``.

    The folders are read from ``to_score_mac.txt`` and
    ``to_score_mpi_mac.txt`` under ``path``.  Each one is counted as empty
    (``mols.sdf`` is 0 bytes), non-empty, or lacking ``mols.sdf`` entirely,
    and the totals are printed.  The "Empty" percentage is relative to the
    full ``dirs1`` / ``dirs2`` list.

    ``path`` must be the module-level ``dir1`` or ``dir2``.  ``write`` is
    accepted for call compatibility but currently has no effect.

    Raises:
        Exception: if ``path`` is neither ``dir1`` nor ``dir2``.
    """
    if path==dir1:
        print('rds-d2')
        dirs = dirs1
    elif path==dir2:
        print('rds-d7')
        dirs = dirs2
    else:
        raise Exception('Wrong directory supplied!')

    time.sleep(1)
    empty_dirs = []
    nonempty_dirs = []
    nosdf_dirs = []
    with open(path+'to_score_mac.txt', 'r') as f:
        dirs_to_score = f.read().splitlines()
    with open(path+'to_score_mpi_mac.txt', 'r') as f:
        mpi_dirs_to_score = f.read().splitlines()

    # Same classification for both lists; one progress bar per list.
    for pending in (dirs_to_score, mpi_dirs_to_score):
        for folder in tqdm(pending):
            sdf = path+folder+'/mols.sdf'
            if not os.path.isfile(sdf):
                nosdf_dirs.append(folder)
            elif os.stat(sdf).st_size==0:
                empty_dirs.append(folder)
            else:
                nonempty_dirs.append(folder)

    empty_pct = 100*(len(set(empty_dirs))/len(dirs)) if dirs else 0.0
    print('{:.2f}% Empty!'.format(empty_pct))

    print('len(empty_dirs): {}'.format(len(empty_dirs)))
    print('len(nonempty_dirs): {}'.format(len(nonempty_dirs)))
    print('len(nosdf_dirs): {}'.format(len(nosdf_dirs)))

count_empty(dir1, write=True)
count_empty(dir2, write=True)

rds-d2


100%|██████████| 914/914 [00:00<00:00, 4438.41it/s]
100%|██████████| 674/674 [00:00<00:00, 38360.80it/s]


2.64% Empty!
len(empty_dirs): 411
len(nonempty_dirs): 0
len(nosdf_dirs): 1177
rds-d7


100%|██████████| 976/976 [00:00<00:00, 4375.38it/s]
100%|██████████| 1903/1903 [00:00<00:00, 3359.51it/s]

3.57% Empty!
len(empty_dirs): 1109
len(nonempty_dirs): 1
len(nosdf_dirs): 1769





In [6]:

# NOTE: despite the variable names, both lists are read from missing.txt.
with open(dir1+'missing.txt', 'r') as f:
    scored_dirs1 = f.read().splitlines()
with open(dir2+'missing.txt', 'r') as f:
    scored_dirs2 = f.read().splitlines()

def find_missing(dir, dirs):
    """
    Print how many folders in ``dirs`` have no ``topN_mac.csv`` under ``dir``.

    ``dir`` is the root path (string, ending in a separator) and ``dirs`` a
    sized sequence of folder names relative to it.  Nothing is returned.
    """
    missing_dirs = [folder for folder in tqdm(dirs)
                    if not os.path.isfile(dir+folder+'/topN_mac.csv')]
    print(len(missing_dirs))

# find_missing(dir1, scored_dirs1)
find_missing(dir2, scored_dirs2)

100%|██████████| 4/4 [00:00<00:00, 1183.74it/s]

1





In [4]:
import datetime
import pathlib

MB = 1048576  # 1024 * 1024 bytes


def _triage_bad_dirs(root, bad_dirs, fresh_days, skip_done=False, show_stale_day=False):
    """
    Sort the folders in ``bad_dirs`` (relative to ``root``) by their mols.sdf.

    Returns ``(n_zero, n_bad, to_proc, to_proc_big, tot_size)``:
      n_zero      -- folders whose mols.sdf is 0 bytes
      n_bad       -- folders with no mols.sdf at all
      to_proc     -- folders whose mols.sdf is non-empty and at most MB bytes
      to_proc_big -- folders whose mols.sdf exceeds MB bytes and that have no
                     scores11_mac.csv, or one whose ctime day-of-month is not
                     in ``fresh_days``
      tot_size    -- summed mols.sdf size of the ``to_proc_big`` folders

    With ``skip_done`` set, folders containing done.txt are ignored.  With
    ``show_stale_day`` set, the day-of-month of each stale score file is
    printed.
    """
    n_zero = 0
    n_bad = 0
    tot_size = 0
    to_proc = []
    to_proc_big = []

    for bad_dir in tqdm(bad_dirs):
        sdf = root+bad_dir+'/mols.sdf'
        if not os.path.isfile(sdf):
            n_bad += 1
            continue
        if skip_done and os.path.isfile(root+bad_dir+'/done.txt'):
            continue

        size = os.stat(sdf).st_size
        if size == 0:
            n_zero += 1
        elif size <= MB:
            to_proc.append(bad_dir)
        else:
            scores = root+bad_dir+'/scores11_mac.csv'
            if os.path.isfile(scores):
                c_date = datetime.datetime.fromtimestamp(pathlib.Path(scores).stat().st_ctime)
                if c_date.day in fresh_days:
                    continue  # scored recently enough; leave it alone
                if show_stale_day:
                    print(c_date.day)
            tot_size += size
            to_proc_big.append(bad_dir)

    return n_zero, n_bad, to_proc, to_proc_big, tot_size


# NOTE(review): `dir` is not defined in the visible cells; the 'rds-d2' label
# suggests it should hold the same path as dir1 -- confirm.
with open(dir+'bad_dirs.txt','r') as f:
    bad_dirs1 = f.read().split()

# The day test used to read `c_date.day!=16 and c_date!=15`; the second half
# compared a datetime with an int and was always true, so day 15 was never
# treated as fresh.
n_zero, n_bad, to_proc, to_proc_big, tot_size = _triage_bad_dirs(
    dir, bad_dirs1, fresh_days=(15, 16), skip_done=True, show_stale_day=True)

tot_len = len(to_proc) + len(to_proc_big)
print('rds-d2')
print('{} empty dirs'.format(n_zero))
print('{} small dirs'.format(len(to_proc)))
print('{} big dirs'.format(len(to_proc_big)))
print('Total file size: {}'.format(convert_bytes(tot_size)))
if tot_len!=0:
    # tot_size only covers the big dirs, while tot_len also counts small ones.
    print('Average file size: {}'.format(convert_bytes(tot_size/tot_len)))

with open(dir+'big_mac.txt','w') as f:
    f.writelines(line+'\n' for line in to_proc_big)

with open(dir2+'bad_dirs.txt','r') as f:
    bad_dirs2 = f.read().split()

n_zero, n_bad, to_proc, to_proc_big, tot_size = _triage_bad_dirs(
    dir2, bad_dirs2, fresh_days=(15, 16, 17, 18))

tot_len = len(to_proc) + len(to_proc_big)
print('rds-d7')
print('{} empty dirs'.format(n_zero))
print('{} bad dirs left'.format(n_bad))
print('{} small dirs'.format(len(to_proc)))
print('{} big dirs'.format(len(to_proc_big)))
print('Total file size: {}'.format(convert_bytes(tot_size)))
if tot_len!=0:
    print('Average file size: {}'.format(convert_bytes(tot_size/tot_len)))

with open(dir2+'big_mac.txt','w') as f:
    f.writelines(line+'\n' for line in to_proc_big)

100%|██████████| 1035/1035 [00:00<00:00, 1857.26it/s]
  2%|▏         | 128/6144 [00:00<00:04, 1274.55it/s]

rds-d2
415 empty dirs
0 small dirs
0 big dirs
Total file size: 0.0 bytes


100%|██████████| 6144/6144 [00:03<00:00, 1686.48it/s]

rds-d7
1110 empty dirs
1244 bad dirs left
1 small dirs
0 big dirs
Total file size: 0.0 bytes
Average file size: 0.0 bytes





In [None]:
# For each root, list the entries of `data` that contain the name of a bad
# folder and save them to bad_tranches.txt.  A line is repeated once per bad
# folder it matches, as before.
with open(dir+'bad_dirs.txt','r') as f:
    bad_dirs1 = f.read().split()
print(len(bad_dirs1))
bad_tranches = [line for line in data for bad_dir in bad_dirs1 if bad_dir in line]
print(len(bad_tranches))
# "with" closes this file; the old code rebound `f` without closing it.
with open(dir+'bad_tranches.txt','w') as f:
    f.writelines(line+'\n' for line in bad_tranches)

with open(dir2+'bad_dirs.txt','r') as f:
    bad_dirs2 = f.read().split()
print(len(bad_dirs2))
bad_tranches = [line for line in data for bad_dir in bad_dirs2 if bad_dir in line]
print(len(bad_tranches))
with open(dir2+'bad_tranches.txt','w') as f:
    f.writelines(line+'\n' for line in bad_tranches)

In [12]:
import numpy as np
import pickle


def _n_lines(smi_path):
    """Return the number of lines in the text file at smi_path."""
    with open(smi_path, 'r') as handle:
        return len(handle.read().splitlines())


def _n_pickled(pickle_path):
    """Return len() of the object pickled at pickle_path."""
    # NOTE: pickle.load executes whatever the file encodes; only use it on
    # files produced by this pipeline.
    with open(pickle_path, 'rb') as handle:
        return len(pickle.load(handle))


# A folder is "bad" when a .smi file and its matching pickle disagree on the
# number of entries.  Folders missing one of the files are skipped here.
rescore_dirs = []
bad_dirs = []
for path in tqdm(dirs1):
    base = dir+path
    n_chunks = sum("pairs_mpi_" in name for name in os.listdir(base))
    try:
        if n_chunks:
            # MPI layout: mols<n>.smi pairs with pairs_mpi_<n>.pickle.
            # any() stops at the first mismatching chunk.
            mismatch = any(
                _n_lines(base+'/mols'+str(n)+'.smi')
                != _n_pickled(base+'/pairs_mpi_'+str(n)+'.pickle')
                for n in range(n_chunks))
        else:
            mismatch = _n_lines(base+'/mols.smi') != _n_pickled(base+'/pairs.pickle')
    except FileNotFoundError:
        continue
    if mismatch:
        bad_dirs.append(path)
print('Num bad dirs: {}'.format(len(rescore_dirs + bad_dirs)))
print(len(rescore_dirs))
print(len(bad_dirs))
with open(dir+'bad_dirs.txt', 'w') as f:
    f.writelines(line+'\n' for line in rescore_dirs + bad_dirs)


100%|██████████| 15558/15558 [1:48:43<00:00,  2.38it/s]  

Num bad dirs: 651
0
651





In [17]:
import numpy as np
import pickle


def _n_lines(smi_path):
    """Return the number of lines in the text file at smi_path."""
    with open(smi_path, 'r') as handle:
        return len(handle.read().splitlines())


def _n_pickled(pickle_path):
    """Return len() of the object pickled at pickle_path."""
    # NOTE: pickle.load executes whatever the file encodes; only use it on
    # files produced by this pipeline.
    with open(pickle_path, 'rb') as handle:
        return len(pickle.load(handle))


# A folder is "bad" when a .smi file and its matching pickle disagree on the
# number of entries, or when one of the two files is missing.
rescore_dirs = []
bad_dirs = []
for path in tqdm(dirs2):
    base = dir2+path
    n_chunks = sum("pairs_mpi_" in name for name in os.listdir(base))
    try:
        if n_chunks:
            # MPI layout: mols<n>.smi pairs with pairs_mpi_<n>.pickle.
            # any() stops at the first mismatching chunk.
            mismatch = any(
                _n_lines(base+'/mols'+str(n)+'.smi')
                != _n_pickled(base+'/pairs_mpi_'+str(n)+'.pickle')
                for n in range(n_chunks))
        else:
            mismatch = _n_lines(base+'/mols.smi') != _n_pickled(base+'/pairs.pickle')
    except FileNotFoundError:
        bad_dirs.append(path)
        continue
    if mismatch:
        bad_dirs.append(path)
print('Num bad dirs: {}'.format(len(rescore_dirs + bad_dirs)))
print(len(rescore_dirs))
print(len(bad_dirs))
with open(dir2+'bad_dirs.txt', 'w') as f:
    f.writelines(line+'\n' for line in rescore_dirs + bad_dirs)

100%|██████████| 31051/31051 [4:55:00<00:00,  1.75it/s]   


Num bad dirs: 5199
0
5199


In [None]:
import numpy as np


def _n_lines(smi_path):
    """Return the number of lines in the text file at smi_path."""
    with open(smi_path, 'r') as handle:
        return len(handle.read().splitlines())


# A folder needs rescoring when a .smi file and its scores array differ in
# length; it is "bad" when one of the two files is missing.
rescore_dirs2 = []
bad_dirs2 = []
for path in tqdm(dirs2):
    base = dir2+path
    n_chunks = sum("pairs_mpi_" in name for name in os.listdir(base))
    try:
        if n_chunks:
            # MPI layout: mols<n>.smi pairs with scores<n>.npy.
            # any() stops at the first mismatching chunk.
            mismatch = any(
                _n_lines(base+'/mols'+str(n)+'.smi')
                != len(np.load(base+'/scores'+str(n)+'.npy'))
                for n in range(n_chunks))
        else:
            mismatch = _n_lines(base+'/mols.smi') != len(np.load(base+'/scores.npy'))
    except FileNotFoundError:
        bad_dirs2.append(path)
        continue
    if mismatch:
        rescore_dirs2.append(path)
# Summarise this cell's own lists (the total used to read `bad_dirs`, a
# variable belonging to a different cell).
print('Num bad dirs: {}'.format(len(rescore_dirs2 + bad_dirs2)))
print(len(rescore_dirs2))
print(len(bad_dirs2))
with open(dir2+'bad_dirs.txt', 'w') as f:
    f.writelines(line+'\n' for line in rescore_dirs2 + bad_dirs2)

In [None]:
# Save both folder lists, one folder name per line.
with open(dir+'dirs1.txt', 'w') as out:
    for fold in dirs1:
        out.write(fold+'\n')
with open(dir2+'dirs2.txt', 'w') as out:
    for fold in dirs2:
        out.write(fold+'\n')

# DIR1 = /rds-d2/

In [None]:
hundred = 104857600  # 100 * 1024 * 1024 bytes

_LABELS = ('Big', 'Med', 'Small')
_OUT_FILES = (('Big', 'bigs.txt'), ('Med', 'meds.txt'), ('Small', 'smalls.txt'))


def _bucket_by_size(filename, split_marker):
    """
    Group folders by the size of ``filename`` inside them.

    Only folders that contain ``filename`` and do not contain
    ``split_marker`` are considered.  Files above 4*hundred bytes are 'Big',
    above hundred bytes 'Med', any other non-empty file 'Small'; 0-byte files
    are ignored.  Returns ``(folds, sizes)``: two dicts keyed by label holding
    the folder names and the summed file sizes.
    """
    folds = {label: [] for label in _LABELS}
    sizes = {label: 0 for label in _LABELS}
    for folder in tqdm(folders):
        target = dir+folder+'/'+filename
        if not os.path.isfile(target) or os.path.isfile(dir+folder+'/'+split_marker):
            continue
        size = os.stat(target).st_size
        if size > 4*hundred:
            label = 'Big'
        elif size > hundred:
            label = 'Med'
        elif size != 0:
            label = 'Small'
        else:
            continue
        folds[label].append(folder)
        sizes[label] += size
    return folds, sizes


def _report_and_save(kind, folds, sizes, mode):
    """Print the per-bucket statistics and write each bucket's folder list."""
    for label in _LABELS:
        print('\n{} {} Prop: {:.2f}%'.format(label, kind, 100*len(folds[label])/len(folders)))
        print('{} {} Size: {}'.format(label, kind, convert_bytes(sizes[label])))
    print()
    for label, out_name in _OUT_FILES:
        print(len(folds[label]))
        # Each list file is closed as soon as it is written.
        with open(dir+out_name, mode) as f:
            f.writelines(line+'\n' for line in folds[label])


# Pass 1: un-split mols.sdf files; starts the three list files afresh.
sdf_folds, sdf_sizes = _bucket_by_size('mols.sdf', 'mols_mpi_0.pickle')
_report_and_save('sdf', sdf_folds, sdf_sizes, 'w')

# Pass 2: un-split mols.pickle files, appended to the same list files.  This
# scan runs once; the previous copy-pasted second loop counted and wrote
# every folder twice.
pickle_folds, pickle_sizes = _bucket_by_size('mols.pickle', 'mols0.pickle')
_report_and_save('pickle', pickle_folds, pickle_sizes, 'a')

big_folds = pickle_folds['Big']
med_folds = pickle_folds['Med']
small_folds = pickle_folds['Small']

In [None]:
# NOTE: the list is read from meds.txt even though it is called big_folds.
with open(dir+'/meds.txt') as f:
    big_folds = f.read().splitlines()


print(len(big_folds))
del_folds = []

for folder in tqdm(big_folds):
    if not os.path.isfile(dir+folder+'/mols.pickle'):
        continue
    # mols.pickle is listed for deletion only when pairs.pickle AND all four
    # pairs_mpi_<n>.pickle files (n = 0..3) are present.
    has_pairs = os.path.isfile(dir+folder+'/pairs.pickle')
    has_all_chunks = has_pairs and all(
        os.path.isfile(dir+folder+'/pairs_mpi_'+str(n)+'.pickle') for n in range(4))
    if has_all_chunks:
        del_folds.append(folder)

print(len(del_folds))

with open(dir+'to_del.txt', 'w') as f:
    f.writelines(line+'\n' for line in del_folds)




# DIR2 = /rds-d7/

In [None]:
# Read the list fully and close the handle before the same file is rewritten.
with open(dir2+'/to_clean.txt') as f:
    to_clean = f.read().splitlines()

# Keep only the folders in which at least one of mols0.smi .. mols10.smi is
# a 0-byte file.  os.stat raises FileNotFoundError if one of those files is
# missing, as before.
del_folds = []
for folder in tqdm(to_clean):
    has_empty_smi = any(
        os.stat(dir2+folder+'/mols'+str(n)+'.smi').st_size==0 for n in range(11))
    if has_empty_smi:
        del_folds.append(folder)
print(del_folds)

with open(dir2+'to_clean.txt', 'w') as f:
    f.writelines(line+'\n' for line in del_folds)



In [None]:
pickle_dirs = []
pickle_dirs2 = []
pickle_mpi_dirs = []
pickle_mpi_dirs2 = []
no_pickledirs = []

# Build the lookup once; testing membership in the list inside the loop made
# the scan quadratic in the number of folders.
existing = set(existing_dirs)

# Each existing folder lands in the first category that matches, checked in
# this order: dir single pickle, dir MPI pickles, dir2 single, dir2 MPI.
for folder in tqdm(folders):
    if folder not in existing:
        continue
    if os.path.isfile(dir+folder+'/pairs.pickle'):
        pickle_dirs.append(folder)
    elif os.path.isfile(dir+folder+'/pairs_mpi_0.pickle') or os.path.isfile(dir+folder+'/pairs_mpi_4.pickle'):
        pickle_mpi_dirs.append(folder)
    elif os.path.isfile(dir2+folder+'/pairs.pickle'):
        pickle_dirs2.append(folder)
    elif os.path.isfile(dir2+folder+'/pairs_mpi_0.pickle') or os.path.isfile(dir2+folder+'/pairs_mpi_4.pickle'):
        pickle_mpi_dirs2.append(folder)
    else:
        no_pickledirs.append(folder)

done_dirs = pickle_dirs + pickle_dirs2 + pickle_mpi_dirs + pickle_mpi_dirs2
print(len(done_dirs))

print('{:.2f}% Pickled!'.format(100*(len(list(set(done_dirs)))/len(folders))))

print('Not yet pickled:')
print(no_pickledirs)

In [None]:
# Write the MPI-pickled folder lists; each file is closed as soon as it is
# written (the first handle used to be left open).
with open(dir+'to_check.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_mpi_dirs)
print(len(pickle_mpi_dirs))
with open(dir2+'to_check.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_mpi_dirs2)
print(len(pickle_mpi_dirs2))

In [None]:
# from frag_funcs import return_pcore_dataframe

import pickle

fail_dirs = []
fail_dirs2 = []


def _pickle_loads(pickle_path):
    """Return True when the file at pickle_path can be unpickled."""
    # NOTE: pickle.load executes whatever the file encodes; only use it on
    # files produced by this pipeline.
    with open(pickle_path, 'rb') as handle:
        try:
            pickle.load(handle)
        except Exception:
            # Any load error marks the pickle as failed.
            return False
    return True


# fail_dirs / fail_dirs2 collect the POSITION of each failing folder in
# `folders` (not its name); the positions are used to index `data` below.
for i, folder in tqdm(enumerate(folders), total = len(folders)):
    for root, failures in ((dir, fail_dirs), (dir2, fail_dirs2)):
        pairs_path = root+folder+'/pairs.pickle'
        if os.path.isfile(pairs_path) and not os.path.isfile(root+folder+'/scores.npy'):
            if not _pickle_loads(pairs_path):
                failures.append(i)

failed_dirs = fail_dirs + fail_dirs2
print(len(failed_dirs))

print('{:.2f}% Failed pickles'.format(100*(len(list(set(failed_dirs)))/len(folders))))

with open(dir+'fail_dirs.txt', 'w') as f:
    f.writelines(data[i]+'\n' for i in fail_dirs)
print(len(fail_dirs))

with open(dir2+'fail_dirs.txt', 'w') as f:
    f.writelines(data[i]+'\n' for i in fail_dirs2)
print(len(fail_dirs2))

In [None]:
pickle_dirs = []
pickle_dirs2 = []
pickle_mpi_dirs = []
pickle_mpi_dirs2 = []

existing = set(existing_dirs)
# failed_dirs is filled with positions into `folders`, so comparing folder
# names against it never excluded anything.  Translate positions to names
# first; entries that already are names are kept as they are.
failed = {folders[entry] if isinstance(entry, int) else entry for entry in failed_dirs}

# Each eligible folder lands in the first matching category: a pickle is
# present but the corresponding scores file is not.
for folder in tqdm(folders):
    if folder not in existing or folder in failed:
        continue
    if os.path.isfile(dir+folder+'/pairs.pickle') and not os.path.isfile(dir+folder+'/scores.npy'):
        pickle_dirs.append(folder)
    elif os.path.isfile(dir+folder+'/pairs_mpi_0.pickle') and not os.path.isfile(dir+folder+'/scores0.npy'):
        pickle_mpi_dirs.append(folder)
    elif os.path.isfile(dir2+folder+'/pairs.pickle') and not os.path.isfile(dir2+folder+'/scores.npy'):
        pickle_dirs2.append(folder)
    elif os.path.isfile(dir2+folder+'/pairs_mpi_0.pickle') and not os.path.isfile(dir2+folder+'/scores0.npy'):
        pickle_mpi_dirs2.append(folder)

# NOTE: despite its name, scored_dirs holds the folders still to be scored.
scored_dirs = pickle_dirs + pickle_dirs2 + pickle_mpi_dirs + pickle_mpi_dirs2
print(len(scored_dirs))

print('{:.2f}% Pickled but not Scored!'.format(100*(len(list(set(scored_dirs)))/len(folders))))

In [None]:
# Write the four to-score lists; each file is closed as soon as it is written
# (previously only the last handle was closed).
with open(dir+'to_score.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_dirs)
print(len(pickle_dirs))
with open(dir+'to_score_mpi.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_mpi_dirs)
print(len(pickle_mpi_dirs))
with open(dir2+'to_score.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_dirs2)
print(len(pickle_dirs2))
with open(dir2+'to_score_mpi.txt', 'w') as f:
    f.writelines(line+'\n' for line in pickle_mpi_dirs2)
print(len(pickle_mpi_dirs2))

In [None]:
scored_dirs = []
scored_dirs2 = []

# Build the lookup once instead of scanning the list for every folder.
existing = set(existing_dirs)

# A folder is scored when scores.npy or scores0.npy exists under dir or dir2;
# the first match in the order below decides which list it joins.
for folder in tqdm(folders):
    if folder not in existing:
        continue
    if os.path.isfile(dir+folder+'/scores.npy'):
        scored_dirs.append(folder)
    elif os.path.isfile(dir2+folder+'/scores.npy'):
        scored_dirs2.append(folder)
    elif os.path.isfile(dir+folder+'/scores0.npy'):
        scored_dirs.append(folder)
    elif os.path.isfile(dir2+folder+'/scores0.npy'):
        scored_dirs2.append(folder)

scored_dirs = scored_dirs + scored_dirs2
print(len(scored_dirs))

print('{:.2f}% Scored!'.format(100*(len(list(set(scored_dirs)))/len(folders))))

In [None]:
import time

# print(time.localtime(os.path.getmtime(dir2+folders[0]+'/scores.npy')))

# Build the lookup once instead of scanning the list for every folder.
existing = set(existing_dirs)

# A folder needs rescoring when its scores.npy was last modified on a day of
# the month before the 10th (only the day is checked, not month or year) and
# pairs.pickle is still available to score from.
old_dirs = []
old_dirs_mpi = []
for folder in tqdm(folders):
    if folder not in existing:
        continue
    scores_path = dir+folder+'/scores.npy'
    if (os.path.isfile(scores_path)
            and time.localtime(os.path.getmtime(scores_path)).tm_mday < 10
            and os.path.isfile(dir+folder+'/pairs.pickle')):
        old_dirs.append(folder)
print(len(old_dirs))

print('{:.2f}% Need Rescoring!'.format(100*(len(list(set(old_dirs)))/len(folders))))

In [None]:
# Save the folders that need rescoring, one per line, then report the count.
with open(dir+'rescore.txt', 'w') as out:
    for line in old_dirs:
        out.write(line+'\n')
    print(len(old_dirs))

In [None]:
def _as_folder(entry):
    """Map a position in `folders` to the folder name; pass names through."""
    return folders[entry] if isinstance(entry, int) else entry


def _sum_pickle_sizes(root, failed):
    """
    Sum the size of mols.pickle over the failed folders under ``root``.

    Returns ``(total_bytes, weird)`` where ``weird`` lists the folders whose
    mols.pickle could not be stat'ed.
    """
    total = 0
    weird = []
    for fold in map(_as_folder, failed):
        try:
            total += os.stat(root+fold+'/mols.pickle').st_size
        except OSError:
            weird.append(fold)
    return total, weird


# fail_dirs / fail_dirs2 are filled with positions into `folders`; used
# directly, every `dir+fold` raised TypeError (swallowed by the broad except)
# and the writes below then failed on `line+'\n'`.
print('DIR #1')
fail_size, weird_dirs = _sum_pickle_sizes(dir, fail_dirs)

print('DIR #2')
fail_size2, weird_dirs2 = _sum_pickle_sizes(dir2, fail_dirs2)

print('Fail Size: {}'.format(convert_bytes(fail_size)))
print('Fail2 Size: {}'.format(convert_bytes(fail_size2)))

with open(dir+'weird_dirs.txt', 'w') as f:
    f.writelines(line+'\n' for line in weird_dirs)
print(len(weird_dirs))

with open(dir2+'weird_dirs.txt', 'w') as f:
    f.writelines(line+'\n' for line in weird_dirs2)
print(len(weird_dirs2))



# Progress Measurement

In [None]:
from tqdm import tqdm

def convert_bytes(num):
    """
    Convert a size in bytes to a human-readable string such as "1.5 KB".

    The value is divided by 1024 until it drops below 1024, then formatted
    with one decimal place and the matching unit.  Sizes of 1024 TB or more
    are reported in PB instead of running off the end of the unit list and
    returning None.
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0
    # Still >= 1024 after the TB step: report in PB rather than returning None.
    return "%3.1f %s" % (num, 'PB')

def file_size(file_path):
    """
    Return the size of ``file_path`` as a human-readable string.

    The result is None when ``file_path`` is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        return None
    size_in_bytes = os.stat(file_path).st_size
    return convert_bytes(size_in_bytes)
        
def _completion(n_done):
    """Return n_done as a percentage of all folders (0.0 if there are none)."""
    return 100*n_done/len(folders) if len(folders) else 0.0


def _estimate(size_so_far, n_done):
    """Scale size_so_far from n_done folders up to all folders (0 if none are done)."""
    return size_so_far*len(folders)/n_done if n_done else 0


dirs = 0

smi_dir = 0
sdf_dir = 0
pickle_dir = 0
score_dir = 0

smi_size = 0
sdf_size = 0
pickle_size = 0
score_size = 0

pickle_dirs = []
for folder in tqdm(folders):
    base = dir+folder
    if os.path.isdir(base):
        dirs+=1
    if os.path.isfile(base+'/mols.smi'):
        smi_dir+=1
        smi_size+=os.stat(base+'/mols.smi').st_size
    if os.path.isfile(base+'/mols.sdf'):
        # 0-byte sdf files do not count as completed.
        this_sdf_size = os.stat(base+'/mols.sdf').st_size
        if this_sdf_size!=0:
            sdf_dir+=1
            sdf_size+=this_sdf_size
    if os.path.isfile(base+'/pairs.pickle'):
        pickle_dir+=1
        pickle_size+=os.stat(base+'/pairs.pickle').st_size
        pickle_dirs.append(folder)
    if os.path.isfile(base+'/scores.npy'):
        score_dir+=1
        score_size+=os.stat(base+'/scores.npy').st_size

print('Download Completion: {:.2f}%'.format(_completion(dirs)))

print('\nSMI Completion: {:.2f}%'.format(_completion(smi_dir)))
print('SMI Size: {}'.format(convert_bytes(smi_size)))
# Extrapolate from the number of SMI folders (this used to divide by score_dir).
print('Estimated SMI Size: {}'.format(convert_bytes(_estimate(smi_size, smi_dir))))

print('\nSDF Completion: {:.2f}%'.format(_completion(sdf_dir)))
print('SDF Size: {}'.format(convert_bytes(sdf_size)))
print('Estimated SDF Size: {}'.format(convert_bytes(_estimate(sdf_size, sdf_dir))))

print('\nPickle Completion: {:.2f}%'.format(_completion(pickle_dir)))
print('Pickle Size: {}'.format(convert_bytes(pickle_size)))
print('Estimated Pickle Size: {}'.format(convert_bytes(_estimate(pickle_size, pickle_dir))))

print('\nScore Completion: {:.2f}%'.format(_completion(score_dir)))
print('Estimated Score Size: {}'.format(convert_bytes(_estimate(score_size, score_dir))))