In [63]:
# Prepare file names
def file_paths(file_name, model, pool, data_folder = '../data_test'):
    """Build the output (.h5) and input (.fa) paths for one embedding run.

    Parameters
    ----------
    file_name : str
        Base name of the FASTA file, without the ``.fa`` extension.
    model : str
        Model identifier passed to ``embed_sequences`` (e.g. ``'prose_mt'``).
        Its second underscore-separated token (``'mt'``) is used as the model
        abbreviation in the output name. A name without an underscore is used
        whole instead of raising ``IndexError``.
    pool : str
        Pooling operation name; appended to the output file name.
    data_folder : str, optional
        Folder that holds the ``data_fa`` and ``data_h5`` sub-folders.

    Returns
    -------
    tuple of str
        ``(path_h5, path_fa)`` -- note the output path comes first.
    """
    import os

    # 'prose_mt' -> 'mt'; a name with no underscore falls back to itself.
    name_parts = model.split('_')
    mabr = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    file_fa = f'{file_name}.fa'
    path_fa = os.path.join(data_folder, 'data_fa', file_fa)

    file_h5 = f'{file_name}_{mabr}_{pool}.h5'
    path_h5 = os.path.join(data_folder, 'data_h5', file_h5)

    return path_h5, path_fa


In [24]:
%run ../prose/embed_sequences --help

usage: embed_sequences.py [-h] [-m MODEL] [-o OUTPUT]
                          [--pool {none,sum,max,avg}] [-d DEVICE]
                          path

positional arguments:
  path

optional arguments:
  -h, --help            show this help message and exit
  -m MODEL, --model MODEL
                        pretrained model to load, prose_mt loads the
                        pretrained ProSE MT model, prose_dlm loads the
                        pretrained Prose DLM model, otherwise unpickles torch
                        model directly (default: prose_mt)
  -o OUTPUT, --output OUTPUT
  --pool {none,sum,max,avg}
                        apply some sort of pooling operation over each
                        sequence (default: none)
  -d DEVICE, --device DEVICE
                        compute device to use


### ProSE MT model - prose_mt

In [69]:
model = 'prose_mt'
file_base = 'train_small1'

Pooling Operation:  sum

In [85]:
pool = 'sum'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [71]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE MT model
# writing: ../data_test/data_h5/train_small1_mt_sum.h5
# embedding with pool=sum
                                                                                

Pooling Operation:  avg

In [86]:
pool = 'avg'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [76]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE MT model
# writing: ../data_test/data_h5/train_small1_mt_avg.h5
# embedding with pool=avg
                                                                                

Pooling Operation:  max

In [87]:
pool = 'max'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [78]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE MT model
# writing: ../data_test/data_h5/train_small1_mt_max.h5
# embedding with pool=max
                                                                                

Pooling Operation:  none

In [88]:
pool = 'none'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [80]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE MT model
# writing: ../data_test/data_h5/train_small1_mt_none.h5
# embedding with pool=none
                                                                                

In [84]:
# Check the files: list the h5 output folder with sizes and timestamps.
# NOTE(review): relies on `os` and a global `data_folder` already living in the
# kernel; neither is defined at top level in the cells visible here -- confirm
# before a Restart & Run All.
!tree -DhL 1 "{os.path.join(data_folder, 'data_h5')}"

[34;42m../data_test/data_h5[00m
├── [225K Aug  4 17:47]  [01;32mdemo.h5[00m
├── [223K Aug 15 14:17]  [01;32mtrain_small1_mt_avg.h5[00m
├── [223K Aug 15 14:17]  [01;32mtrain_small1_mt_max.h5[00m
├── [ 79M Aug 15 14:17]  [01;32mtrain_small1_mt_none.h5[00m
└── [223K Aug 15 14:15]  [01;32mtrain_small1_mt_sum.h5[00m

0 directories, 5 files


### ProSE DLM model - prose_dlm

In [89]:
model = 'prose_dlm'
file_base = 'train_small1'

Pooling Operation:  sum

In [90]:
pool = 'sum'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [91]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: ../data_test/data_h5/train_small1_dlm_sum.h5
# embedding with pool=sum
                                                                                

Pooling Operation:  avg

In [92]:
pool = 'avg'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [93]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: ../data_test/data_h5/train_small1_dlm_avg.h5
# embedding with pool=avg
                                                                                

Pooling Operation:  max

In [94]:
pool = 'max'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [95]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: ../data_test/data_h5/train_small1_dlm_max.h5
# embedding with pool=max
                                                                                

Pooling Operation:  none

In [96]:
pool = 'none'  
path_h5, path_fa = file_paths(file_base, model, pool)

In [97]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: ../data_test/data_h5/train_small1_dlm_none.h5
# embedding with pool=none
                                                                                

In [98]:
# Check the files: list the h5 output folder with sizes and timestamps.
# NOTE(review): relies on `os` and a global `data_folder` already living in the
# kernel; neither is defined at top level in the cells visible here -- confirm
# before a Restart & Run All.
!tree -DhL 1 "{os.path.join(data_folder, 'data_h5')}"

[34;42m../data_test/data_h5[00m
├── [225K Aug  4 17:47]  [01;32mdemo.h5[00m
├── [223K Aug 15 14:23]  [01;32mtrain_small1_dlm_avg.h5[00m
├── [223K Aug 15 14:23]  [01;32mtrain_small1_dlm_max.h5[00m
├── [ 79M Aug 15 14:23]  [01;32mtrain_small1_dlm_none.h5[00m
├── [223K Aug 15 14:23]  [01;32mtrain_small1_dlm_sum.h5[00m
├── [223K Aug 15 14:17]  [01;32mtrain_small1_mt_avg.h5[00m
├── [223K Aug 15 14:17]  [01;32mtrain_small1_mt_max.h5[00m
├── [ 79M Aug 15 14:17]  [01;32mtrain_small1_mt_none.h5[00m
└── [223K Aug 15 14:15]  [01;32mtrain_small1_mt_sum.h5[00m

0 directories, 9 files


### DNA Binding Proteins

In [102]:
# Prepare file names
def file_paths(file_name, model, pool, data_folder = '../data_test'):
    """Return ``(path_h5, path_fa)`` for a dataset stored directly in *data_folder*.

    The FASTA input is ``<data_folder>/<file_name>.fa``; the embedding output
    is ``<data_folder>/train_h5/<file_name>_<abbr>_<pool>.h5``, where ``abbr``
    is the second underscore-separated token of *model*.
    """
    import os

    model_tag = model.split('_')[1]
    fasta_path = os.path.join(data_folder, '{}.fa'.format(file_name))
    embedding_path = os.path.join(
        data_folder,
        'train_h5',
        '{}_{}_{}.h5'.format(file_name, model_tag, pool),
    )
    return embedding_path, fasta_path


In [3]:
model = 'prose_dlm'
file_base = 'train'
data_path = '../data/dna_binding'

In [104]:
pool = 'sum'  
path_h5, path_fa = file_paths(file_base, model, pool, data_folder=data_path)

In [105]:
%run ../prose/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: ../data/dna_binding/data_h5/train_dlm_sum.h5
# embedding with pool=sum
                                                                                

Processed 14189 sequences in 42m 3.78s using GPU

**Note (08/27/22):** the h5 output folder was renamed `data_h5` → `train_h5`, which is why the log above still shows `data_h5`.  
A sub-folder for the model source (`prose` or `esm`) is added under `train_h5`.

In [30]:
# Check the files
!tree -DhL 2 "{os.path.join(data_path, 'train_h5')}"

[34;42m../data/dna_binding/train_h5[00m
├── [4.0K Aug 27 22:51]  [34;42mesm[00m
└── [4.0K Aug 27 23:56]  [34;42mprose[00m
    └── [339M Aug 16 11:19]  [01;32mdbp_train_dlm_sum.h5[00m

2 directories, 1 file


## TODO: change the file name and add the source folder

Prefix the h5 file name with the task name: `train_dlm_sum.h5` → `dbp_train_dlm_sum.h5`.



In [102]:
# Prepare file names
def file_paths(file_name, model, pool, data_folder = '../data'):
    """Build the embedding-output and FASTA-input paths under *data_folder*.

    Returns a ``(path_h5, path_fa)`` tuple. The h5 file goes in the
    ``train_h5`` sub-folder and is named ``<file_name>_<abbr>_<pool>.h5``,
    with ``abbr`` taken from the second underscore-separated token of *model*.
    """
    import os

    def in_data_folder(*parts):
        # Every path produced here is rooted at the same data folder.
        return os.path.join(data_folder, *parts)

    tag = model.split('_')[1]
    return (
        in_data_folder('train_h5', f'{file_name}_{tag}_{pool}.h5'),
        in_data_folder(f'{file_name}.fa'),
    )


In [8]:
model = 'prose_dlm'
file_base = 'train'
data_path = '../data/dna_binding'

In [17]:
task = 'acp'
pool = 'sum'  
path_h5, path_fa = file_paths(file_base, model, pool, data_folder=data_path)
print(path_fa, '\n', path_h5)

NameError: name 'file_paths' is not defined

In [26]:
# Scratch cell: resolve the task's data folder and the expected FASTA path.
data_folder = '../data'
file_fa = f'{file_base}.fa'

task = 'dbp'
# Only 'dbp' is stored under a folder name that differs from the task key.
task_folder = 'dna_binding' if task == 'dbp' else task

data_path = os.path.join(data_folder, task_folder)
print(data_path)
path_fa = os.path.join(data_path, file_fa)
path_fa

../data/dna_binding


'../data/dna_binding/train.fa'

In [10]:
files =os.listdir(data_path)
files

['test_data.csv',
 'test_data.fa',
 'test_h5',
 'test_parquet',
 'train_data.csv',
 'train_data.fa',
 'train_h5',
 'train_parquet']

In [12]:
# Pick the FASTA file for this split. Match on the file extension rather than
# the bare substring 'fa', and fail loudly when nothing matches so a stale
# `file_fa` from an earlier cell is never reused by accident.
fasta_matches = sorted(
    f for f in files
    if file_base in f and f.endswith(('.fa', '.fasta'))
)
if not fasta_matches:
    raise FileNotFoundError(f'no FASTA file matching {file_base!r} in {files}')
file_fa = fasta_matches[0]
print(file_fa)

train_data.fa


In [13]:
path_fa = os.path.join(data_path, file_fa)
path_fa

'../data/acp/train_data.fa'

In [19]:
 mabr = model.split('_')[1]
file_h5 = f'{task}_{file_base}_{mabr}_{pool}.h5'
dir_h5 = f'{file_base}_h5'

path_h5 = os.path.join(data_path, 'train_h5', source, file_h5)
print(path_h5)

../data/acp/train_h5/prose/acp_train_dlm_sum.h5


In [5]:
import os

In [24]:
source = 'prose'
task = 'dbp'
file_base = 'train'
model = 'prose_dlm'
pool = 'sum'  



In [20]:
# Prepare file names
def file_paths(source, task, file_base, model, pool, data_folder = '../data'):
    """Locate the input FASTA file and build the output h5 path for a task.

    Parameters
    ----------
    source : str
        Sub-folder of ``<file_base>_h5`` for the model family (e.g. ``'prose'``).
    task : str
        Task key (e.g. ``'dbp'``, ``'acp'``). It prefixes the h5 file name and
        names the task folder, except ``'dbp'`` which maps to ``'dna_binding'``.
    file_base : str
        Split name (e.g. ``'train'``); must occur in the FASTA file name.
    model : str
        Model identifier such as ``'prose_dlm'``; its second
        underscore-separated token is used as the abbreviation.
    pool : str
        Pooling operation name; appended to the h5 file name.
    data_folder : str, optional
        Root folder containing one sub-folder per task.

    Returns
    -------
    tuple of str
        ``(path_h5, path_fa)``.

    Raises
    ------
    FileNotFoundError
        If the task folder does not exist or holds no matching FASTA file.
    """
    mabr = model.split('_')[1]

    # The DNA-binding task lives in a folder named differently from its key.
    task_folder = 'dna_binding' if task == 'dbp' else task
    data_path = os.path.join(data_folder, task_folder)

    # Find the FASTA file for this split. Match on the extension rather than
    # the bare substring 'fa', and sort so the choice does not depend on the
    # arbitrary order returned by os.listdir.
    candidates = sorted(
        f for f in os.listdir(data_path)
        if file_base in f and f.endswith(('.fa', '.fasta'))
    )
    if not candidates:
        raise FileNotFoundError(
            f'no FASTA file matching {file_base!r} found in {data_path}'
        )
    path_fa = os.path.join(data_path, candidates[0])

    # Output layout: <data_path>/<file_base>_h5/<source>/<task>_<file_base>_<abbr>_<pool>.h5
    dir_h5 = f'{file_base}_h5'
    file_h5 = f'{task}_{file_base}_{mabr}_{pool}.h5'
    path_h5 = os.path.join(data_path, dir_h5, source, file_h5)

    return path_h5, path_fa


In [21]:
print(path_fa, '\n', path_h5)

../data/acp/train_data.fa 
 ../data/acp/train_h5/prose/acp_train_dlm_sum.h5


In [46]:
data_path

'../data/dna_binding'