## Working with saved model and Sequence

In [5]:
import os

In [6]:
# Import the script from different folder
import sys  
sys.path.append('../scripts')

import file_utilities as fu

## ProSE embeddings


Initialize arguments

In [12]:
# Arguments for fu.file_paths; model and pool are also reused by the embedding cells
ptmodel, model = 'prose', 'prose_dlm'
task, file_base = 'acp', 'train'
pool = 'avg'
# Not passed to file_paths in this section
emb_layer = 'layer'


<br>

## Train Dataset

### ProSE DLM model - prose_dlm

- **Pooling Operation:  `avg`**

Call the function `fu.file_paths` to prepare the paths. The default root data folder is *../data*.

In [18]:
# Prepare paths
# file_paths is unpacked into three values: the first is kept as path_pt,
# the second is discarded, and the third is kept as path_fa (a .fa file in the output below).
path_pt, _, path_fa = fu.file_paths(ptmodel, task, file_base, model, pool)
# The leading '' makes print's default separator put a space before path_fa;
# the '\n' argument moves path_pt onto its own line.
print('', path_fa, '\n', path_pt)

 ../data/acp/train_data.fa 
 ../data/acp/prose/train/acp_train_dlm_avg


### Creating classifier path

In [60]:
# TODO: decide which format to use: joblib (.sav), PyTorch (.pt), Keras (.h5) or something else

# Short names of the classifiers; index 2 ('xgb') is used for this path
clf = ['rf', 'lr', 'xgb', 'dl', 'knn']

# Split the last component of path_pt on '_' and discard the second piece
tail = os.path.basename(path_pt).split('_')
del tail[1]

model_name = '_'.join(tail) + '_' + clf[2] + '.h5'
model_path = os.path.join('../saved_models', model_name)
model_path

'../saved_models/acp_dlm_avg_xgb.h5'

In [61]:
# Token alphabet: 25 single-letter symbols, written as one string and split into a list
toks = list('LAGVSERTIDPKQNFYMHWCXBUZO')
len(toks)

25

### Get the length and generate the sequence

In [77]:

import random
def seq_generator(n_toks, rng=None):
    """Return a random string of ``n_toks`` tokens drawn uniformly from the 25-letter alphabet.

    Parameters
    ----------
    n_toks : int
        Number of tokens to generate. Zero or a negative value yields ``''``.
    rng : random.Random, optional
        Random source to draw from. When omitted, the module-level ``random``
        generator is used, so ``random.seed(...)`` controls the result.

    Returns
    -------
    str
        The generated sequence, one character per token.
    """
    toks = ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O']
    source = random if rng is None else rng
    # Derive the highest index from the alphabet itself instead of hard-coding 24
    last = len(toks) - 1
    # Build the result with one join rather than re-creating the string on every step
    return ''.join(toks[source.randint(0, last)] for _ in range(n_toks))

In [86]:
# Number of tokens in each generated sequence
n_toks = 145
# Preview one random sequence; the result is only displayed, not stored
seq_generator(n_toks)

'CYASGYYRDKIFNKZZMEUXYEZETAVVTKDLEFQOBQNTOXULTHWHAMDEIQIVACZOKTWWBKXQSZFGZVDXOQHTFOQQSCBIBCYWAOPWVROFMKAQSZRVCGQDHKQCIEGGZBODHMVHOAMARPUHYGPEYXIAR'

In [98]:
# File names for the single-sequence experiment:
#   path_fa - FASTA file written below
#   path_h5 - output file passed to the ProSE embedding script via -o
#   path_pt - location passed to the ESM extract script and read back later
path_fa, path_h5, path_pt = 'in_sequence.fa', 'in_sequence.h5', 'in_sequence'

### Create fasta file

In [None]:
# One-entry mapping: FASTA header -> freshly generated random sequence
Seq = {'header': seq_generator(n_toks)}
Seq

{'header': 'FHUTKMNTRXQFMRWHQCSAQMNGPCCHWTUIDYYHVBLVOMDIURIDFLYBWVMYLKLNMLPOSHAFDBFEGXZCZEPAMNIOYBTKZZTXVXQBEIBIYKZCTMKHGFELCYMWAQBWPISIKKGBYAFFKMOZBGZCZDFQE'}

In [89]:
# Write every entry of Seq as a FASTA record: a '>' header line followed by the
# whole sequence on one line (no splitting into 60-character chunks is applied).
with open(path_fa, 'w') as fout:
    for header, sequence in Seq.items():
        record = f">{header}\n{sequence}\n"
        fout.write(record)

### ProSE

In [91]:
%%time
# Run the ProSE embedding script on the FASTA file.
# IPython substitutes {model}, {pool}, {path_h5} and {path_fa} with the current values
# of those notebook variables; -o names the file the script writes.
%run ../prose_main/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: in_sequence.h5
# embedding with pool=avg


CPU times: user 2.87 s, sys: 1.66 s, total: 4.53 s
Wall time: 5.96 s


                                                                                

### Embeddings from h5

In [95]:
# Explicit imports: neither `h5py` nor `np` is imported before this point in the
# notebook, so the cell only worked when those names were already in the namespace
# (presumably left there by the %run of the embedding script — confirm).
import h5py
import numpy as np

# Read each dataset in the embeddings file into a NumPy array. `n` keeps the array
# of the last key visited; the FASTA file written above contains a single record.
with h5py.File(path_h5, 'r') as hf:
    for key in hf.keys():
        n = np.array(hf.get(key))
n

array([ 0.03448276,  0.02068966,  0.02758621, ..., -0.19872151,
        0.01572483,  0.13779174], dtype=float32)

In [96]:
# Shape of the embedding array read from the .h5 file
n.shape

(6165,)

### ESM

In [97]:
import esm
import torch

In [99]:
# Path for extract.py
# Look the ESM checkout up inside torch.hub's cache directory instead of hard-coding
# one user's home folder. torch.hub.get_dir() honours TORCH_HOME / XDG_CACHE_HOME and
# falls back to ~/.cache/torch/hub.
esm_scripts_path = os.path.join(torch.hub.get_dir(), 'facebookresearch_esm_main', 'scripts')
extract = os.path.join(esm_scripts_path, 'extract.py')
extract

'/home/damir/.cache/torch/hub/facebookresearch_esm_main/scripts/extract.py'

In [100]:
# Switch the shared settings over to the ESM model
task = 'acp'
ptmodel = 'esm'
pool = 'mean'
# The three values below may be changed later in the notebook
emb_layer = 33
model = 'esm1v_t33_650M_UR90S_1'
file_base = 'train'

In [101]:
%%time
# Run the ESM extraction script found at `extract`.
# Positional arguments, in order: model name, input FASTA file, and path_pt
# (the folder the result is later read from). --repr_layers receives emb_layer
# and --include receives pool.
%run "{extract}" "{model}" "{path_fa}" "{path_pt}" --repr_layers "{emb_layer}" --include "{pool}" 



Transferred model to GPU
Read in_sequence.fa with 1 sequences
Processing 1 of 1 batches (1 sequences)
CPU times: user 8.25 s, sys: 9.45 s, total: 17.7 s
Wall time: 20.7 s


### Embeddings from .pt

In [109]:
# Load 'header.pt' from the path_pt folder and display the mean representation
# stored for layer `emb_layer` as a NumPy array.
file_pt = os.path.join(path_pt, 'header.pt')
loaded = torch.load(file_pt)
loaded['mean_representations'][emb_layer].numpy()


array([-0.1391747 , -0.08677424, -0.260222  , ..., -0.4417132 ,
       -0.15612462,  0.25370103], dtype=float32)

### Test from untitled_2

In [7]:
# File names for the single-sequence test
path_fa = 'in_sequence.fa'
path_h5 = 'in_sequence.h5'
path_pt = 'in_sequence'

# The ESM section above reassigned `model` and `pool`. Restore the ProSE values so that
# the embedding script in this section runs with the DLM model and average pooling,
# which is what its recorded output shows.
model = 'prose_dlm'
pool = 'avg'

In [8]:
# One fixed test sequence stored under the FASTA header 'header'
Seq = {'header': 'GVGDIFRKIVSTIKNVV'}
Seq

{'header': 'GVGDIFRKIVSTIKNVV'}

In [9]:
# Write each entry of Seq as a FASTA record: '>' + header on one line, sequence on the next
with open(path_fa, 'w') as fout:
    for header, sequence in Seq.items():
        record = f">{header}\n{sequence}\n"
        fout.write(record)

In [13]:
%%time
# Run the ProSE embedding script on the FASTA file; -o names the file it writes.
# NOTE(review): {model} and {pool} are taken from the notebook namespace. The recorded
# output shows the ProSE DLM model with pool=avg, but the ESM section earlier assigns
# model = 'esm1v_t33_650M_UR90S_1' and pool = 'mean' — confirm both hold the ProSE
# values ('prose_dlm', 'avg') when this cell runs.
%run ../prose_main/embed_sequences --model "{model}" --pool "{pool}" -o "{path_h5}" "{path_fa}"

# loading the pre-trained ProSE DLM model
# writing: in_sequence.h5
# embedding with pool=avg


CPU times: user 2.39 s, sys: 1.34 s, total: 3.73 s
Wall time: 6.24 s


                                                                                

In [31]:
import h5py
# `np` is used below but is not imported anywhere in this notebook, so import it
# explicitly instead of relying on a name left in the namespace by another cell.
import numpy as np

with h5py.File(path_h5, 'r') as hf:
    for key in hf.keys():
        # reshape(1, -1) turns the embedding into a single-row 2-D array,
        # which is the form handed to predict() further down
        n = np.array(hf.get(key)).reshape(1, -1)
n

array([[ 0.        ,  0.05882353,  0.05882353, ..., -0.06649267,
         0.03789606,  0.0419427 ]], dtype=float32)

In [32]:
# check what to use joblib(.sav), PyTorch (.pt), Keras(.h5) or something else

clf = ['rf', 'lr', 'xgb', 'dl', 'knn']

# `path_pt` holds 'in_sequence' in this section, so deriving the model name from it
# would give 'in_knn.sav'. Rebuild the training-embeddings path with the same argument
# values used in the "Train Dataset" section, so that the name matches the saved
# classifier (acp_dlm_avg_knn.sav).
train_path_pt, _, _ = fu.file_paths('prose', 'acp', 'train', 'prose_dlm', 'avg')

tail = os.path.split(train_path_pt)[1].split('_')
tail.pop(1)  # drop the 'train' piece of 'acp_train_dlm_avg'

# Index 4 selects 'knn'; the model was saved with the .sav extension
model_name = f'{"_".join(tail)}_{clf[4]}.sav'
model_path = os.path.join('../saved_models', model_name)
model_path

'../saved_models/acp_dlm_avg_knn.sav'

In [36]:
import joblib

In [37]:
# Load the saved classifier from model_path.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
knn_saved = joblib.load(model_path)
# Disabled check: X_test_s and y_test are not defined in the visible part of this notebook
#print(knn_saved.score(X_test_s, y_test))

In [38]:
# Predict the label for the single-row feature array built from the embedding
knn_saved.predict(n)

array([1])

In [39]:
# Display the feature row that was passed to predict()
n

array([[ 0.        ,  0.05882353,  0.05882353, ..., -0.06649267,
         0.03789606,  0.0419427 ]], dtype=float32)