In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from transformers import T5Tokenizer, T5Model

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import os
from tqdm import tqdm
import math
import re
import random
from termcolor import colored
import dataframe_image as dfi
import warnings
import wandb
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from IPython.display import Audio, display
def allDone():
    display(Audio(url='https://www.mediacollege.com/downloads/sound-effects/beep/beep-10.wav', autoplay=True))

embed_path = 'data_test'
result_path = 'predicted_results'
wt_mt_path = 'data_test/wt_mt'

In [11]:
protein_seq = pd.read_csv('./data/sequence_for_embedding_3w.csv')

In [12]:
# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))

In [13]:
protein_seq['label'].astype(str)
label_names = set(protein_seq['label'])
list(label_names)

[0, 1]

In [14]:
# protein_seq['label'].value_counts()

In [15]:
# protein_seq = protein_seq.head(30)

In [16]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
elif torch.has_mps:
    torch.cuda.manual_seed(2020)
    device = torch.device('mps')
    print('Device name: MPS')
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Device name: MPS


In [17]:
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
model = model.eval()

In [36]:
import pickle
def get_embedding(protein_seq, start=None, stop=None, save_path=embed_path, device=device):
    
    xs = []
    result = None
    count = 0
    embed_error_count = 0
    protein_seq = protein_seq[start:stop]
    data_len = len(protein_seq)

    for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):
        s_len = len(seq['wt_seq'].replace(" ",'')) + 1
        aa_index = seq['aa_index']
        label = seq['label']
        wt_aa = seq['wt']
        mt_aa = seq['mt']
        wt_seq = seq['wt_seq']
        mt_seq = seq['mt_seq']
        # AF_DB = seq['AlphaFoldDB']
        # PDB = seq['PDB']
        # pathogenicity = seq['pathogenicity']
        
        # add_special_tokens adds extra token at the end of each sequence
        # token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
        wt_token_encoding = tokenizer.batch_encode_plus([seq['wt_seq']], add_special_tokens=True, padding="longest")
        wt_input_ids      = torch.tensor(wt_token_encoding['input_ids']).to(device)
        wt_attention_mask = torch.tensor(wt_token_encoding['attention_mask']).to(device)
        
        mt_token_encoding = tokenizer.batch_encode_plus([seq['mt_seq']], add_special_tokens=True, padding="longest")
        mt_input_ids      = torch.tensor(mt_token_encoding['input_ids']).to(device)
        mt_attention_mask = torch.tensor(mt_token_encoding['attention_mask']).to(device)

        with torch.no_grad():
            # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
            wt_embedding_repr =model(wt_input_ids, attention_mask=wt_attention_mask)
            wt_emb = wt_embedding_repr.last_hidden_state[:, :s_len]
            wt_emb = wt_emb[:, aa_index, :]
            wt = wt_emb.detach().cpu().numpy().squeeze()
            
            mt_embedding_repr =model(mt_input_ids, attention_mask=mt_attention_mask)
            mt_emb = mt_embedding_repr.last_hidden_state[:, :s_len]
            mt_emb = mt_emb[:, aa_index, :]
            mt = mt_emb.detach().cpu().numpy().squeeze()

            # break
            # try:
            #     emb = emb[:, aa_index, :]
            # except:
            #     embed_error_count += 1
            #     print(f'embedding error: index: {index}, aa_index:{aa_index}, aa_length: {s_len} , error_count:{embed_error_count}')
                
            # print(aa_index)
            # x = emb.detach().cpu().numpy().squeeze()

            xs.append({'wt':wt.reshape(1,-1),'mt':mt.reshape(1,-1), 'label':label})
            
    # Save results
    if not os.path.isdir(f'{save_path}'):
        os.mkdir(f'{save_path}')
            
    if start is None:
        # result.to_csv(f'{save_path}/emb_({data_len}).csv', index=False)
        with open(f'./data_test/emb({data_len}).pkl', 'wb') as f:
            pickle.dump(xs, f)
    else:
        # result.to_csv(f'{save_path}/emb_{stop}.pkl.csv', index=False)
        with open(f'{save_path}/emb_{stop}.pkl', 'wb') as f:
            pickle.dump(xs, f)
    
# get_embedding(seq)  
    

In [26]:
protein_seq = pd.read_csv('./data/sequence_for_embedding_3w.csv')
# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['label'].astype(str)
label_names = set(protein_seq['label'])

In [27]:
protein_seq = protein_seq[protein_seq['Length']
                                 <=1000]

In [28]:
protein_seq = protein_seq[5:20]

In [29]:
protein_seq

Unnamed: 0,gene_id,aa_index,Length,wt,mt,wt_seq,mt_seq,label
45,NP_001108220.1,115,173,V,M,M S M S A N T M I F M I L G A S V V M A I A C ...,M S M S A N T M I F M I L G A S V V M A I A C ...,1
46,NP_001108220.1,109,173,G,S,M S M S A N T M I F M I L G A S V V M A I A C ...,M S M S A N T M I F M I L G A S V V M A I A C ...,0
47,NP_001108220.1,66,173,G,R,M S M S A N T M I F M I L G A S V V M A I A C ...,M S M S A N T M I F M I L G A S V V M A I A C ...,1
193,NP_068835.1,45,139,S,F,M E T N V F H L M L C V T S A R T H K S T S L ...,M E T N V F H L M L C V T S A R T H K S T S L ...,0
307,NP_006163.1,118,151,L,M,M S S F S T T T V S F L L L L A F Q L L G Q T ...,M S S F S T T T V S F L L L L A F Q L L G Q T ...,0
308,NP_006163.1,98,151,G,R,M S S F S T T T V S F L L L L A F Q L L G Q T ...,M S S F S T T T V S F L L L L A F Q L L G Q T ...,0
309,NP_006163.1,91,151,Q,R,M S S F S T T T V S F L L L L A F Q L L G Q T ...,M S S F S T T T V S F L L L L A F Q L L G Q T ...,0
310,NP_006163.1,64,151,S,R,M S S F S T T T V S F L L L L A F Q L L G Q T ...,M S S F S T T T V S F L L L L A F Q L L G Q T ...,0
311,NP_006163.1,28,151,M,T,M S S F S T T T V S F L L L L A F Q L L G Q T ...,M S S F S T T T V S F L L L L A F Q L L G Q T ...,0
312,NP_002512.1,94,134,V,F,M D P Q T A P S R A L L L L L F L H L A F L G ...,M D P Q T A P S R A L L L L L F L H L A F L G ...,0


In [37]:
def embed_in_batch(protein_seq, amount):
    value_input = amount
    data_len = len(protein_seq)
    fold = data_len // value_input
    remainder = data_len - data_len % value_input

    for i in range(fold):
        get_embedding(protein_seq,  start = i* value_input, stop = (i+1)*value_input)
    
    get_embedding(protein_seq, start = remainder, stop = data_len)

In [38]:
embed_in_batch(protein_seq,10)

100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.71it/s]
100%|█████████████████████████████████████████████| 5/5 [00:01<00:00,  3.72it/s]


In [59]:
import os
def data_for_downstream():
    path = os.getcwd() + '/data_test/'
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat += y
    data_y = []
    data_wt = []
    data_mt = []
    for i in range(len(concat)):
        data_wt.append(concat[i]['wt'][0])
        data_mt.append(concat[i]['mt'][0])
        data_y.append(int(concat[i]['label']))
    data_wt = np.array(data_wt)
    data_mt = np.array(data_mt)
    data_X = np.hstack((data_wt,data_mt))
    return data_X, data_y

In [65]:
path = os.getcwd() + '/data_test/'
concat = []
for pkl in os.listdir(path):
    if(".pkl" in pkl):
        file_path = path + pkl
        with open(file_path, 'rb') as file:
            y = pickle.load(file)
            concat += y
data_y = []
data_wt = []
data_mt = []
for i in range(len(concat)):
    data_wt.append(concat[i]['wt'][0])
    data_mt.append(concat[i]['mt'][0])
    data_y.append(int(concat[i]['label']))
data_wt = np.array(data_wt)
data_mt = np.array(data_mt)
data_X = np.hstack((data_wt,data_mt))

In [66]:
data_y

[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [67]:
data_X.shape

(15, 2048)

In [47]:
data_y = []
data_wt = []
data_mt = []
for i in range(len(y)):
    data_wt.append(y[i]['wt'][0])
    data_mt.append(y[i]['mt'][0])
    data_y.append(int(y[i]['label']))
data_wt = np.array(data_wt)
data_mt = np.array(data_mt)
data_X = np.hstack((data_wt,data_mt))

In [49]:
data_X.shape

(10, 2048)

In [68]:
from xgboost import XGBClassifier
# eval_s = [(X_train, y_train), (X_test, y_test)]
xgb = XGBClassifier()
xgb.fit(data_X, data_y)
y_xgb = xgb.predict(data_X)
accuracy = (y_xgb - data_y)



In [70]:
y_xgb

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Mt_Wt

In [19]:
def get_embedding(protein_seq, tokenizer= tokenizer, model = model, start=None, stop=None, input_type=None, device=device, save_path=embed_path, wt_mt_path = wt_mt_path ):
    tokenizer = tokenizer
    model = model

    xs = []
    result = None
    count = 0
    embed_error_count = 0
    protein_seq = protein_seq[start:stop]
    data_len = len(protein_seq)

    for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):

        s_len = len(seq['wt_seq'].replace(" ", '')) + 1
        aa_index = seq['aa_index']
        label = seq['label']
        wt_aa = seq['wt']
        mt_aa = seq['mt']
        wt_seq = seq['wt_seq']
        mt_seq = seq['mt_seq']
        # AF_DB = seq['AlphaFoldDB']
        # PDB = seq['PDB']
        # pathogenicity = seq['pathogenicity']

        if input_type is None:
            token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True,
                                                         padding="longest")
        else:
            if input_type == 'wt':
                input_seq = seq['wt_seq']
            elif input_type == 'mt':
                input_seq = seq['mt_seq']
            else:
                print("type can be either 'wt or 'mt'. ")
            # add_special_tokens adds extra token at the end of each sequence
            token_encoding = tokenizer.batch_encode_plus([input_seq], add_special_tokens=True,
                                                         padding="longest")
            

        input_ids = torch.tensor(token_encoding['input_ids']).to(device)
        # print('input_ids:', input_ids.shape)
        attention_mask = torch.tensor(
            token_encoding['attention_mask']).to(device)

        with torch.no_grad():
            # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
            embedding_repr = model(
                input_ids, attention_mask=attention_mask)
            emb = embedding_repr.last_hidden_state[:, :s_len]

            try:
                emb = emb[:, aa_index,:]
                
            except Exception as e:
                os.system('tput bel')
                print(e)
                embed_error_count += 1
                print(
                    f'embedding error: index: {index}, aa_index:{aa_index}, aa_length: {s_len} , error_count:{embed_error_count}')

            # print(aa_index)
            x = emb.detach().cpu().numpy().squeeze()
            xs.append({'x': x.reshape(1,-1), 'label':label})
            
    # Save results
    if input_type is None:
        # create a folder to save embeddings (large GPU)
        if not os.path.isdir(f'{save_path}'):
            os.mkdir(f'{save_path}')
        
        if start is None:
        # result.to_csv(f'{save_path}/sequence_embeddings({data_len}).csv', index=False)
            with open(f'./{save_path}/emb({data_len}).pkl', 'wb') as f:
                pickle.dump(xs, f)
        else:
            with open(f'./{save_path}/emb_{stop}.pkl', 'wb') as f:
                pickle.dump(xs, f)
    else:
        if start is None:
        # result.to_csv(f'{save_path}/sequence_embeddings({data_len}).csv', index=False)
            with open(f'./{save_path}/emb({data_len})_{input_type}.pkl', 'wb') as f:
                pickle.dump(xs, f)
        else:
            if not os.path.isdir(f'./{wt_mt_path}'):
                os.mkdir(f'./{wt_mt_path}')
        # result.to_csv(f'{save_path}/sequence_{stop}_embeddings.csv', index=False)
            with open(f'./{wt_mt_path}/{stop}_emb_{input_type}.pkl', 'wb') as f:
                pickle.dump(xs, f)
    

In [20]:
protein_seq = pd.read_csv('./data/sequence_for_embedding_3w.csv')
# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['label'].astype(str)
label_names = set(protein_seq['label'])

In [21]:
protein_seq = protein_seq[protein_seq['Length']
                                 <=200]

In [22]:
protein_seq = protein_seq[5:20]

In [23]:
def embed_in_batch(protein_seq, amount, input_type = None):
    value_input = amount
    data_len = len(protein_seq)
    fold = data_len // value_input
    remainder = data_len - data_len % value_input
    input_type = input_type

    for i in range(fold):
        get_embedding(protein_seq, input_type = input_type,  start = i* value_input, stop = (i+1)*value_input)
    
    get_embedding(protein_seq, input_type=input_type, start = remainder, stop = data_len)

In [24]:
# get_embedding(protein_seq,input_type = 'wt')

In [26]:
embed_in_batch(protein_seq,10, input_type='wt')

100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  7.64it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  9.21it/s]


In [40]:
with open('./data_test/wt_mt/emb_10_mt.pkl', 'rb') as file:
    y_10_mt = pickle.load(file)
with open('./data_test/wt_mt/emb_10_wt.pkl', 'rb') as file:
    y_10_wt = pickle.load(file)

In [41]:
data_mt = []
data_wt = []
for i in range(len(y_10_mt)):
    data_mt.append(y_10_mt[i]['x'][0])
for i in range(len(y_10_wt)):
    data_wt.append(y_10_wt[i]['x'][0])
data_mt = np.array(data_mt)
data_wt = np.array(data_wt)

In [42]:
data_wt.shape, data_mt.shape

((10, 1024), (10, 2048))

In [102]:
test = np.hstack((data_wt,data_mt))

In [103]:
test.shape

(10, 2048)

In [176]:
def data_for_downstream():
    path = os.getcwd() + '/data_test/wt_mt/'
    data_mt = []
    data_wt = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                data_mt += y
    data_X = []
    data_y = []
    for i in range(len(data_mt)):
        data_X.append(data_mt[i]['x'][0])
        data_y.append(int(data_mt[i]['label']))
    data_X = np.array(data_X)
    return data_X, data_y

In [21]:
def data_for_downstream():
    path = os.getcwd() + '/data_test/wt_mt/'
    concat_mt = []
    concat_wt = []
    concat = []
    for pkl in os.listdir(path):
        if("mt.pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat_mt += y
        if("wt.pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat_wt += y
        if("wt.pkl" not in pkl) & ("mt.pkl" not in pkl) & (".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat += y
    if len(concat) > 0:
        data_X = []
        data_y = []
        for i in range(len(concat)):
            data_X.append(concat[i]['x'][0])
            data_y.append(int(concat[i]['label']))
        data_X = np.array(data_X)
    
    else:
        data_y = []
        data_wt = []
        data_mt = []
        for i in range(len(concat_mt)):
            data_wt.append(concat_wt[i]['x'][0])
            data_mt.append(concat_mt[i]['x'][0])
            data_y.append(int(concat_mt[i]['label']))
        data_wt = np.array(data_wt)
        data_mt = np.array(data_mt)
        data_X = np.hstack((data_wt,data_mt))
    return data_X, data_y

In [22]:
data_X, data_y = data_for_downstream()

In [23]:
data_X.shape, len(data_y)

((15, 2048), 15)

In [27]:
test_path = os.getcwd() + '/data_test/wt_mt/'
concat_mt = []
files = os.listdir(test_path)
files.sort()
for pkl in files:
    if("mt.pkl" in pkl):
        print('mt:', pkl)
        concat_mt += pkl
    # if("wt.pkl" in pkl):
    #     print('wt:', pkl)
    #     concat_wt += pkl
    # if("wt.pkl" not in pkl) & ("mt.pkl" not in pkl) & (".pkl" in pkl):
    #     print('pkl file:', pkl)

mt: 10_emb_mt.pkl
mt: 15_emb_mt.pkl


In [9]:
import os
files = os.listdir('./data_test/wt_mt/').sort()
files

In [11]:
print(files)

None


In [28]:
test_path = os.getcwd() + '/data_test/wt_mt/'
concat_wt = []
for pkl in os.listdir(test_path):
    if("wt.pkl" in pkl):
        print('wt:', pkl)
        concat_wt += pkl


wt: 15_emb_wt.pkl
wt: 10_emb_wt.pkl


In [29]:
''.join(concat_mt)

'10_emb_mt.pkl15_emb_mt.pkl'

In [30]:
''.join(concat_wt)

'15_emb_wt.pkl10_emb_wt.pkl'

In [26]:
from xgboost import XGBClassifier
# eval_s = [(X_train, y_train), (X_test, y_test)]
xgb = XGBClassifier()
xgb.fit(data_X, data_y)
y_xgb = xgb.predict(data_X)
accuracy = (y_xgb - data_y)



In [27]:
accuracy

array([ 0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0])

In [28]:
data_y

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]

In [29]:
y_xgb

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])