In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import T5Tokenizer, T5Model

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

import os
from tqdm import tqdm
import math
import pickle
import re
import random
from termcolor import colored
import dataframe_image as dfi
import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from IPython.display import Audio, display
def allDone():
    display(Audio(url='https://www.mediacollege.com/downloads/sound-effects/beep/beep-10.wav', autoplay=True))

embed_path = '../data/imbalance_same_seq/Embedding_results/model_1_embeds'
result_path = 'predicted_results'
wt_mt_path = 'data_test/wt_mt'

In [60]:
import wandb

In [61]:
wandb.init(project='wandb_on_ucl_server')

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# 数据预处理

In [2]:
protein_seq = pd.read_csv('../data/mode1_for_embed.csv')
# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))

In [3]:
protein_seq['label'].astype(str)

0        0
1        0
2        0
3        0
4        0
        ..
27745    0
27746    0
27747    1
27748    1
27749    1
Name: label, Length: 27750, dtype: object

In [4]:
protein_seq['label'].value_counts()

0    20655
1     7095
Name: label, dtype: int64

In [5]:
label_names = set(protein_seq['label'])

In [6]:
label_names

{0, 1}

## Embedding

In [7]:
# config
# max_seq_len = 380
batch_size = 16
SEED = 2022

embed_path = '../data'
result_path = './predicted_results'

#使用GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
elif torch.has_mps:
    torch.cuda.manual_seed(SEED)
    device = torch.device('mps')
    print('Device name: MPS')
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


Device name: MPS


In [8]:
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
model = model.eval()

In [9]:
# for name, param in model.named_parameters():
#     print(name)
# print(model.encoder.block[23], model.encoder.final_layer_norm)

In [10]:
# print(model)

In [110]:
protein_test = protein_seq[:4]

In [111]:
protein_test

Unnamed: 0.1,Unnamed: 0,gene_id,aa_index,Length,wt,mt,wt_seq,mt_seq,label
0,0,NP_689699.2,56,681,G,S,M S K G I L Q V H P P I C D C P G C R I S S P ...,M S K G I L Q V H P P I C D C P G C R I S S P ...,0
1,1,NP_689699.2,665,681,G,A,M S K G I L Q V H P P I C D C P G C R I S S P ...,M S K G I L Q V H P P I C D C P G C R I S S P ...,0
2,2,NP_056473.3,203,749,A,V,M A A A G S R K R R L A E L T V D E F L A S G ...,M A A A G S R K R R L A E L T V D E F L A S G ...,0
3,6,NP_001354481.1,358,623,G,D,M G N S H C V P Q A P R R L R A S F S R K P S ...,M G N S H C V P Q A P R R L R A S F S R K P S ...,0


In [113]:
# soft gate

# tokenizer = config.tokenizer
# model = config.model

aa_emb = []
seq_emb = []
result = None
count = 0
embed_error_count = 0
# protein_seq = protein_seq[start:stop]
data_len = len(protein_test)

for index, seq in tqdm(protein_test.iterrows(),total=protein_test.shape[0]):
    s_len = len(seq['wt_seq'].replace(" ",'')) + 1
    aa_index = seq['aa_index']
    label = seq['label']
    wt_aa = seq['wt']
    mt_aa = seq['mt']
    wt_seq = seq['wt_seq']
    mt_seq = seq['mt_seq']
    print(aa_index)
    # AF_DB = seq['AlphaFoldDB']
    # PDB = seq['PDB']
    # pathogenicity = seq['pathogenicity']

    # add_special_tokens adds extra token at the end of each sequence
    # token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
    wt_token_encoding = tokenizer.batch_encode_plus([seq['wt_seq']], add_special_tokens=True, padding="longest")
    wt_input_ids      = torch.tensor(wt_token_encoding['input_ids']).to(device)
    # print(wt_input_ids)
    wt_attention_mask = torch.tensor(wt_token_encoding['attention_mask']).to(device)

    mt_token_encoding = tokenizer.batch_encode_plus([seq['mt_seq']], add_special_tokens=True, padding="longest")
    mt_input_ids      = torch.tensor(mt_token_encoding['input_ids']).to(device)
    mt_attention_mask = torch.tensor(mt_token_encoding['attention_mask']).to(device)

    with torch.no_grad():
        # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
        wt_embedding_repr =model(wt_input_ids, attention_mask=wt_attention_mask)
        wt_emb = wt_embedding_repr.last_hidden_state[:, :s_len]
        print('wt_emb: ', wt_emb.shape)
        wt_seq = wt_emb.clone()
        wt_aa = wt_emb[:, aa_index, :]
        print('wt_aa shape: ', wt_aa.shape)
        wt = wt_aa.detach().cpu().numpy().squeeze()

        mt_embedding_repr =model(mt_input_ids, attention_mask=mt_attention_mask)
        mt_emb = mt_embedding_repr.last_hidden_state[:, :s_len]
        mt_seq = mt_emb.clone()
        mt_aa = mt_emb[:, aa_index, :]
        mt = mt_aa.detach().cpu().numpy().squeeze()

        aa_emb.append({'wt':wt.reshape(1,-1),'mt':mt.reshape(1,-1), 'label':label})
        seq_emb.append({'wt_seq':wt_seq.reshape(1,-1),'mt_seq':mt_seq.reshape(1,-1), 'label':label})

# Save results
#     if not os.path.isdir(f'{save_path}'):
#         os.mkdir(f'{save_path}')

#     if start is None:
#         # result.to_csv(f'{save_path}/emb_({data_len}).csv', index=False)
#         with open(f'{save_path}/emb({data_len}).pkl', 'wb') as f:
#             pickle.dump(xs, f)
#     else:
#         # result.to_csv(f'{save_path}/emb_{stop}.pkl.csv', index=False)
#         with open(f'{save_path}/emb_{stop}.pkl', 'wb') as f:
#             pickle.dump(xs, f)
    



  0%|                                                     | 0/4 [00:00<?, ?it/s]

56
wt_emb:  torch.Size([1, 682, 1024])
wt_aa shape:  torch.Size([1, 1024])


 25%|███████████▎                                 | 1/4 [00:01<00:03,  1.29s/it]

665
wt_emb:  torch.Size([1, 682, 1024])
wt_aa shape:  torch.Size([1, 1024])


 50%|██████████████████████▌                      | 2/4 [00:02<00:02,  1.25s/it]

203
wt_emb:  torch.Size([1, 750, 1024])
wt_aa shape:  torch.Size([1, 1024])


 75%|█████████████████████████████████▊           | 3/4 [00:03<00:01,  1.28s/it]

358
wt_emb:  torch.Size([1, 624, 1024])
wt_aa shape:  torch.Size([1, 1024])


100%|█████████████████████████████████████████████| 4/4 [00:05<00:00,  1.26s/it]


In [27]:
wt.shape

(1024,)

In [55]:
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)

In [114]:
protein_seq = pd.read_csv('../data/mode1_for_embed.csv')
# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
neutral_df = protein_seq[protein_seq['label']==0]
pathogenic_df = protein_seq[protein_seq['label']==1]

In [27]:
neutral_df

Unnamed: 0.1,Unnamed: 0,gene_id,aa_index,Length,wt,mt,wt_seq,mt_seq,label
0,0,NP_689699.2,56,681,G,S,M S K G I L Q V H P P I C D C P G C R I S S P ...,M S K G I L Q V H P P I C D C P G C R I S S P ...,0
1,1,NP_689699.2,665,681,G,A,M S K G I L Q V H P P I C D C P G C R I S S P ...,M S K G I L Q V H P P I C D C P G C R I S S P ...,0
2,2,NP_056473.3,203,749,A,V,M A A A G S R K R R L A E L T V D E F L A S G ...,M A A A G S R K R R L A E L T V D E F L A S G ...,0
3,6,NP_001354481.1,358,623,G,D,M G N S H C V P Q A P R R L R A S F S R K P S ...,M G N S H C V P Q A P R R L R A S F S R K P S ...,0
4,7,NP_001354481.1,494,623,A,T,M G N S H C V P Q A P R R L R A S F S R K P S ...,M G N S H C V P Q A P R R L R A S F S R K P S ...,0
...,...,...,...,...,...,...,...,...,...
27693,72408,NP_000123.1,795,2351,R,G,M Q I E L S T C F F L C L L R F C F S A T R R ...,M Q I E L S T C F F L C L L R F C F S A T R R ...,0
27714,72429,NP_000123.1,503,2351,R,H,M Q I E L S T C F F L C L L R F C F S A T R R ...,M Q I E L S T C F F L C L L R F C F S A T R R ...,0
27717,72432,NP_000123.1,463,2351,H,Y,M Q I E L S T C F F L C L L R F C F S A T R R ...,M Q I E L S T C F F L C L L R F C F S A T R R ...,0
27745,72463,NP_060666.1,235,421,T,N,M W Y H R L S H L H S R L Q D L L K G G V I Y ...,M W Y H R L S H L H S R L Q D L L K G G V I Y ...,0


In [122]:
pathogenic_df[pathogenic_df['gene_id'] == 'NP_000178.2']

Unnamed: 0.1,Unnamed: 0,gene_id,aa_index,Length,wt,mt,wt_seq,mt_seq,label
5156,16301,NP_000178.2,401,445,E,Q,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5158,16303,NP_000178.2,368,445,M,V,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5159,16304,NP_000178.2,360,445,G,R,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5162,16307,NP_000178.2,300,445,V,G,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5163,16308,NP_000178.2,270,445,G,R,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5164,16309,NP_000178.2,230,445,P,S,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5165,16310,NP_000178.2,225,445,R,H,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5166,16311,NP_000178.2,161,445,G,R,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5167,16312,NP_000178.2,122,445,A,V,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1
5168,16313,NP_000178.2,120,445,C,W,M A E L K Y I S G F G N E C S S E D P R C P G ...,M A E L K Y I S G F G N E C S S E D P R C P G ...,1


In [29]:


# tokenizer = config.tokenizer
# model = config.model

aa_emb = []
seq_emb = []
result = None
count = 0
embed_error_count = 0
# protein_seq = protein_seq[start:stop]
data_len = len(protein_test)

for index, seq in tqdm(protein_test.iterrows(),total=protein_test.shape[0]):
    s_len = len(seq['wt_seq'].replace(" ",'')) + 1
    aa_index = seq['aa_index']
    label = seq['label']
    wt_aa = seq['wt']
    mt_aa = seq['mt']
    wt_seq = seq['wt_seq']
    mt_seq = seq['mt_seq']
    gene_id = seq['gene_id']
    # AF_DB = seq['AlphaFoldDB']
    # PDB = seq['PDB']
    # pathogenicity = seq['pathogenicity']

    # add_special_tokens adds extra token at the end of each sequence
    # token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
    wt_token_encoding = tokenizer.batch_encode_plus([seq['wt_seq']], add_special_tokens=True, padding="longest")
    wt_input_ids      = torch.tensor(wt_token_encoding['input_ids']).to(device)
    wt_attention_mask = torch.tensor(wt_token_encoding['attention_mask']).to(device)

    mt_token_encoding = tokenizer.batch_encode_plus([seq['mt_seq']], add_special_tokens=True, padding="longest")
    mt_input_ids      = torch.tensor(mt_token_encoding['input_ids']).to(device)
    mt_attention_mask = torch.tensor(mt_token_encoding['attention_mask']).to(device)

    with torch.no_grad():
        # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
        wt_embedding_repr =model(wt_input_ids, attention_mask=wt_attention_mask)
        wt_emb = wt_embedding_repr.last_hidden_state[:, :s_len]
        wt_seq = wt_emb.detach().cpu().numpy()
        print(wt_seq.shape)


        mt_embedding_repr =model(mt_input_ids, attention_mask=mt_attention_mask)
        mt_emb = mt_embedding_repr.last_hidden_state[:, :s_len]
        mt_seq = mt_emb.detach().cpu().numpy()
       
        embedding_dict = {'wt_seq':wt_seq,
                         'mt_seq':mt_seq, 
                         'aa_index':aa_index
                        }
        seq_emb.append(embedding_dict)
        # aa_emb.append({'wt':wt.reshape(1,-1),'mt':mt.reshape(1,-1), 'label':label})
        # seq_emb.append({'wt_seq':wt_seq,'mt_seq':mt_seq, 'aa_index':aa_index})
        
#         filename = '../t5_embeds/neutral/'+ gene_id + '_' + wt_aa + str(aa_index) + mt_aa + '.pkl'
        
#         with open(filename, 'wb') as filehandle:
#             pickle.dump(embedding_dict, filehandle)
            
# Save results
#     if not os.path.isdir(f'{save_path}'):
#         os.mkdir(f'{save_path}')

#     if start is None:
#         # result.to_csv(f'{save_path}/emb_({data_len}).csv', index=False)
#         with open(f'{save_path}/emb({data_len}).pkl', 'wb') as f:
#             pickle.dump(xs, f)
#     else:
#         # result.to_csv(f'{save_path}/emb_{stop}.pkl.csv', index=False)
#         with open(f'{save_path}/emb_{stop}.pkl', 'wb') as f:
#             pickle.dump(xs, f)
    



  0%|                                                     | 0/3 [00:00<?, ?it/s]

(1, 682, 1024)


 33%|███████████████                              | 1/3 [00:01<00:02,  1.34s/it]

(1, 682, 1024)


 67%|██████████████████████████████               | 2/3 [00:02<00:01,  1.26s/it]

(1, 750, 1024)


100%|█████████████████████████████████████████████| 3/3 [00:03<00:00,  1.30s/it]


In [37]:
seq_emb[0]['wt_seq'][:,aa_index,:].shape

(1, 1024)

In [32]:
seq_emb[0]['aa_index']

56

In [None]:
def pickler(embedding_dict, output_dir, chain_id):
    filename = output_dir + chain_id + '.pickle'
    with open(filename, 'wb') as filehandle:
        pickle.dump(embedding_dict, filehandle)

In [28]:
embed_path = '../t5_embeds/neutral'

In [46]:
path = embed_path + '/'
concat = []
for pkl in os.listdir(path):
    if(".pkl" in pkl):
        file_path = path + pkl
        with open(file_path, 'rb') as file:
            y = pickle.load(file)
            concat.append(y)

In [48]:
concat[0]

{'wt_seq': array([[[ 0.01025558, -0.10217149,  0.02013365, ...,  0.01055854,
           0.2522351 , -0.02920763],
         [ 0.07315456,  0.04703049,  0.09239218, ..., -0.09874517,
           0.13134111, -0.22873077],
         [ 0.13563967,  0.00838286, -0.01346737, ..., -0.02939852,
           0.11954609, -0.37139788],
         ...,
         [-0.20492937, -0.07644842, -0.14209639, ..., -0.01432389,
          -0.01909379,  0.05864273],
         [ 0.02184598, -0.2073565 ,  0.14557888, ...,  0.01770801,
          -0.04518939,  0.01948687],
         [ 0.06724052, -0.05059998,  0.09423537, ..., -0.07000535,
           0.03879933,  0.00983902]]], dtype=float32),
 'mt_seq': array([[[ 0.01401372, -0.10365199,  0.01554876, ...,  0.00934867,
           0.250978  , -0.02200917],
         [ 0.07283772,  0.04388886,  0.08892835, ..., -0.09372276,
           0.1330796 , -0.22846796],
         [ 0.13466269,  0.00707363, -0.0135285 , ..., -0.02254376,
           0.12361645, -0.36959693],
         ...

In [None]:
def data_for_downstream(embed_path):
    path = embed_path + '/'
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat.append(y)

In [None]:
def data_for_downstream(embed_path):
    path = embed_path + '/'
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat.append(y)
    data_y = []
    data_X = []
    for i in range(len(concat)):
        data_X.append(concat[i]['x'][0])
        data_y.append(int(concat[i]['label']))
    data_X = np.array(data_X)
    return data_X, data_y

In [None]:
def unpickler(dir, chain_id):
    if '.pickle' not in chain_id:
        chain_id += '.pickle'
    with open(dir + chain_id, 'rb') as filehandle:
        return pickle.load(filehandle)

In [33]:
def pickler(embedding_dict, output_dir, chain_id):
    filename = output_dir + chain_id + '.pickle'
    with open(filename, 'wb') as filehandle:
        pickle.dump(embedding_dict, filehandle)


def unpickler(dir, chain_id):
    if '.pickle' not in chain_id:
        chain_id += '.pickle'
    with open(dir + chain_id, 'rb') as filehandle:
        return pickle.load(filehandle)

In [34]:
protein_seq = pd.read_csv('../data/mode1_for_embed.csv')
protein_test = protein_seq[:3]

In [35]:
model = HuggingT5(model, tokenizer)

In [36]:
wt_seq, mt_seq, wt_emb, mt_emb = model(protein_test)

In [None]:

embedder = HuggingT5()
df = pd.read_csv('example_df.csv')
output_dir = 't5_sequence_embeddings/'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
try:
    # if running as an array job process one row based on integer passed as argument then quit
    args = sys.argv
    row_index = int(args[1]) - 1
except:
    # if not in array-job mode iterate through whole dataset
    for i, row in df.iterrows():
        wt_seq = row['wt_seq']
        mt_seq = row['mt_seq']
        
        wt_seq, mt_seq, wt_emb, mt_emb, label = embedder(df)

        
        embed_dict = {
            'wt_sequence': wt_seq,
            'wt_embedding': wt_emb,
            'mt_sequence': mt_seq,
            'mt_embedding':mt_emb,
            'label':label
            
        }
        mt_embed_dict = {
            'mt_sequence': mt_seq,
            'mt_embedding':mt_emb
        }
        
        
        pickler(embed_dict, output_dir, chain_id)

In [33]:
# get attention sum 

xs = []
result = None
count = 0
embed_error_count = 0
# protein_seq = protein_seq[start:stop]
data_len = len(protein_seq)

for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):
    s_len = len(seq['wt_seq'].replace(" ",'')) + 1
    aa_index = seq['aa_index']
    print(aa_index)
    label = seq['label']
    wt_aa = seq['wt_aa']
    mt_aa = seq['mt_aa']
    wt_seq = seq['wt_seq'].replace(" ",'')
    mt_seq = seq['mt_seq'].replace(" ",'')
    # AF_DB = seq['AlphaFoldDB']
    # PDB = seq['PDB']
    # pathogenicity = seq['pathogenicity']

    # add_special_tokens adds extra token at the end of each sequence
    # token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
    wt_token_encoding = tokenizer.batch_encode_plus([seq['wt_seq']], add_special_tokens=True, padding="longest")
    wt_input_ids      = torch.tensor(wt_token_encoding['input_ids']).to(device)
    wt_attention_mask = torch.tensor(wt_token_encoding['attention_mask']).to(device)
    
    mt_token_encoding = tokenizer.batch_encode_plus([seq['mt_seq']], add_special_tokens=True, padding="longest")
    mt_input_ids      = torch.tensor(mt_token_encoding['input_ids']).to(device)
    mt_attention_mask = torch.tensor(mt_token_encoding['attention_mask']).to(device)

    with torch.no_grad():
        # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
        wt_embedding_repr = model(wt_input_ids, attention_mask=wt_attention_mask, output_attentions = True)
        mt_embedding_repr = model(mt_input_ids, attention_mask=mt_attention_mask, output_attentions = True)
        # _,attention = embedding_repr
        # print(embedding_repr.attentions)
        wt_emb = wt_embedding_repr.attentions[23]
        wt_emb = torch.sum(wt_emb, dim = 1)
        wt_emb = wt_emb.squeeze(0)[:,aa_index]
        # wt_emb = F.normalize(wt_emb, p=2, dim=0)
        wt_emb = wt_emb.detach().cpu().numpy()
        
        mt_emb = mt_embedding_repr.attentions[23]
        mt_emb = torch.sum(mt_emb, dim = 1)
        mt_emb = mt_emb.squeeze(0)[:,aa_index]
        # mt_emb = F.normalize(mt_emb, p=2, dim=0)
        mt_emb = mt_emb.detach().cpu().numpy()
        
        # emb = embedding_repr.last_hidden_state
        # print('last_hidden_state shape: ', emb.shape)
        

        # emb = embedding_repr.last_hidden_state[:, :s_len]
        # emb = emb[:, aa_index, :]

  0%|                                                     | 0/3 [00:00<?, ?it/s]

419


 33%|███████████████                              | 1/3 [00:00<00:01,  1.00it/s]

170


 67%|██████████████████████████████               | 2/3 [00:01<00:00,  1.74it/s]

115


100%|█████████████████████████████████████████████| 3/3 [00:01<00:00,  1.97it/s]


In [34]:
# test = wt_embedding_repr.attentions[23]
# test = torch.sum(test, dim = 1)
# test = test.squeeze(0)[:,0]
# test_norm = F.normalize(test, p=2, dim=0)
# # test = test[:,0,:]

In [37]:
print(wt_emb)

[0.1160432  0.12775989 0.11045069 0.14577973 0.10858408 0.11537667
 0.10975821 0.11394133 0.1444046  0.1745289  0.15099506 0.1781297
 0.17844674 0.1390548  0.17012155 0.17209208 0.17086565 0.17177331
 0.14033851 0.11473133 0.2172845  0.1175368  0.10900705 0.15641291
 0.0896409  0.15674853 0.17854357 0.15464589 0.14511415 0.22579952
 0.17321773 0.14685002 0.111662   0.15953174 0.08071574 0.12249468
 0.12150023 0.15322736 0.09856017 0.1250483  0.10344736 0.13067184
 0.08169708 0.1225986  0.11090991 0.14295068 0.10121445 0.10842209
 0.12657428 0.09247285 0.12581694 0.10699578 0.17590865 0.16513535
 0.11665752 0.10353215 0.11359414 0.11031148 0.10424143 0.16148742
 0.13407706 0.11382663 0.15381092 0.12532783 0.13636453 0.14971527
 0.20012227 0.10632914 0.1322632  0.15494633 0.17363763 0.22133395
 0.16021797 0.1598051  0.18786612 0.17880848 0.16776389 0.18553206
 0.19663599 0.16966838 0.20723915 0.16902375 0.194119   0.19202387
 0.25765848 0.18897063 0.22890988 0.25716606 0.27684712 0.25032

wandb: Network error (ConnectionError), entering retry loop.


In [31]:
cos_sim = F.cosine_similarity(wt_emb, mt_emb, dim=0)
print(cos_sim) 

TypeError: cosine_similarity(): argument 'x1' (position 1) must be Tensor, not numpy.ndarray

In [36]:
from scipy import spatial
wt = wt_emb
mt = mt_emb
cos_sim = 1 - spatial.distance.cosine(wt, mt)
print(cos_sim)

0.9995120167732239


In [192]:
test = emb.squeeze(0)

In [None]:
test.shape

torch.Size([682, 682])

In [None]:
test[:,0]

torch.Size([682])

In [17]:
xs = []
result = None
count = 0
embed_error_count = 0
# protein_seq = protein_seq[start:stop]
data_len = len(protein_seq)

for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):
    s_len = len(seq['wt_seq'].replace(" ",'')) + 1
    aa_index = seq['aa_index']
    label = seq['label']
    wt_aa = seq['wt_aa']
    mt_aa = seq['mt_aa']
    wt_seq = seq['wt_seq'].replace(" ",'')
    mt_seq = seq['mt_seq'].replace(" ",'')
    # AF_DB = seq['AlphaFoldDB']
    # PDB = seq['PDB']
    # pathogenicity = seq['pathogenicity']

    # add_special_tokens adds extra token at the end of each sequence
    token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
    input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
    attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

    with torch.no_grad():
        # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
        embedding_repr = model(input_ids, attention_mask=attention_mask)
        emb = embedding_repr.last_hidden_state[:, :s_len]
        emb = emb[:, aa_index, :]

100%|███████████████████████████████████████████| 10/10 [00:09<00:00,  1.10it/s]


In [18]:
aa_index

164

In [19]:
emb

array([[-0.05453132, -0.10480957,  0.03223428, ...,  0.0036165 ,
         0.07656325, -0.2850541 ],
       [-0.11150735, -0.12776607, -0.00843289, ...,  0.006739  ,
         0.05970911, -0.1674745 ]], dtype=float32)

In [75]:
cos_sim = F.cosine_similarity(wt_emb, mt_emb, dim=0)
print(cos_sim) 

tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000], device='mps:0')


In [14]:
protein_test = protein_seq[:3]

In [30]:
import pickle
def get_embedding(protein_seq, start=None, stop=None, save_path=embed_path):
    
    xs = []
    result = None
    count = 0
    embed_error_count = 0
    protein_seq = protein_seq[start:stop]
    data_len = len(protein_seq)

    for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):
        s_len = len(seq['wt_seq'].replace(" ",'')) + 1
        aa_index = seq['aa_index']
        label = seq['label']
        wt_aa = seq['wt_aa']
        mt_aa = seq['mt_aa']
        wt_seq = seq['wt_seq'].replace(" ",'')
        mt_seq = seq['mt_seq'].replace(" ",'')
        # AF_DB = seq['AlphaFoldDB']
        # PDB = seq['PDB']
        # pathogenicity = seq['pathogenicity']
        
        # add_special_tokens adds extra token at the end of each sequence
        token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
        input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

        with torch.no_grad():
            # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
            embedding_repr = model(input_ids, attention_mask=attention_mask)
            emb = embedding_repr.last_hidden_state[:, :s_len]
            # emb = emb[:, aa_index, :]
            try:
                emb = emb[:, aa_index, :]
            except:
                embed_error_count += 1
                print(f'embedding error: index: {index}, aa_index:{aa_index}, aa_length: {s_len} , error_count:{embed_error_count}')
                
            # print(aa_index)
            x = emb.detach().cpu().numpy().squeeze()
            # print(x.shape)
            # print(x[0, :].tolist())
            # print(x.shape)
           
            temp = pd.DataFrame({
                'label':label,
                'mutant_index': aa_index,
                'wt_aa': wt_aa,
                't_aa': mt_aa,
                'wt_seq': wt_seq,
                'mt_seq': mt_seq,
                'wt_emb': [x[0, :].tolist()],
                'mt_emb':[x[1,:].tolist()]
                # 'AF_DB': AF_DB,
                # 'PDB_ID': PDB
            })
            
            if result is None:
                result=temp
            else:
                result = pd.concat([result,temp])

            xs.append({'x':x.reshape(1,-1),'label':label})

    # Save results
    if not os.path.isdir(f'{save_path}'):
        os.mkdir(f'{save_path}')
            
    if start is None:
        result.to_csv(f'{save_path}/sequence_embeddings({data_len}).csv', index=False)
        with open(f'{save_path}/emb({data_len}).pkl', 'wb') as f:
            pickle.dump(xs, f)
    else:
        result.to_csv(f'{save_path}/sequence_{stop}_embeddings.csv', index=False)
        with open(f'{save_path}/emb_{stop}.pkl', 'wb') as f:
            pickle.dump(xs, f)
    
# get_embedding(seq)  
    

In [49]:
embed_path = '../data'
df = get_embedding(protein_test, save_path = embed_path).reset_index()

 33%|███████████████                              | 1/3 [00:01<00:02,  1.30s/it]

(2, 1024)


 67%|██████████████████████████████               | 2/3 [00:02<00:01,  1.25s/it]

(2, 1024)


100%|█████████████████████████████████████████████| 3/3 [00:03<00:00,  1.30s/it]

(2, 1024)





In [56]:
type(df['wt_emb'][0])

list

In [43]:
df

Unnamed: 0,label,mutant_index,wt_aa,t_aa,wt_seq,mt_seq,wt_emb,mt_emb
0,0,56,G,S,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,"[-0.051906075328588486, 0.030050912871956825, ...","[-0.05973760783672333, 0.01039073709398508, 0...."
0,0,665,G,A,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,MSKGILQVHPPICDCPGCRISSPVNRGRLADKRTVALPAARNLKKE...,"[0.09436676651239395, 0.10468544811010361, -0....","[0.015477344393730164, 0.10830564796924591, -0..."
0,0,203,A,V,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,MAAAGSRKRRLAELTVDEFLASGFDSESESESENSPQAETREAREA...,"[0.07445942610502243, -0.0006038720021024346, ...","[0.0457751490175724, 0.019939227029681206, -0...."


In [205]:
test_df = pd.read_csv('../data/sequence_embeddings(5).csv')

In [206]:
test_df

Unnamed: 0,label,mutant_index,wt_aa,t_aa,wt_seq,mt_seq,wt_emb,mt_emb
0,0,235,T,N,MWYHRLSHLHSRLQDLLKGGVIYPALPQPNFKSLLPLAVHWHHTAS...,MWYHRLSHLHSRLQDLLKGGVIYPALPQPNFKSLLPLAVHWHHTAS...,"[0.47407856583595276, -0.004357014782726765, 0...","[0.49423736333847046, -0.00948814395815134, 0...."
1,0,365,R,H,MGLGRCIWEGWTLESEALRRDMGTWLLACICICTCVCLGVSVTGEG...,MGLGRCIWEGWTLESEALRRDMGTWLLACICICTCVCLGVSVTGEG...,"[0.10706128925085068, -0.019705910235643387, -...","[0.11566520482301712, -0.03639891371130943, 0...."
2,1,127,Y,C,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,"[-0.18334175646305084, 0.42542409896850586, -0...","[-0.21863877773284912, 0.39868706464767456, 0...."
3,1,76,R,L,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,"[0.09536199271678925, 0.08094870299100876, 0.1...","[0.051270924508571625, 0.04820641875267029, 0...."
4,1,68,I,T,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...,"[0.08919508010149002, -0.23834393918514252, -0...","[0.12021289020776749, -0.27279385924339294, -0..."


In [58]:
str_list = []
str_list.append(test_df['wt_emb'][0].strip('[').strip(']').split(', '))

NameError: name 'test_df' is not defined

In [57]:
a = []
for j in range(len(test_df)):
    wt_list = []
    mt_list = []
    wt_list.append(test_df['wt_emb'][j].strip('[').strip(']').split(', '))
    mt_list.append(test_df['mt_emb'][j].strip('[').strip(']').split(', '))
    wt_float = [float(i) for i in wt_list[0]]
    mt_float = [float(i) for i in mt_list[0]]
    stack = np.hstack((wt_float,mt_float))
    a.append(stack)
arr = np.array(a)
arr = np.concatenate((arr, np.array(test_df['label']).reshape(-1,1)), axis = 1)
arr.shape

NameError: name 'test_df' is not defined

In [222]:
arr

array([[ 4.74078566e-01, -4.35701478e-03,  1.76675133e-02, ...,
        -1.14383489e-01, -4.36267525e-01,  0.00000000e+00],
       [ 1.07061289e-01, -1.97059102e-02, -2.84917559e-02, ...,
         7.81306550e-02,  2.55043417e-01,  0.00000000e+00],
       [-1.83341756e-01,  4.25424099e-01, -1.85506679e-02, ...,
        -1.51874289e-01,  1.37499069e-06,  1.00000000e+00],
       [ 9.53619927e-02,  8.09487030e-02,  1.46531582e-01, ...,
         1.78747680e-02,  6.12068363e-03,  1.00000000e+00],
       [ 8.91950801e-02, -2.38343939e-01, -1.26406759e-01, ...,
         4.66004968e-01, -2.24090338e-01,  1.00000000e+00]])

In [10]:
embed_path = '../data/imbalance_same_seq/Embedding_results_csv/mode_1_embeds'
data_X, data_y = data_for_downstream()

In [13]:
data_X[0]

array([-0.05190601,  0.0300509 ,  0.18701199, ...,  0.11976019,
       -0.10207947, -0.08150594], dtype=float32)

In [118]:
df = pd.read_csv('../data/imbalance_same_seq/Embedding_results_csv/model_1_embeds/sequence_embeddings(27750).csv')

In [124]:
df['wt_emb'][4]

'[ 0.04528007  0.02473227 -0.2799072  ... -0.23358242 -0.02248511\n  0.28593436]'

In [104]:
float_list = [float(i) for i in str_list[0]]

In [109]:
arr = np.array(float_list) 

In [111]:
arr.shape

(1024,)

In [97]:
float(str_list[0][0])

-0.051906075328588486

In [37]:
def embed_in_batch(protein_seq, amount):
    value_input = amount
    data_len = len(protein_seq)
    fold = data_len // value_input
    remainder = data_len - data_len % value_input

    for i in range(fold):
        get_embedding(protein_seq,  start = i* value_input, stop = (i+1)*value_input)
    
    get_embedding(protein_seq, start = remainder, stop = data_len)

In [None]:
try:
    # generate embeddings in bacth
    embed_in_batch(protein_seq, 10)

    # # generate embeddings in whole
    # get_embedding(protein_seq)
except Exception as e:
    print('Error: ' + str(e))
    allDone()
finally:
    allDone()

In [19]:
embed_path = '../data/imbalance_same_seq/Embedding_results_csv/model_1_embeds'

for csv in os.listdir('../data/balanced_same_seq/'):
    if ('.csv' in csv):
        print(csv)

mode_1_test.csv
mode_2_train_3.csv
mode_2_train_2.csv
mode_2_train_1.csv
mode_2_test.csv
mode_1_train_1.csv
mode_1_train_2.csv
mode_1_train_3.csv


In [20]:
mode_1_test = pd.read_csv('../data/balanced_same_seq/mode_1_test.csv')

In [21]:
mode_1_test.head()

Unnamed: 0,wt_emb,mt_emb,label
0,[ 0.28979653 -0.08548378 0.25508213 ... -0.02...,[ 0.40497345 -0.06441736 0.2873508 ... 0.07...,0
1,[ 0.2392044 0.10363419 0.01531088 ... -0.09...,[ 0.23146236 0.10686753 0.03843426 ... -0.08...,1
2,[-0.05516514 0.01958648 0.20229368 ... -0.21...,[-0.07178547 -0.00336192 0.25487044 ... -0.23...,1
3,[-0.37009916 0.008683 0.10970505 ... -0.06...,[-0.35689098 0.03892659 0.077952 ... -0.04...,0
4,[ 0.02989817 0.31884304 -0.25646332 ... -0.01...,[ 0.01292542 0.2915483 -0.13674647 ... -0.04...,1


In [None]:
import pickle
def get_embedding(protein_seq, save_path):
    
    xs = []
    count = 0
    embed_error_count = 0
    data_len = len(protein_seq)

    for index, seq in tqdm(protein_seq.iterrows(), total=protein_seq.shape[0]):
        s_len = len(seq['wt_seq'].replace(" ",'')) + 1
        label = seq['label']
        wt_seq = seq['wt_seq'].replace(" ",'')
        mt_seq = seq['mt_seq'].replace(" ",'')
        # AF_DB = seq['AlphaFoldDB']
        # PDB = seq['PDB']
        # pathogenicity = seq['pathogenicity']
        
        # add_special_tokens adds extra token at the end of each sequence
        token_encoding = tokenizer.batch_encode_plus([seq['wt_seq'], seq['mt_seq']], add_special_tokens=True, padding="longest")
        input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

        with torch.no_grad():
            # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
            embedding_repr = model(input_ids, attention_mask=attention_mask)
            emb = embedding_repr.last_hidden_state[:, :s_len]
            # emb = emb[:, aa_index, :]
            try:
                emb = emb[:, aa_index, :]
            except:
                embed_error_count += 1
                print(f'embedding error: index: {index}, aa_index:{aa_index}, aa_length: {s_len} , error_count:{embed_error_count}')
                
            # print(aa_index)
            x = emb.detach().cpu().numpy().squeeze()
           
            temp = pd.DataFrame({
                'label':label,
                'mutant_index': aa_index,
                'wt_aa': wt_aa,
                't_aa': mt_aa,
                'wt_seq': wt_seq,
                'mt_seq': mt_seq,
                'wt_emb': [x[0, :]],
                'mt_emb':[x[1,:]],
                # 'AF_DB': AF_DB,
                # 'PDB_ID': PDB
            })
            
            if result is None:
                result=temp
            else:
                result = pd.concat([result,temp])

            xs.append({'gene_id': 'x':x.reshape(1,-1), 'wt_seq': wt_seq, 'mt_seq': mt_seq ,'label':label})
            
    # Save results
    if not os.path.isdir(f'{save_path}'):
        os.mkdir(f'{save_path}')
            
    if start is None:
        # result.to_csv(f'{save_path}/sequence_embeddings({data_len}).csv', index=False)
        with open(f'./data_test/emb({data_len}).pkl', 'wb') as f:
            pickle.dump(xs, f)
    else:
        # result.to_csv(f'{save_path}/sequence_{stop}_embeddings.csv', index=False)
        with open(f'{save_path}/emb_{stop}.pkl', 'wb') as f:
            pickle.dump(xs, f)
    
# get_embedding(seq)  
    

In [32]:
pd.read_csv(f'{embed_path}/sequence_10_embeddings.csv')

Unnamed: 0,label,mutant_index,wt_aa,t_aa,wt_seq,mt_seq,wt_emb,mt_emb
0,0.0,123,N,S,M Q P R S E R P A G R T Q S P E H G S P G P G ...,M Q P R S E R P A G R T Q S P E H G S P G P G ...,[-0.20791069 0.05744875 0.431307 ... -0.22...,[-0.20496441 0.05060612 0.3893946 ... -0.21...
1,0.0,21,S,N,M G W D L T V K M L A G N E F Q V S L S S S M ...,M G W D L T V K M L A G N E F Q V S L S N S M ...,[-0.05899208 -0.4959501 0.04672551 ... 0.05...,[-0.06410073 -0.5019426 0.09179133 ... 0.08...
2,0.0,83,S,N,M G W D L T V K M L A G N E F Q V S L S S S M ...,M G W D L T V K M L A G N E F Q V S L S S S M ...,[-0.1296031 -0.0422971 -0.1627439 ... 0.28...,[-0.13209993 -0.04053592 -0.12918288 ... 0.29...
3,0.0,141,G,S,M G W D L T V K M L A G N E F Q V S L S S S M ...,M G W D L T V K M L A G N E F Q V S L S S S M ...,[-0.32480806 0.16869994 -0.0275444 ... -0.52...,[-0.3464439 0.12891982 0.02962563 ... -0.49...
4,0.0,164,R,P,M G W D L T V K M L A G N E F Q V S L S S S M ...,M G W D L T V K M L A G N E F Q V S L S S S M ...,[-0.05453132 -0.10480957 0.03223428 ... 0.00...,[-0.11150735 -0.12776607 -0.00843289 ... 0.00...
5,0.0,217,A,T,M C V G A R R L G R G P C A A L L L L G L G L ...,M C V G A R R L G R G P C A A L L L L G L G L ...,[-0.02458479 -0.17859626 -0.06197904 ... -0.22...,[ 0.00635099 -0.17698371 -0.00687718 ... -0.21...
6,0.0,207,V,M,M C V G A R R L G R G P C A A L L L L G L G L ...,M C V G A R R L G R G P C A A L L L L G L G L ...,[-0.07218366 -0.08828887 0.42139104 ... -0.18...,[-0.05772915 -0.07945231 0.463516 ... -0.16...
7,0.0,165,I,T,M C V G A R R L G R G P C A A L L L L G L G L ...,M C V G A R R L G R G P C A A L L L L G L G L ...,[ 0.34089744 -0.03680613 0.17513815 ... -0.16...,[ 0.35381633 -0.04757376 0.11673401 ... -0.16...
8,0.0,157,P,L,M C V G A R R L G R G P C A A L L L L G L G L ...,M C V G A R R L G R G P C A A L L L L G L G L ...,[-0.06020525 -0.06952492 -0.13853367 ... 0.10...,[-0.05828763 -0.07256152 -0.08972525 ... 0.10...
9,0.0,98,L,P,M C V G A R R L G R G P C A A L L L L G L G L ...,M C V G A R R L G R G P C A A L L L L G L G L ...,[-0.20321272 -0.07306463 0.21406977 ... 0.07...,[-0.18795833 -0.04739751 0.16771509 ... 0.07...


## Read embedding data

In [133]:
embed_path = '../data/gene_not_constrain/imbalance_same_seq/Embedding_results_csv/mode_1_embeds'



In [134]:
def data_for_downstream():
    path = embed_path + '/'
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat += y
    data_y = []
    data_X = []
    for i in range(len(concat)):
        data_X.append(concat[i]['x'][0])
        data_y.append(int(concat[i]['label']))
    data_X = np.array(data_X)
    return data_X, data_y

In [132]:
# read residue_embeddings
import pickle

with open('emb_19.pkl', 'rb') as file:
    y_19 = pickle.load(file)
data_y = []
data_X = []
for i in range(len(y)):
    data_X.append(y[i]['x'][0])
    data_y.append(int(y[i]['label']))
# turn residue_enbeddings (tensors) into numpy array
data_X = np.array(data_X)
data_X.shape

FileNotFoundError: [Errno 2] No such file or directory: 'emb_19.pkl'

In [136]:
data_X, data_y = data_for_downstream()

In [137]:
data_X.shape, len(data_y)

((27750, 2048), 27750)

In [150]:
data_X[45]

array([-0.08853678, -0.0639109 , -0.1026528 , ..., -0.09638178,
        0.04749429,  0.04319501], dtype=float32)

In [156]:
type(data_X)

numpy.ndarray

In [None]:
type(data_X)

numpy.ndarray

## Dataset and Traditional ML method

In [17]:
# 切分数据集
X_train, X_test, y_train, y_test= train_test_split(data_X, data_y,
                                                    test_size=0.2,
                                                    stratify=data_y,
                                                   random_state=42)
# 切分出valid数据集
X_valid, X_test, y_valid, y_test = train_test_split(X_test,y_test,
                                               test_size=0.3,
                                               shuffle=True,
                                               stratify=y_test,
                                               random_state=42)

len(X_train), len(X_test),len(y_train),len(y_test), len(X_valid),len(y_valid)

(16952, 1272, 16952, 1272, 2966, 2966)

In [18]:
# Traditional ML training

result_path = './predicted_results'

def traditional_model(name, X_train, y_train, X_test, y_test):
    model_name = re.search(r"(.*)(Classifier|Regression)", str(name))
    model_name = model_name.group(1)

    name.fit(X_train, y_train)
    y_pred = name.predict(X_test)
    
    report_save(y_test, y_pred, model_name)
    
def report_save(y_true, y_pred, name, label_names=None, *args, **kv):
    result_path = './predicted_results'
    # print the classification report here
    report = classification_report(y_true, y_pred, target_names=label_names)
    print(colored(f'\n\t\t\t\t *** {name}_report ***:\n\n\n', 'blue', attrs=['bold']), report)

    # create report dataframe
    report_for_save = classification_report(y_true, y_pred, target_names=label_names, output_dict=True)
    report_csv = pd.DataFrame(report_for_save).transpose()

    # style.background_gradient or highlight_max
    report_styled = report_csv.style.background_gradient(subset=['precision', 'recall', 'f1-score'])
    
    # Save results
    if not os.path.isdir(f'{result_path}'):
        os.mkdir(f'{result_path}')

    # export dataframe to .png
    dfi.export(report_styled, f'{result_path}/{name}_report.png')

    # report_csv.to_csv(f'{name}_report_save.csv')


In [19]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report

eval_s = [(X_train, y_train), (X_test, y_test)]

rfc=RandomForestClassifier(random_state=0, n_estimators = 10)
gbt=GradientBoostingClassifier(random_state=0, n_estimators = 8)
# xgb = XGBClassifier()

rfc.fit(X_train, y_train)
gbt.fit(X_train, y_train)
# xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_s, verbose=False)

y_rfc = rfc.predict(X_test)
y_gbt = gbt.predict(X_test)
# y_xgb = xgb.predict(X_test)

KeyboardInterrupt: 

In [20]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report

rfc = RandomForestClassifier(random_state=42, n_estimators=10)
gbt=GradientBoostingClassifier(random_state=0, n_estimators = 8)
traditional_model(rfc,X_train,y_train,X_test,y_test)
traditional_model(gbt,X_train,y_train,X_test,y_test)

# XGBoost
from xgboost import XGBClassifier
eval_s = [(X_train, y_train), (X_test, y_test)]
xgb = XGBClassifier()
xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_s, verbose=False)
y_xgb = xgb.predict(X_test)
report_save(y_test, y_xgb, 'XGBoost')

[1m[34m
				 *** RandomForest_report ***:


[0m               precision    recall  f1-score   support

           0       0.71      0.81      0.76       682
           1       0.74      0.62      0.67       590

    accuracy                           0.72      1272
   macro avg       0.72      0.71      0.72      1272
weighted avg       0.72      0.72      0.72      1272



objc[92974]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x210e01b50) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/104.0.5112.79/Libraries/libGLESv2.dylib (0x10aa05d08). One of the two will be used. Which one is undefined.
[0807/002243.108705:INFO:headless_shell.cc(660)] Written to file /var/folders/yf/b5jj2z454vx8gz_ppz2wrtdc0000gn/T/tmp4uyuei0p/temp.png.


[1m[34m
				 *** GradientBoosting_report ***:


[0m               precision    recall  f1-score   support

           0       0.74      0.76      0.75       682
           1       0.71      0.69      0.70       590

    accuracy                           0.73      1272
   macro avg       0.73      0.72      0.72      1272
weighted avg       0.73      0.73      0.73      1272



objc[93016]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x210e01b50) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/104.0.5112.79/Libraries/libGLESv2.dylib (0x10a93dd08). One of the two will be used. Which one is undefined.
[0807/002350.474261:INFO:headless_shell.cc(660)] Written to file /var/folders/yf/b5jj2z454vx8gz_ppz2wrtdc0000gn/T/tmp2fpk18bw/temp.png.


[1m[34m
				 *** XGBoost_report ***:


[0m               precision    recall  f1-score   support

           0       0.81      0.80      0.81       682
           1       0.77      0.78      0.78       590

    accuracy                           0.79      1272
   macro avg       0.79      0.79      0.79      1272
weighted avg       0.79      0.79      0.79      1272



objc[93055]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x210e01b50) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/104.0.5112.79/Libraries/libGLESv2.dylib (0x1082c5d08). One of the two will be used. Which one is undefined.
[0807/002447.404936:INFO:headless_shell.cc(660)] Written to file /var/folders/yf/b5jj2z454vx8gz_ppz2wrtdc0000gn/T/tmptwoj0njj/temp.png.


In [46]:
# RandomForest report
report_save(y_test,y_rfc,'RandomForest')

              precision    recall  f1-score   support

           0       0.75      0.83      0.79      1204
           1       0.72      0.61      0.66       857

    accuracy                           0.74      2061
   macro avg       0.74      0.72      0.73      2061
weighted avg       0.74      0.74      0.74      2061



In [47]:
# GradientBoosting report
report_save(y_test,y_gbt,'GradientBoosting')

              precision    recall  f1-score   support

           0       0.73      0.86      0.79      1204
           1       0.74      0.55      0.63       857

    accuracy                           0.74      2061
   macro avg       0.74      0.71      0.71      2061
weighted avg       0.74      0.74      0.73      2061



In [44]:
# XGBoost report
report_save(y_test,y_xgb,'XGBoost')

              precision    recall  f1-score   support

           0       0.82      0.85      0.83      1204
           1       0.77      0.73      0.75       857

    accuracy                           0.80      2061
   macro avg       0.80      0.79      0.79      2061
weighted avg       0.80      0.80      0.80      2061



## 随机生成蛋白序列

In [2]:
wild_type_seq = []
mutant_type_seq = []
aa_index = []
sub_aa = []
origin_aa=[]
label = []
generate_length = 15000
random.seed(55)

alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
alphabet = [char for char in alphabet]
alphabet_for_mutant = 'ACDEFGHIJKLMNPQRSTVWXY'
alphabet_for_mutant = [char for char in alphabet_for_mutant]

for length in range(generate_length):
    str_list=[random.choice(alphabet) for i in range(random.randint(200,380))]
    random_str = "".join(str_list)
    random_str = re.sub(r"[UZOB]", "X", random_str)
    wild_type_seq.append(random_str)

    sequence =[char for char in random_str]
    index = random.choice(range(len(sequence)))
    replace = random.choice(alphabet_for_mutant)
    origin = sequence[index-1]
    sequence[index-1] = replace
    mutant_seq = ''.join(sequence)
    mutant_type_seq.append(mutant_seq)
    aa_index.append(index)
    sub_aa.append(replace)
    origin_aa.append(origin)
    label = [random.randint(0,1) for i in range(generate_length)]

dict = {'index': aa_index, 'origin_aa': origin_aa,'sub_aa': sub_aa, 'wild_type': wild_type_seq,'mutant':mutant_type_seq, 'label':label}
protein_seq = pd.DataFrame(dict)


KeyboardInterrupt: 

In [None]:
protein_seq.to_csv('practise_seq.csv', index = False)

In [None]:
len(protein_seq['wild_type'][0].replace(" ",''))

In [46]:
protein_seq.shape

(5000, 6)

In [45]:
protein_seq = protein_seq[:5000]

In [42]:
protein_seq.drop(labels=4274, inplace = True)

In [41]:
protein_seq.iloc[4274]

index                                                       66
origin_aa                                                    D
sub_aa                                                       H
wild_type    H K M X X M S X F I M J D X E P Y T K L X X Y ...
mutant       H K M X X M S X F I M J D X E P Y T K L X X Y ...
label                                                        0
Name: 4275, dtype: object

In [30]:
protein_seq['wild_type'][1242]

'F J S I D Q Q L C Y D Q G X J Q X X X I J X N K N N H W Y F S K K H X I Q X L X Q W A S J M P G P K P T V X X E A X X X K I X X T D A A X W P M K T A R W L D X Q X V H Q D P X X N X X Q K X X M D I V W X M L S I W R X X X X C W X E W E V K D S X R D X X P Y X T K H L G X L W H C J F L K X E X A X X A V Q R A X W R D X K E L F D N G J J A C T S D I Y K P H A Q C S X R H J M A K X C L X X Q X X N Q N G A X R I K E I X C X J M K X X X W G J X E N G E H K I K X F J H S J E F X E V X E D D V J S V X H C T X L F H V E G R X Q M D N N M V R V A M G R X W L Y T M F T T F P M D C H J J A S C E X Q X D D Y I J E S M V L R I E F C X P G N X P X X G N T E M X X J P X G W V H'

In [6]:
wild_seq = protein_seq['wild_type']
mutant_seq = protein_seq['mutant']

In [7]:
# config
max_seq_len = 380
batch_size = 16

# # CUDA
# USE_CUDA = torch.cuda.is_available()

# # MPS:
# USE_MPS = torch.has_mps

# if USE_MPS:
#     torch.cuda.manual_seed(2020)
#     device = torch.device('mps')
#     print('Device name: MPS')
# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA RTX A5000


In [47]:
import pickle
def get_embedding(protein_seq):
    
    xs = []
    result = None
    
    for index, seq in protein_seq.iterrows():
        s_len = len(seq['wild_type'].replace(" ",'')) + 1
        aa_index = seq['index']
        label = seq['label']
        wt_aa = seq['origin_aa']
        mt_aa = seq['sub_aa']
        # add_special_tokens adds extra token at the end of each sequence
        token_encoding = tokenizer.batch_encode_plus([seq['wild_type'], seq['mutant']], add_special_tokens=True, padding="longest")
        input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

        with torch.no_grad():
            # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
            embedding_repr = model(input_ids, attention_mask=attention_mask)
            emb = embedding_repr.last_hidden_state[:, :s_len]
            try:
                emb = emb[:, aa_index, :]
            except:
                print ('Error happends in No. {}, aa_index: {}, sequence length: {}'.format(index, aa_index, s_len))
            # print(aa_index)
            x = emb.detach().cpu().numpy().squeeze()
            temp = pd.DataFrame({'label':label,'mutant_index': aa_index,'wt_aa': wt_aa, 't_aa': mt_aa, 'wt_emb': [x[0, :]], 'mt_emb':[x[1,:]]})
            if result is None:
                result=temp
            else:
                result = pd.concat([result,temp])

            xs.append({'x':x.reshape(1,-1), 'label':label})
            
    result.to_csv('test_seq.csv', index = None)
    with open('emb.pkl', 'wb') as f:
        pickle.dump(xs,f)

        
get_embedding(protein_seq)  


In [48]:
# read residue_embeddings
with open('emb.pkl', 'rb') as file:
    y = pickle.load(file)

In [49]:
data_y = []
data_X = []
for i in range(len(y)):
    data_X.append(y[i]['x'][0])
    data_y.append(y[i]['label'])

In [52]:
# turn residue_enbeddings (tensors) into numpy array
data_X = np.array(data_X)
data_X.shape

(5000, 2048)

In [56]:
# 切分数据集
X_train, X_test, y_train, y_test= train_test_split(data_X, data_y,
                                                    test_size=0.2,
                                                    stratify=data_y,
                                                   random_state=42)
# 切分出valid数据集
X_valid, X_test, y_valid, y_test = train_test_split(X_test,y_test,
                                               test_size=0.3,
                                               shuffle=True,
                                               stratify=y_test,
                                               random_state=2020)
len(X_train), len(X_test),len(y_train),len(y_test), len(X_valid),len(y_valid)

(4000, 300, 4000, 300, 700, 700)

In [60]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
sgd = SGDClassifier(loss="hinge", penalty="l1", max_iter=8)
cross_val_score(sgd, X_train, y_train, cv=5)



array([0.50375, 0.5225 , 0.5225 , 0.48875, 0.52   ])

In [57]:
from xgboost import XGBClassifier

# rf = RandomForestClassifier(max_depth=19,random_state = 2)
# rf.fit(X_train, y_train)
# score=cross_val_score(rf,X_test,y_test,cv=5,scoring='f1')
# print(score)

eval_s = [(X_train, y_train), (X_test, y_test)]

xgb = XGBClassifier()
xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_s, verbose=False)
xgb.score(X_test, y_test)

ModuleNotFoundError: No module named 'xgboost'

In [58]:
!pip install xgboost

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting xgboost
  Downloading http://mirrors.aliyun.com/pypi/packages/e4/ed/8e2a7ae4e856f4887afc0beee897088ed8dbbc1b19b0f49971019939452a/xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |▊                               | 4.6 MB 59 kB/s eta 0:52:43^C

[31mERROR: Operation cancelled by user[0m
[?25h

In [None]:
for step, (sent_id, mask, labels) in enumerate(train_dataloader):
    
    # progress update after every 50 batches.
    # if step % 50 == 0 and not step == 0:
      # print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    # push the (sent_id, mask, labels) to gpu
    sent_id, mask, labels = sent_id.to(device), mask.to(device), labels.to(device)
    # bert = bert.to(device)
    # print(sent_id)
    x=model_bert(sent_id[0:1,:],mask[0:1,:])
    print(x)
    break

In [None]:
# print(model)

## 练习测试代码

In [None]:
# # Dataprocessing 
# protein_df.drop(labels=['PDBID','Unnamed: 3','CHAIN'], axis=1, inplace = True)
# protein_df['label'] = [random.randint(0,1) for i in range(39)]

# # 随机生成蛋白序列
# seq = []
# alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# alphabet = [char for char in alphabet]

# generate_length = 14961

# for length in range(generate_length):
#     str_list=[random.choice(alphabet) for i in range(random.randint(200,380))]
#     random_str="".join(str_list)
#     seq.append(random_str)

# # 随机生成label
# label = [random.randint(0,1) for i in range(generate_length)]

# # 新生成的seq 和label 放入新创建的dataframe
# psudo_seq = pd.DataFrame({'sequence': seq, 'label':label})

# # 把新生成的dataframe 和以前的拼一起
# raw_seq = protein_df.append(psudo_seq)

# # settle sequences
# seq_batch = [seq for seq in raw_seq.sequence]
# seq_batch = [seq.replace(" ", '') for seq in raw_seq.sequence]
# # seq_batch = [' '.join(seq) for seq in seq_batch]
# # seq_batch = [re.sub(r"[UZOB]", "X", sequence) for sequence in seq_batch]

# # 清洗完后放入dataframe
# protein_seq = pd.DataFrame({'sequence': seq_batch, 'label':raw_seq.label}, index=None).reset_index()

# # 随机选择生成

In [None]:
# # 随机选择生成 mutation， 记录mutant index， mutant aa 和mutant_sequence
# def mutation(seq):
    
#     sequence =[char for char in seq]
#     index = random.choice(range(len(sequence)))
#     replace = random.choice(alphabet)
#     sequence[index] = replace
#     mutant_seq = ''.join(sequence)

#     return index, replace, mutant_seq

# # 把生成的mutation 放入dataframe
# # for i in range(protein_seq.shape[0]):
# #     protein_seq['index'], protein_seq['sub_aa'], protein_seq['mutant'] = protein_seq['sequence'].apply(mutation)[i]

# # # 把生成的dataframe 输出成csv
# # protein_seq.to_csv('wild_type_mutant_sequence.csv')

In [None]:
# for i in range(test_df.shape[0]):
#     test_df['index'], test_df['sub_aa'], test_df['mutant'] = test_df['sequence'].apply(mutation)[i]