In [1]:
from Bio import SeqIO
from proteinbert import load_pretrained_model
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Load the protein sequences
sequences = [str(record.seq) for record in SeqIO.parse('proteins.fa', 'fasta')]

# Define the sequence length as the length of the longest sequence
# seq_len = max(len(seq) for seq in sequences)
seq_len = 64


# Define the batch size
batch_size = 32

# Load the pretrained ProteinBERT model and the input encoder
pretrained_model_generator, input_encoder = load_pretrained_model()

# Create a model that outputs the activations of its hidden layers
model = get_model_with_hidden_layers_as_outputs(pretrained_model_generator.create_model(seq_len))





In [2]:
# Define the sequence length as the expected sequence length of the model
# seq_len = 512

# Preprocess the sequences to ensure they are all the same length

sequences = [seq[:seq_len-2].ljust(seq_len-2, '-') for seq in sequences]

# Encode the protein sequences
encoded_x = input_encoder.encode_X(sequences, seq_len)

# Generate the embeddings for the encoded sequences
local_representations, global_representations = model.predict(encoded_x, batch_size=batch_size)




In [4]:
import pandas as pd

# 1. Load positional information
peptides_df = pd.read_csv('peptides.csv')

# Prepare a dictionary to quickly look up the index of a protein_id in sequences
protein_id_to_index = {record.id: idx for idx, record in enumerate(SeqIO.parse('proteins.fa', 'fasta'))}

# 2. Extract embeddings for specified positions and populate the DataFrame
rows = []

for _, row in peptides_df.iterrows():
    protein_id = row['Info_protein_id']
    start_pos = row['Info_start_pos']
    end_pos = row['Info_end_pos']
    
    # Find the correct index for the protein_id in sequences
    protein_index = protein_id_to_index.get(protein_id, None)
    if protein_index is None:
        print(f"Protein ID {protein_id} not found in sequences.")
        continue
    
    max_embedding_length = local_representations[protein_index].shape[0]

    for pos in range(start_pos, min(end_pos+1, max_embedding_length+1)):  # Ensure not to exceed representation size
        embedding = local_representations[protein_index, pos-1, :]  # -1 because positions are likely 1-based
        
        row_data = {
            'Info_protein_ID': protein_id,
            'Info_pos': pos,
        }
        for i, value in enumerate(embedding):
            row_data[f'feat{i}'] = value
        
        rows.append(row_data)

# 3. Populate the DataFrame
protBert_full_data = pd.DataFrame(rows)


In [5]:
protBert_full_data.head()

Unnamed: 0,Info_protein_ID,Info_pos,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,...,feat1552,feat1553,feat1554,feat1555,feat1556,feat1557,feat1558,feat1559,feat1560,feat1561
0,0704243A,46,0.272818,0.153806,-0.062959,-0.705395,-0.075377,-0.050673,-0.038158,0.005224,...,0.002489,7.985268e-14,0.001619,6.8e-05,2.910588e-07,0.000433,9.733288e-12,1.513148e-10,5.004604e-08,7.424229e-11
1,0704243A,47,0.139574,0.062497,-0.068674,-0.377046,0.042702,0.019122,-0.092985,-0.19401,...,0.00238,7.770019e-14,0.001079,6.2e-05,5.088384e-07,0.000333,7.70418e-12,1.630329e-10,3.036438e-08,7.079878e-11
2,0704243A,48,0.012607,0.023378,-0.02918,-0.708973,-0.052233,0.099587,0.174073,-0.148519,...,0.002397,1.391157e-13,0.001421,8.4e-05,3.925778e-07,0.000419,1.063333e-11,1.322916e-10,6.472273e-09,6.427915e-11
3,0704243A,49,-0.027957,-0.105689,-0.149113,-0.659597,-0.104517,-0.049819,-0.010488,0.047503,...,0.983861,2.274311e-15,0.000854,5.7e-05,9.21004e-07,0.000336,2.910525e-11,1.093871e-10,1.620368e-08,5.763506e-11
4,0704243A,50,-0.096555,0.045439,-0.189983,-0.282624,-0.029762,-0.134147,0.124989,-0.069275,...,0.003277,9.435442e-16,0.002378,0.000184,2.02073e-06,0.000781,8.391656e-12,2.154789e-09,9.184267e-08,1.596594e-10


In [6]:
# ... [previous code]

# 4. Save the DataFrame to a CSV file
protBert_full_data.to_csv('protBert_full_data.csv', index=False)


In [9]:
import pandas as pd

# Load the retroviridae_data_no_esm1b DataFrame
retroviridae_data_no_esm1b = pd.read_csv('retroviridae_data_no_esm1b.csv')

# Assuming you already have protBert_full_data DataFrame from your previous code 
# (If not, you'll need to generate it using the steps you've previously followed)

# Perform the left join
merged_data = pd.merge(retroviridae_data_no_esm1b, protBert_full_data, how='left', left_on=['Info_protein_ID', 'Info_pos'], right_on=['Info_protein_ID', 'Info_pos'])

# Save the merged DataFrame to a CSV file
merged_data.to_csv('merged_data.csv', index=False)


In [10]:
import pandas as pd

# Load the merged_data DataFrame
merged_data = pd.read_csv('merged_data.csv')

# Determine the number of feature columns in the dataset
max_feature_index = max(int(col.split('feat')[-1]) for col in merged_data.columns if 'feat' in col)

# Create the list of feature column names
feature_columns = [f'feat{i}' for i in range(1, max_feature_index + 1)]

# Drop rows with NaN values in any of the feature columns
merged_data_cleaned = merged_data.dropna(subset=feature_columns)

# Save the cleaned DataFrame back to a CSV file
merged_data_cleaned.to_csv('merged_data_cleaned.csv', index=False)


  merged_data = pd.read_csv('merged_data.csv')


In [11]:
merged_data_cleaned.head()

Unnamed: 0,Info_PepID,Info_organism_id,Info_protein_ID,Info_pos,Info_AA,Info_pubmed_id,Info_epitope_id,Info_host_id,Info_nPos,Info_nNeg,...,feat1552,feat1553,feat1554,feat1555,feat1556,feat1557,feat1558,feat1559,feat1560,feat1561
362,AAD50663.1:2,11908,AAD50663.1,2,G,2419433,21888,9606,1,1,...,0.000366,6.983606e-15,0.000385,0.000183,5e-06,0.000212,1.289631e-15,9.775362e-08,8.478742e-07,7.382676e-11
363,AAD50663.1:2,11908,AAD50663.1,3,Q,2419433,21888,9606,1,1,...,0.00181,3.57631e-18,0.00159,0.000806,4e-06,0.000854,7.319022e-13,9.829615e-07,1.723981e-05,4.234388e-10
364,AAD50663.1:2,11908,AAD50663.1,4,I,2419433,21888,9606,1,1,...,0.002605,4.113196e-15,0.002844,0.001094,4e-06,0.001715,2.043278e-13,3.869093e-08,1.818575e-06,5.343209e-10
365,AAD50663.1:2,11908,AAD50663.1,5,F,2419433,21888,9606,1,1,...,0.002487,3.156004e-17,0.002002,0.000751,3e-06,0.001106,7.137512e-13,1.198839e-07,3.430767e-06,5.168573e-10
366,AAD50663.1:2,11908,AAD50663.1,6,S,2419433,21888,9606,1,1,...,0.001682,7.779819e-13,0.001583,0.000604,4e-06,0.000791,2.970424e-11,7.743067e-08,3.505174e-06,1.018576e-10


In [12]:
retroviridae_data_no_esm1b.head()

Unnamed: 0,Info_PepID,Info_organism_id,Info_protein_ID,Info_pos,Info_AA,Info_pubmed_id,Info_epitope_id,Info_host_id,Info_nPos,Info_nNeg,Info_type,Info_window,Info_cluster,Class
0,P00543.1:2,11780,P00543.1,396,S,6192445,57312,10000248,1,0,Epitope containing region/antigenic site,YGRYSSESDVWSFGI,6,1
1,P00543.1:2,11780,P00543.1,397,D,6192445,57312,10000248,1,0,Epitope containing region/antigenic site,GRYSSESDVWSFGIL,6,1
2,P00543.1:2,11780,P00543.1,398,V,6192445,57312,10000248,1,0,Epitope containing region/antigenic site,RYSSESDVWSFGILL,6,1
3,P00543.1:2,11780,P00543.1,399,W,6192445,57312,10000248,1,0,Epitope containing region/antigenic site,YSSESDVWSFGILLW,6,1
4,P00543.1:2,11780,P00543.1,400,S,6192445,57312,10000248,1,0,Epitope containing region/antigenic site,SSESDVWSFGILLWE,6,1
