In [1]:
import os
# Set working directory
os.chdir("D:/ACP")  # Windows path fix: use forward slash or raw string

print("Current Working Directory:", os.getcwd())

Current Working Directory: D:\ACP


In [2]:
# Step 1: Install Required Libraries (run once)
!pip install transformers
!pip install torch
!pip install pandas



In [5]:
# Step 2: Import Libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Step 3: Load Your Peptide CSV File
csv_path = "acp_train_data.csv"  # change this to your actual path
df = pd.read_csv(csv_path)

# Confirm structure: should have 'ID' and 'Sequence' columns
print(df[['ID', 'Sequence']].head())

# Step 4: Convert to FASTA Format and Save
fasta_path = "acp_train_data.fasta"
with open(fasta_path, "w") as fasta_file:
    for idx, row in df.iterrows():
        fasta_file.write(f">{row['ID']}\n{row['Sequence']}\n")

print(f"FASTA file saved to: {fasta_path}")

# Step 5: Preprocess Sequences for ProtBERT
def preprocess_sequence(seq):
    return " ".join(list(seq))  # adds spaces between amino acids

sequences = df["Sequence"].apply(preprocess_sequence).tolist()
ids = df["ID"].tolist()

# Step 6: Load ProtBERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert")
model.eval()

# Step 7: Feature Extraction Function
def extract_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # mean pooling
    return embedding

# Step 8: Apply to All Sequences
features = []
for seq in tqdm(sequences, desc="Extracting BERT features"):
    vec = extract_embedding(seq)
    features.append(vec)

# Step 9: Create DataFrame with Features
features_df = pd.DataFrame(features)
features_df.insert(0, "ID", ids)
features_df["Label"] = df["Label"]

# Step 10: Save Feature CSV
features_df.to_csv("acp_train_bert_features.csv", index=False)
print("Saved to acp_train_bert_features.csv")


        ID                               Sequence
0  ACP_335                      GTGLPMSERRKIMLMMR
1  ACP_580                   FLSLALAALPKLFCLIFKKC
2   ACP_74                          FLPIITNLLGKLL
3  ACP_428                          EGGGPQWAVGHFM
4   ACP_79  DQYKCLQHGGFCLRSSCPSNTKLQGTCKPDKPNCCKS
FASTA file saved to: acp_train_data.fasta


Extracting BERT features:   0%|                                                                | 0/738 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Extracting BERT features: 100%|██████████████████████████████████████████████████████| 738/738 [01:23<00:00,  8.81it/s]


Saved to acp_train_bert_features.csv


In [6]:
# Step 3: Load Your Peptide CSV File
csv_path = "acp_test_data.csv"  # change this to your actual path
df = pd.read_csv(csv_path)

# Confirm structure: should have 'ID' and 'Sequence' columns
print(df[['ID', 'Sequence']].head())

# Step 4: Convert to FASTA Format and Save
fasta_path = "acp_test_data.fasta"
with open(fasta_path, "w") as fasta_file:
    for idx, row in df.iterrows():
        fasta_file.write(f">{row['ID']}\n{row['Sequence']}\n")

print(f"FASTA file saved to: {fasta_path}")

# Step 5: Preprocess Sequences for ProtBERT
def preprocess_sequence(seq):
    return " ".join(list(seq))  # adds spaces between amino acids

sequences = df["Sequence"].apply(preprocess_sequence).tolist()
ids = df["ID"].tolist()

# Step 6: Load ProtBERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert")
model.eval()

# Step 7: Feature Extraction Function
def extract_embedding(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # mean pooling
    return embedding

# Step 8: Apply to All Sequences
features = []
for seq in tqdm(sequences, desc="Extracting BERT features"):
    vec = extract_embedding(seq)
    features.append(vec)

# Step 9: Create DataFrame with Features
features_df = pd.DataFrame(features)
features_df.insert(0, "ID", ids)
features_df["Label"] = df["Label"]

# Step 10: Save Feature CSV
features_df.to_csv("acp_test_bert_features.csv", index=False)
print("Saved to acp_test_bert_features.csv")


        ID                         Sequence
0    ACP_4                    FAKFLAKFLKKAL
1  ACP_766                     LNPDPCKPLAFI
2  ACP_870             TCGTCCTGAGGAGAGAGAGC
3  ACP_581                    FLSGIVGMLGKLF
4  ACP_445  GFFSTVKNLATNVAGTVIDTLKCKVTGGCRS
FASTA file saved to: acp_test_data.fasta


Extracting BERT features:   0%|                                                                | 0/185 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Extracting BERT features: 100%|██████████████████████████████████████████████████████| 185/185 [00:23<00:00,  7.96it/s]


Saved to acp_test_bert_features.csv


In [7]:
import pandas as pd
import torch
import esm
from tqdm import tqdm

# Step 1: Load CSV with 'ID' and 'Sequence'
csv_path = "acp_train_data.csv"
df = pd.read_csv(csv_path)

# Step 2: Write FASTA file for ESM input
fasta_path = "acp_train_data.fasta"
with open(fasta_path, "w") as f:
    for i, row in df.iterrows():
        f.write(f">{row['ID']}\n{row['Sequence']}\n")

print("FASTA file written.")

# Step 3: Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # or esm1b_t33_650M_UR50S
batch_converter = alphabet.get_batch_converter()
model.eval()  # disable dropout

# Step 4: Prepare data
data = [(row['ID'], row['Sequence']) for _, row in df.iterrows()]

# Step 5: Extract embeddings
results = []
batch_size = 8  # adjust based on GPU/CPU RAM

for i in tqdm(range(0, len(data), batch_size), desc="Extracting ESM embeddings"):
    batch_data = data[i:i+batch_size]
    labels, strs, tokens = batch_converter(batch_data)
    
    with torch.no_grad():
        outputs = model(tokens, repr_layers=[33])
        token_representations = outputs["representations"][33]
    
    for j, (label, seq) in enumerate(batch_data):
        # Ignore [CLS] (token 0) and [EOS] (last token)
        embedding = token_representations[j, 1:len(seq)+1].mean(0).cpu().numpy()
        results.append([label] + embedding.tolist())

# Step 6: Save to CSV
embedding_dim = len(results[0]) - 1
columns = ["ID"] + [f"feat_{i}" for i in range(embedding_dim)]
features_df = pd.DataFrame(results, columns=columns)

# Add labels from original CSV
features_df = features_df.merge(df[['ID', 'Label']], on='ID')

# Step 7: Save
features_df.to_csv("acp_train_esm2_features.csv", index=False)
print("Feature file saved as acp_train_esm2_features.csv")



FASTA file written.


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to C:\Users\myousaf23/.cache\torch\hub\checkpoints\esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to C:\Users\myousaf23/.cache\torch\hub\checkpoints\esm2_t33_650M_UR50D-contact-regression.pt
Extracting ESM embeddings: 100%|███████████████████████████████████████████████████████| 93/93 [01:51<00:00,  1.20s/it]


Feature file saved as acp_train_esm2_features.csv


In [8]:
import pandas as pd
import torch
import esm
from tqdm import tqdm

# Step 1: Load CSV with 'ID' and 'Sequence'
csv_path = "acp_test_data.csv"
df = pd.read_csv(csv_path)

# Step 2: Write FASTA file for ESM input
fasta_path = "acp_test_data.fasta"
with open(fasta_path, "w") as f:
    for i, row in df.iterrows():
        f.write(f">{row['ID']}\n{row['Sequence']}\n")

print("FASTA file written.")

# Step 3: Load ESM-2 model
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # or esm1b_t33_650M_UR50S
batch_converter = alphabet.get_batch_converter()
model.eval()  # disable dropout

# Step 4: Prepare data
data = [(row['ID'], row['Sequence']) for _, row in df.iterrows()]

# Step 5: Extract embeddings
results = []
batch_size = 8  # adjust based on GPU/CPU RAM

for i in tqdm(range(0, len(data), batch_size), desc="Extracting ESM embeddings"):
    batch_data = data[i:i+batch_size]
    labels, strs, tokens = batch_converter(batch_data)
    
    with torch.no_grad():
        outputs = model(tokens, repr_layers=[33])
        token_representations = outputs["representations"][33]
    
    for j, (label, seq) in enumerate(batch_data):
        # Ignore [CLS] (token 0) and [EOS] (last token)
        embedding = token_representations[j, 1:len(seq)+1].mean(0).cpu().numpy()
        results.append([label] + embedding.tolist())

# Step 6: Save to CSV
embedding_dim = len(results[0]) - 1
columns = ["ID"] + [f"feat_{i}" for i in range(embedding_dim)]
features_df = pd.DataFrame(results, columns=columns)

# Add labels from original CSV
features_df = features_df.merge(df[['ID', 'Label']], on='ID')

# Step 7: Save
features_df.to_csv("acp_test_esm2_features.csv", index=False)
print("Feature file saved as acp_test_esm2_features.csv")

FASTA file written.


Extracting ESM embeddings: 100%|███████████████████████████████████████████████████████| 24/24 [00:28<00:00,  1.18s/it]


Feature file saved as acp_test_esm2_features.csv
