In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Hemoglobin beta-chain protein sequences keyed by species name, written in
# one-letter amino-acid codes.
#
# NOTE(review): the '*' characters in the horse, bovine and sheep entries are
# not amino-acid codes; presumably they are gap placeholders that keep the
# sequences aligned to a common length. Confirm whether they should be
# stripped before the sequences are passed to a language model.
hemoglobin_beta = {
    'human':
        "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
    # Residue 40 is glutamine (Q), as in every other species listed here; it
    # was previously mistyped as the letter 'O'.
    'chimpanzee':
        "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
    'camel':
        "MVHLSGDEKNAVHGLWSKVKVDEVGGEALGRLLVVYPWTRRFFESFGDLSTADAVMNNPKVKAHGSKVLNSFGDGLNHLDNLKGTYAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFGKEFTPDKQAAYQKVVAGVANALAHRYH",
    'rabbit':
        "MVHLSSEEKSAVTALWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSANAVMNNPKVKAHGKKVLAAFSEGLSHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVIVLSHHFGKEFTPQVQAAYQKVVAGVANALAHKYH",
    'pig':
        "MVHLSAEEKEAVLGLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSNADAVMGNPKVKAHGKKVLQSFSDGLKHLDNLKGTFAKLSELHCDQLHVDPENFRLLGNVIVVVLARRLGHDFNPNVQAAFQKVVAGVANALAHKYH",
    'horse':
        "*VQLSGEEKAAVLALWDKVNEEEVGGEALGRLLVVYPWTQRFFDSFGDLSNPGAVMGNPKVKAHGKKVLHSFGEGVHHLDNLKGTFAALSELHCDKLHVDPENFRLLGNVLVVVLARHFGKDFTPELQASYQKVVAGVANALAHKYH",
    'bovine':
        "M**LTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKVKAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARNFGKEFTPVLQADFQKVVAGVANALAHRYH",
    'sheep':
        "M**LTAEEKAAVTGFWGKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSNADAVMNNPKVKAHGKKVLDSFSNGMKHLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHHGNEFTPVLQADFQKVVAGVANALAHKYH",
}

In [3]:

def get_protbert_embeddings(sequences, model_name="Rostlab/prot_bert"):
    """
    Get embeddings from ProtBERT for a list of protein sequences

    Each sequence is embedded on its own and represented by the final-layer
    hidden state of its first token ([CLS]).

    Parameters:
    - sequences: iterable of protein sequences (strings of one-letter codes)
    - model_name: ProtBERT model name

    Returns:
    - embeddings: numpy array with one row per input sequence; an empty
      array when no sequences are given
    """
    sequences = list(sequences)
    if not sequences:
        # Nothing to embed: return before loading the tokenizer and model so
        # an empty call does not trigger a large download for no result.
        return np.array([])

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    # Make inference mode explicit so dropout can never perturb the embeddings.
    model.eval()

    # The ProtBERT model card prescribes mapping the rare residue codes
    # U, Z, O and B to the unknown-residue symbol X before tokenizing.
    rare_to_unknown = str.maketrans("UZOB", "XXXX")

    embeddings = []

    with torch.no_grad():
        for seq in sequences:
            # Add spaces between amino acids (ProtBERT format)
            spaced_seq = ' '.join(seq.translate(rare_to_unknown))

            # Tokenize
            inputs = tokenizer(spaced_seq, return_tensors='pt', padding=True, truncation=True)

            # Use [CLS] token embedding (first token) of the single sequence
            # in this batch.
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state[0, 0, :].numpy())

    return np.array(embeddings)


In [4]:

def visualize_protbert_embeddings(embeddings, labels=None, method='pca', figsize=(6, 4)):
    """
    Visualize protein embeddings from ProtBERT using dimensionality reduction

    The embeddings are projected to two dimensions with the chosen method and
    drawn as a scatter plot. When labels are given, each distinct label gets
    its own colour and legend entry, in order of first appearance.

    Parameters:
    - embeddings: numpy array of shape (n_proteins, embedding_dim)
    - labels: list of protein names/categories (optional)
    - method: 'pca', 'tsne', or 'umap'
    - figsize: figure size tuple

    Raises:
    - ValueError: if method is not one of the supported names, or if labels
      is given but its length differs from the number of embeddings
    """
    supported_methods = ('pca', 'tsne', 'umap')
    if method not in supported_methods:
        raise ValueError(
            f"Unsupported method {method!r}; expected one of {supported_methods}"
        )

    n_samples = len(embeddings)
    if labels is not None and len(labels) != n_samples:
        raise ValueError(
            f"Got {len(labels)} labels for {n_samples} embeddings; lengths must match"
        )

    # Dimensionality reduction
    if method == 'pca':
        reducer = PCA(n_components=2, random_state=42)
        embedding_2d = reducer.fit_transform(embeddings)
        title = f'ProtBERT Embeddings - PCA\n(Explained variance: {reducer.explained_variance_ratio_.sum():.3f})'

    elif method == 'tsne':
        # t-SNE needs perplexity < n_samples. Derive it from the embeddings
        # themselves so the call also works when no labels are supplied.
        perp = min(30, n_samples - 1)
        reducer = TSNE(n_components=2, random_state=42, perplexity=perp)
        embedding_2d = reducer.fit_transform(embeddings)
        title = 'ProtBERT Embeddings - t-SNE'

    else:  # method == 'umap'
        reducer = umap.UMAP(n_components=2, random_state=42)
        embedding_2d = reducer.fit_transform(embeddings)
        title = 'ProtBERT Embeddings - UMAP'

    # Create visualization
    fig, ax = plt.subplots(figsize=figsize)

    if labels is None:
        ax.scatter(embedding_2d[:, 0], embedding_2d[:, 1], alpha=0.7, s=50)
    else:
        # Color by labels if provided. dict.fromkeys de-duplicates while
        # keeping first-appearance order, so colours and legend order are
        # the same on every run (a plain set is not ordered).
        label_array = np.array(labels)
        unique_labels = list(dict.fromkeys(labels))
        colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))

        for color, label in zip(colors, unique_labels):
            mask = label_array == label
            ax.scatter(embedding_2d[mask, 0], embedding_2d[mask, 1],
                       c=[color], label=label, alpha=0.7, s=50)

        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    ax.set_title(title, fontsize=14, pad=20)
    ax.set_xlabel(f'{method.upper()} Component 1', fontsize=12)
    ax.set_ylabel(f'{method.upper()} Component 2', fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()


In [None]:
# Split the species -> sequence mapping into parallel lists: names serve as
# plot labels, sequences are the model inputs.
sample_labels = [species for species in hemoglobin_beta]
sample_sequences = [hemoglobin_beta[species] for species in sample_labels]

# One embedding row per species, in the same order as sample_labels.
embeddings = get_protbert_embeddings(sample_sequences)

# Draw one figure for each supported projection.
for method in ('pca', 'tsne', 'umap'):
    visualize_protbert_embeddings(embeddings, sample_labels, method=method)


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]