<a href="https://colab.research.google.com/github/xingleiking/iodraw-files/blob/main/ESM2_T36_3Bfeature_extracute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# extract_features_colab.py
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel

# ------------------ Configuration ------------------
# Checkpoint that main() tries first.
ESM2_MODEL = "facebook/esm2_t36_3B_UR50D"
# Checkpoint that main() loads when the primary attempt raises any exception.
FALLBACK_MODEL = "facebook/esm2_t30_150M_UR50D"
# Device used for the model and for every input batch.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Starting batch size for the primary model (halved on CUDA OOM).
INITIAL_BATCH_SIZE = 8
# Passed to the tokenizer as max_length with truncation enabled.
# NOTE(review): if the tokenizer adds special tokens (ESM tokenizers
# presumably add <cls>/<eos>), fewer than 10 residues survive truncation —
# confirm this matches the peptide length in the input CSV.
MAX_SEQ_LENGTH = 10
# Output path for the feature tensor written with torch.save().
FEATURE_CACHE = "cached_esm2_3B_features.pt"
# Output path for the label array written with np.save().
LABEL_CACHE = "cached_labels.npy"

# ------------------ 工具函数 ------------------
def load_model(model_name):
    """Load a tokenizer/model pair for ``model_name`` and place the model on DEVICE.

    The model is created with ``output_hidden_states=True`` so that every
    forward pass exposes per-layer hidden states, and is switched to eval
    mode before being returned.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Half precision only when running on CUDA; full precision otherwise.
    weights_dtype = torch.float32
    if DEVICE == "cuda":
        weights_dtype = torch.float16

    tokenizer = AutoTokenizer.from_pretrained(
        model_name, do_lower_case=False, max_length=MAX_SEQ_LENGTH
    )

    model = AutoModel.from_pretrained(
        model_name, output_hidden_states=True, torch_dtype=weights_dtype
    )
    model = model.to(DEVICE)
    model = model.eval()

    return tokenizer, model

def _embed_batch(batch, tokenizer, model):
    """Tokenize one batch, run the model, and mean-pool the last three layers."""
    enc = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SEQ_LENGTH
    )
    input_ids = enc["input_ids"].to(DEVICE)
    attention_mask = enc["attention_mask"].to(DEVICE)

    out = model(input_ids, attention_mask=attention_mask)

    # Pool in float32: the model may run in float16, and accumulating the
    # sum over the sequence axis in half precision loses accuracy.
    mask = attention_mask.unsqueeze(-1).to(torch.float32)
    lengths = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero

    layer_embs = []
    for idx in (-1, -2, -3):
        hidden = out.hidden_states[idx].to(torch.float32)
        # Padding positions are zeroed by the mask before averaging.
        layer_embs.append(((hidden * mask).sum(dim=1) / lengths).cpu())

    return torch.cat(layer_embs, dim=1).numpy()


def safe_extract_embeddings(sequences, tokenizer, model, batch_size):
    """Embed ``sequences`` in batches, halving the batch size on CUDA OOM.

    For each batch the attention-masked mean of the last three hidden-state
    layers is computed and the three pooled vectors are concatenated along
    the feature axis.

    When a batch runs out of CUDA memory, only that batch is retried with a
    smaller batch size; embeddings of batches that already succeeded are
    kept. The reduced batch size stays in effect for the remaining batches.

    Args:
        sequences: Sequence strings, sliced into batches in order.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Model whose output exposes ``hidden_states``.
        batch_size: Initial number of sequences per forward pass (>= 1).

    Returns:
        A 2-D NumPy array with one row per input sequence, rows in input
        order.

    Raises:
        ValueError: If ``sequences`` is empty or ``batch_size`` < 1.
        RuntimeError: If a single-sequence batch still runs out of CUDA
            memory, or for any non-OOM runtime error raised by the model.
    """
    total = len(sequences)
    if total == 0:
        raise ValueError("sequences is empty; nothing to embed")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embs = []
    start = 0
    with torch.no_grad():
        while start < total:
            batch = sequences[start:start + batch_size]
            hit_oom = False
            try:
                all_embs.append(_embed_batch(batch, tokenizer, model))
            except RuntimeError as e:
                if "CUDA out of memory" not in str(e):
                    raise
                hit_oom = True

            if hit_oom:
                # Recover *outside* the except block: while the exception is
                # being handled its traceback keeps the failed batch's
                # tensors alive, so the cache could not actually be freed.
                torch.cuda.empty_cache()
                if batch_size == 1:
                    raise RuntimeError(
                        "CUDA out of memory even with batch_size=1"
                    )
                print(f"OOM with batch_size={batch_size}. Reducing...")
                batch_size //= 2
                continue  # retry the same position with the smaller batch

            start += len(batch)

    torch.cuda.empty_cache()
    return np.vstack(all_embs)







Saving 10dataCombina.csv to 10dataCombina.csv
Extracting features...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

In [None]:
# ------------------ 主流程 ------------------
def main():
    """Upload a CSV in Colab, extract ESM-2 features, then save and download them.

    The first CSV column is read as the sequences and the second as the
    labels. Features are extracted with ``ESM2_MODEL``; if that attempt
    raises any exception, the primary model is released and
    ``FALLBACK_MODEL`` is used instead. The resulting features and labels
    are written to ``FEATURE_CACHE`` / ``LABEL_CACHE`` and offered for
    download.

    Raises:
        ValueError: If no file was uploaded.
    """
    import gc

    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No file was uploaded; expected a CSV file.")
    # Only the first uploaded file is used.
    csv_path = next(iter(uploaded))

    df = pd.read_csv(csv_path)
    sequences = df.iloc[:, 0].astype(str).tolist()
    labels = df.iloc[:, 1].values

    print("Extracting features...")
    tokenizer = model = None
    failure = None
    try:
        tokenizer, model = load_model(ESM2_MODEL)
        X = safe_extract_embeddings(sequences, tokenizer, model, INITIAL_BATCH_SIZE)
        print(f"Successfully used 3B model. Feature shape: {X.shape}")
    except Exception as e:
        # Keep only the message. Holding the exception would keep its
        # traceback (and the frames referencing the 3B model) alive.
        failure = str(e)

    if failure is not None:
        print(f"3B model failed: {failure}. Falling back to 150M model...")
        # Release the primary model before loading the fallback; otherwise
        # both models would occupy device memory at the same time.
        tokenizer = model = None
        gc.collect()
        torch.cuda.empty_cache()

        tokenizer, model = load_model(FALLBACK_MODEL)
        X = safe_extract_embeddings(sequences, tokenizer, model, 32)
        print(f"Used 150M model. Feature shape: {X.shape}")

    # NOTE(review): add_sequence_features is not defined in this section of
    # the notebook — presumably provided by another cell; confirm it is run
    # before main().
    X = add_sequence_features(sequences, X)

    # NOTE(review): FEATURE_CACHE is named after the 3B model even when the
    # fallback model produced the features.
    torch.save(torch.tensor(X), FEATURE_CACHE)
    np.save(LABEL_CACHE, labels)
    print(f"Features saved to {FEATURE_CACHE}")
    print(f"Labels saved to {LABEL_CACHE}")

    files.download(FEATURE_CACHE)
    files.download(LABEL_CACHE)
    print("Files downloaded to your local machine")

# Run the whole pipeline when this cell is executed.
main()