In [None]:
# 必要なライブラリをインポート
import torch
from transformers import GenerationConfig
from transformers import LlamaForCausalLM, LlamaTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = "stored_output/checkpoint-2epoch"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# モデルとトークナイザーの読み込み
model = LlamaForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map='auto',
    quantization_config=None
)
tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH)

model.eval() # 推論モード設定

Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.11s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): ModulesToSaveWrapper(
      (original_module): Embedding(32000, 4096, padding_idx=0)
      (modules_to_save): ModuleDict(
        (default): Embedding(32000, 4096, padding_idx=0)
      )
    )
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Lin

In [4]:
generation_config = GenerationConfig(
    temperature=0.2,    # 出力の多様性を制御（低いほど決定論的）
    top_k=40,          # トップKのトークンのみを考慮
    top_p=0.9,         # 累積確率がこの値を超えるまでのトークンを考慮
    do_sample=True,    # ランダムサンプリングを有効化
    num_beams=1,       # ビーム探索の数
    repetition_penalty=1.2,  # 繰り返しを抑制
    max_new_tokens=900      # 生成する最大トークン数
)

In [59]:
# 生成したいアミノ酸配列の条件付け
input_text = "[Generate by Substrate] Substrate=<copper(1+)>"
# トークン化
inputs = tokenizer(input_text, return_tensors="pt")

# テキスト生成
with torch.no_grad():
    outputs = model.generate(
        input_ids = inputs["input_ids"].to(device),
        attention_mask = inputs['attention_mask'].to(device),
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        generation_config = generation_config,
        return_dict_in_generate=True,
        output_attentions=False
    )

# 生成されたテキストをデコード
s = outputs["sequences"][0]
output = tokenizer.decode(s, skip_special_tokens=True)
output = output.replace("</s>", "")
print(output.replace(input_text, "").strip())

Seq=<MKQYNVTGMSCAACSARVEKAVSKVDGVSSCSVSLLTNSMGVEGDADPAAIIRAVEEAGYGAALKEAASNRTESEGESAHSDDSLKDTETPKIRNRLIWSIVFLIPLMYISMGHTMWGWPVPAFMQDNHLGMGLVQMLLTIIIMIINRKFFVSGSFKSLAHRAPNMDTLVALGSGTSFLYSITALLMTATAMTTHMAAEAGEDRVMYYEMTVVIFAIVVPLGHHWSKERAKTPANIAILGGIGTFLLSMGSTLPGNFTLMDFFSMMVTMNLPEVAIFVFEAVAQCIEDRGRRIEVRSFM>


In [61]:
# 予測したいアミノ酸配列
input_text = "[Determine Substrate] Seq=<MSQTISLALEGLSCGHCVKRVKEALEQRADVEQASVTQQEAQVTGSADAAALIATVEAAGYHATLSGDRHPKPEPLTASEPPPEALTTVAETQPAQQSDAAQFLLIEGMSCASCVSRVEKALEKVPGVTQARVNLGERSALVMGDAAASQLVEAVEAAGYQAQPVQDEQERRDKQQSSARRAMRRFSWQAALALALGAPLMVWGMLGDNMMLSDDNRTLWLVIGGVTLLVMIVAGGHFFRSAWRSLRNGSATMDTLVALGTAAAWLYSFSVALWPDFFPMQARHLYFEASVMIIGLINLGHALEQRARQRSSKALERLLDLTPAQARLIDEQGEMLVPLSAVRPGMTLRLTTGDRVPVDGEIKDGEAWLDEAMLTGEAVPQSKRAGDKVYAGTLVQDGAVRFVARATGEQTALARIIYLVRQAQSSKPDVGRLADRISAVFVPVVVAIALISAAIWYLFGPQPQIAYTLVVATTVLIIACPCALGLATPMSIIAGVGRAAELGVLVRDADALQRASRIDTLVFDKTGTLTTGRPQVDEVLSWNGASRESVLQHAAALEKNASHPLAQAIVAAAETANHHEIAQFRTIRGKGVSGILDGKPLLLGNAALMQDNQIALHDAQSDIERLSRQGATPVLLAQNGELVGLIALRDGLRPESQAALHRLRQSGYRLMMLTGDREETARAIAQQAGIDEVIAGVLPEAKAQAVARLQQQGRRVAMVGDGINDAPALAQADVGIAMGGGSDVAVETAPITLMRADLNSVADALALASATLRNIRQNLLGAFIYNSLGIPLAAGALYPLTGALLSPVVAGAAMALSSITVVSNANRLLRYRPTRDDRR>"
# トークン化
inputs = tokenizer(input_text, return_tensors="pt")

# テキスト生成
with torch.no_grad():
    outputs = model.generate(
        input_ids = inputs["input_ids"].to(device),
        attention_mask = inputs['attention_mask'].to(device),
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        generation_config = generation_config,
        return_dict_in_generate=True,
        output_attentions=False
    )

# 生成されたテキストをデコード
s = outputs["sequences"][0]
output = tokenizer.decode(s, skip_special_tokens=True)
output = output.replace("</s>", "")
print(output.replace(input_text, "").strip())

Substrate=<copper(1+)>


In [56]:
from Bio.PDB import PDBParser

def get_average_plddt(pdb_file):
    parser = PDBParser()
    structure = parser.get_structure("protein", pdb_file)
    plddts = []
    for model in structure:
        for chain in model:
            for residue in chain:
                if "CA" in residue:  # CA原子が存在する場合(アミノ酸残基)
                    plddts.append(residue["CA"].get_bfactor()) # B-factorがpLDDT
    return sum(plddts) / len(plddts)

predicted_pdb_file = "esmfold_1742705160441.pdb"  # AlphaFold2 の出力 PDB ファイル
avg_plddt = get_average_plddt(predicted_pdb_file)
print(f"Average pLDDT (generated): {avg_plddt:.3f}")

Average pLDDT (generated): 0.851
