<a href="https://colab.research.google.com/github/zhouyh56/Scripts/blob/main/BIAP_Step1_2.5_PRO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# ==============================================================================
# BIAP - Step 1: Generate Job Specification JSON (Final Version)
# ==============================================================================
#
# SPECIFICATIONS:
# - requests version is pinned to 2.32.4.
# - The full, detailed prompt for the LLM is written out explicitly.
# - Uses 'GOOGLE_API_KEY' as the secret name.
#
# INSTRUCTIONS:
# 1. Ensure your secret key is named 'GOOGLE_API_KEY' in Colab Secrets (🔑 icon).
# 2. Run this cell.
# ==============================================================================

# Step 0: Install specified and necessary libraries
# As requested, pinning requests to version 2.32.4
!pip install -q "requests==2.32.4"
!pip install -q -U "google-generativeai"

# Step 1: Import all necessary modules
import requests
import json
import textwrap
import os
import time
import google.generativeai as genai
from google.colab import userdata

In [11]:
# Import the Colab helper used to mount Google Drive.
from google.colab import drive

# --- Mount Google Drive ---
# The drive is mounted at '/content/drive'; OUTPUT_DIR (defined in a later cell)
# lives underneath this mount point.
try:
    print("[*] 正在请求挂载您的Google Drive...")
    drive.mount('/content/drive')
    print("✔️ Google Drive 挂载成功！")
except Exception as e:
    print(f"🚨 错误：无法挂载Google Drive。{e}")
    # Without a mounted Drive the job file cannot be saved later, so stop execution here.
    raise SystemExit("停止执行，因为Google Drive挂载失败。")

[*] 正在请求挂载您的Google Drive...
Mounted at /content/drive
✔️ Google Drive 挂载成功！


In [None]:
# --- API key configuration ---
# NOTE(review): the same configuration code is repeated at the top of the
# following cell; one of the two copies is likely redundant.
try:
    # Read the key from Colab Secrets and register it with the google.generativeai client.
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
    print("✅ Google API key configured successfully.")
except Exception as e:
    print("🚨 错误：无法配置Google API。")
    print("请确保您已在Colab Secrets (🔑) 中正确设置了名为'GOOGLE_API_KEY'的密钥。")
    # Re-raise as ValueError, chaining the original exception so its traceback is preserved.
    raise ValueError("API Key not found or configured incorrectly.") from e

In [54]:
# --- API key configuration ---
# NOTE(review): this duplicates the standalone API-key cell directly above;
# keeping a single copy would avoid the two drifting apart.
try:
    # Read the key from Colab Secrets and register it with the google.generativeai client.
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
    print("✅ Google API key configured successfully.")
except Exception as e:
    print("🚨 错误：无法配置Google API。")
    print("请确保您已在Colab Secrets (🔑) 中正确设置了名为'GOOGLE_API_KEY'的密钥。")
    # Re-raise as ValueError, chaining the original exception so its traceback is preserved.
    raise ValueError("API Key not found or configured incorrectly.") from e

# --- Module 1: Inputs and project management ---
# UniProt accessions of the two proteins whose interaction is analyzed.
PROTEIN_A_ID = "P16615"
PROTEIN_B_ID = "O95140"

# Job specification files are written to this folder inside the mounted Google Drive.
OUTPUT_DIR = "/content/drive/MyDrive/BIAP_Jobs"
if not os.path.exists(OUTPUT_DIR):
    print(f"[*] 目录 '{OUTPUT_DIR}' 不存在，正在创建...")
# exist_ok=True keeps the cell safe to re-run and avoids failing if the directory
# appears between the existence check above and the creation call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The Unix timestamp distinguishes repeated runs on the same protein pair.
PROJECT_ID = f"{PROTEIN_A_ID}_{PROTEIN_B_ID}_{int(time.time())}"
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, f"{PROJECT_ID}_spec.json")

# --- 模块二: 信息聚合引擎 ---
def fetch_uniprot_data(uniprot_id, timeout=30):
    """Download one UniProtKB entry from the UniProt REST API as parsed JSON.

    Args:
        uniprot_id: UniProt accession used to build the request URL (e.g. "P16615").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` may block indefinitely on an unresponsive server.

    Returns:
        The decoded JSON body of the response, or None if the request failed
        (connection error, timeout, or non-success HTTP status).
    """
    print(f"[*] Fetching data for {uniprot_id} from UniProt...")
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are reported below.
        response.raise_for_status()
        print(f"✔️  Successfully fetched data for {uniprot_id}.")
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"[!] ERROR: Could not fetch data for {uniprot_id}. {e}")
        return None

def summarize_protein_info(data):
    """
    Condense a parsed UniProtKB JSON entry into the fields used to build the LLM prompt.

    Args:
        data: Parsed UniProtKB entry (dict), or a falsy value when the download failed.

    Returns:
        None when `data` is falsy; otherwise a dict with the keys
        'id', 'sequence', 'function', 'localization' (list of location names),
        'domains' (list of {'name', 'start', 'end'}) and 'interaction_data'
        ({'subunit_text', 'binary_interactors'}).
    """
    if not data:
        return None

    def first_text(comment, default):
        # A comment may carry a missing or empty 'texts' list; fall back to `default`
        # instead of raising IndexError/TypeError.
        texts = comment.get('texts') or [{}]
        return texts[0].get('value', default)

    summary = {
        "id": data.get('primaryAccession'),
        "sequence": data.get('sequence', {}).get('value'),
        "function": "N/A",
        "localization": [],
        "domains": [],
        "interaction_data": {
            "subunit_text": "N/A",
            "binary_interactors": []
        }
    }

    # NOTE(review): when an entry has several FUNCTION, SUBCELLULAR LOCATION or
    # SUBUNIT comments, the last one of each type wins (unchanged behavior).
    for c in data.get('comments', []):
        comment_type = c.get('commentType')
        if comment_type == 'FUNCTION':
            summary['function'] = first_text(c, 'N/A')
        if comment_type == 'SUBCELLULAR LOCATION':
            locations = []
            for entry in c.get('subcellularLocations') or []:
                # Skip entries that lack a named location instead of raising KeyError.
                value = (entry.get('location') or {}).get('value')
                if value:
                    locations.append(value)
            summary['localization'] = locations
        if comment_type == 'SUBUNIT':
            summary['interaction_data']['subunit_text'] = first_text(c, 'Not specified.')
        if comment_type == 'INTERACTION':
            for interaction in c.get('interactions') or []:
                # The UniProt REST JSON names the binding partner 'interactantTwo'
                # ('interactantOne' is the entry side of the pair). The previously
                # used 'interactant' key is kept as a fallback.
                partner = interaction.get('interactantTwo') or interaction.get('interactant') or {}
                interactor_id = partner.get('uniProtKBAccession')
                if interactor_id:
                    summary['interaction_data']['binary_interactors'].append(interactor_id)

    for f in data.get('features', []):
        if f.get('type') in ['Domain', 'Region', 'Coiled coil']:
            location = f.get('location') or {}
            summary['domains'].append({
                "name": f.get('description'),
                "start": (location.get('start') or {}).get('value'),
                "end": (location.get('end') or {}).get('value'),
            })
    return summary


# --- 模块三: 真实的Gemini API调用 ---
def _extract_first_json_object(text):
    """Parse the JSON value that begins at the first '{' in `text`.

    Returns:
        (parsed_value, raw_json_text), or (None, None) when `text` contains no '{'.

    Raises:
        json.JSONDecodeError: the text starting at that '{' is not valid JSON.
    """
    start = text.find('{')
    if start == -1:
        return None, None
    # raw_decode stops at the end of the first complete JSON value, so anything
    # the model appends afterwards (closing code fence, commentary) is ignored.
    parsed, end = json.JSONDecoder().raw_decode(text, start)
    return parsed, text[start:end]


def generate_hypothesis_with_gemini(protein_a_summary, protein_b_summary, model_name='gemini-2.5-pro'):
    """Ask a Gemini model for ranked interaction-domain hypotheses for two proteins.

    Args:
        protein_a_summary: Summary dict for protein A; the keys 'id', 'function',
            'localization', 'domains' and 'interaction_data' are read.
        protein_b_summary: Summary dict for protein B with the same keys.
        model_name: Name passed to genai.GenerativeModel; defaults to the model
            that was previously hard-coded.

    Returns:
        The JSON object parsed from the model's reply, or None when the API call
        fails, the reply contains no JSON object, or the JSON cannot be decoded.
    """
    model = genai.GenerativeModel(model_name)

    # The full prompt is written out explicitly. Doubled braces are literal
    # braces in the f-string (they form the JSON schema shown to the model).
    prompt = f"""System Role: You are an expert computational structural biologist. Your task is to analyze two proteins based on the provided data and propose the most likely interacting domains for structure prediction.

Input Data:
---
Protein A: {protein_a_summary['id']}
Function: {textwrap.shorten(protein_a_summary['function'], width=300)}
Subcellular Localization: {', '.join(protein_a_summary['localization'])}
Known Domains: {json.dumps(protein_a_summary['domains'], indent=2)}
Interaction Data: {json.dumps(protein_a_summary['interaction_data'], indent=2)}
---
Protein B: {protein_b_summary['id']}
Function: {textwrap.shorten(protein_b_summary['function'], width=300)}
Subcellular Localization: {', '.join(protein_b_summary['localization'])}
Known Domains: {json.dumps(protein_b_summary['domains'], indent=2)}
Interaction Data: {json.dumps(protein_b_summary['interaction_data'], indent=2)}
---
Instructions:
1. Direct Analysis: First, analyze the domains and functions of Protein A and B directly to find evidence of interaction.
2. Interaction Network Analysis:
    a. Look at Protein A's "Interaction" in UniProt Database. For each known interactor, consider its properties.
    b. Domain Analogy: Does Protein B have any of the same domains as Protein A's known interactors? If so, this is strong evidence they might bind in a similar way. Search every Protein A's known interactors.
    c. Partner Analogy: Does Protein B's name or function appear directly in Protein A's "Subunit" description or "Binary Interactors" list in UniProt Database? This is very strong evidence. Search every Protein A's known subunit and interactors.
3. Synthesize all evidence from Direct and Network analysis to form your final hypotheses. In your `rationale` text, explicitly state what evidence you used (e.g., "Direct domain match", "Domain analogy with interactor XXXX", "Direct partner listing in Subunit data").
4. Analyze the subcellular localization for both proteins to confirm they can physically meet.
5. Based on the function annotations, domain types (e.g., coiled-coils are common interaction motifs), and literature context, deduce which specific domains are the most likely to be involved in a direct physical interaction.
6. Generate a ranked list of up to 2 interaction hypotheses, from most likely to least likely.
7. For each hypothesis, provide the specific protein ID and the start and end residue numbers for the proposed interacting region.
8. For each hypothesis, provide a confidence score. This score should include a descriptive category ("High", "Medium", or "Low") and a numerical value between 0.0 (lowest confidence) and 1.0 (highest confidence). The score should reflect your certainty based on the provided evidence (e.g., direct literature mentions, strong domain function correlation, etc.).
9. IMPORTANT: Format your entire response as a single, clean JSON object. The JSON object MUST include a key named 'interaction_hypotheses' containing the list of hypotheses. Do not include any extra text, explanations, or markdown formatting like ```json before or after the JSON block.
10. The JSON object MUST strictly follow this exact structure, including all specified key names:
{{
  "interaction_hypotheses": [
    {{
      "hypothesis_rank": integer,
      "protein_A_interaction_domain": {{
        "id": "string",
        "name": "string",
        "start": integer,
        "end": integer
      }},
      "protein_B_interaction_domain": {{
        "id": "string",
        "name": "string",
        "start": integer,
        "end": integer
      }},
      "confidence": {{
        "category": "High | Medium | Low",
        "score": float
      }},
      "rationale": "A brief text explaining the reasoning for this specific hypothesis."
    }}
    ]
  }}
"""

    print("\n" + "="*50)
    print(f"--- Calling Gemini API with model '{model_name}'... ---")

    try:
        response = model.generate_content(prompt)
        response_text = response.text
        print("--- Raw response received from Gemini API ---")
        print(response_text)
        print("="*50 + "\n")

        # The model may wrap its answer in a markdown fence despite the
        # instructions, so locate and parse the JSON object inside the reply.
        analysis, raw_json = _extract_first_json_object(response_text)
        if raw_json is None:
            print("[!] Failure: No valid JSON object found in the Gemini response.")
            return None

        print("--- DEBUG: Extracted JSON block for parsing ---")
        print(raw_json)
        print("="*50 + "\n")
        return analysis
    except json.JSONDecodeError as e:
        print(f"[!] Failure: Could not decode JSON from the response. Error: {e}")
        return None
    except Exception as e:
        # Best-effort: any API-side error is reported and turned into None for the caller.
        print(f"[!] An error occurred during the Gemini API call: {e}")
        return None


# --- 模块四 (修改后): 创建任务描述JSON, 并返回布尔值 ---
def _extract_fragment(sequence, region):
    """Return the residues of `sequence` covered by `region` (1-based, inclusive).

    Raises:
        ValueError: 'start'/'end' are not integers, or do not satisfy
            1 <= start <= end <= len(sequence).
        KeyError / TypeError: `region` lacks the keys or `sequence` is not sized.
    """
    start, end = region['start'], region['end']
    for value in (start, end):
        if isinstance(value, bool) or not isinstance(value, int):
            raise ValueError(f"region coordinates must be integers, got {start!r}-{end!r}")
    if not 1 <= start <= end <= len(sequence):
        raise ValueError(
            f"region {start}-{end} is outside the sequence (length {len(sequence)})"
        )
    return sequence[start - 1:end]


def create_job_specification_json(analysis_result, protein_a_summary, protein_b_summary, output_path):
    """
    Build the job specification from the top-ranked hypothesis and write it as JSON.

    Args:
        analysis_result: Parsed LLM reply; must be a dict with a non-empty
            'interaction_hypotheses' list.
        protein_a_summary: Summary dict for protein A ('id' and 'sequence' are read).
        protein_b_summary: Summary dict for protein B ('id' and 'sequence' are read).
        output_path: Destination path of the JSON file.

    Returns:
        True on success, False on failure (invalid hypothesis structure,
        residue ranges that do not fit the sequences, or a file write error).
    """
    print("[*] Attempting to create Job Specification JSON...")
    print(f"--- DEBUG: Received analysis_result of type: {type(analysis_result)} ---")

    try:
        if not isinstance(analysis_result, dict) or 'interaction_hypotheses' not in analysis_result or not analysis_result.get('interaction_hypotheses'):
            print("[!] Failure: Gemini response did not contain a valid 'interaction_hypotheses' list.")
            return False

        # Only the first (highest-ranked) hypothesis is turned into a job.
        top_hypothesis = analysis_result['interaction_hypotheses'][0]
        region_a = top_hypothesis['protein_A_interaction_domain']
        region_b = top_hypothesis['protein_B_interaction_domain']

        # The coordinates come from the LLM: validate them so that a zero, reversed
        # or out-of-range span is reported instead of silently producing an empty
        # or wrong fragment in the FASTA.
        fragment_a = _extract_fragment(protein_a_summary['sequence'], region_a)
        fragment_b = _extract_fragment(protein_b_summary['sequence'], region_b)

        fasta_content = (f">{region_a['id']}|Chain_A\n{fragment_a}\n"
                         f">{region_b['id']}|Chain_B\n{fragment_b}\n")
        job_spec = {
            "job_id": PROJECT_ID,
            "protein_A": protein_a_summary['id'],
            "protein_B": protein_b_summary['id'],
            "llm_hypothesis": analysis_result,
            "alphafold_input": {"fasta_content": fasta_content, "model_preset": "multimer"},
            "metadata": {"creation_date": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())},
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(job_spec, f, indent=4)
        print(f"✔️  Job Specification JSON created successfully at: {output_path}")
        return True
    except (KeyError, IndexError, TypeError, ValueError) as e:
        print(f"[!] Failure: Could not process the parsed JSON. It might be malformed. Error: {e}")
        return False
    except IOError as e:
        print(f"[!] Failure: Could not write file to disk. Error: {e}")
        return False



# --- 主程序 (修改后) ---
def main():
    """Run the pipeline: fetch both UniProt entries, summarize them, query Gemini, write the job JSON.

    Prints a final success or failure message; returns None in every case.
    """
    # Both entries are always requested, even if the first download fails.
    raw_entries = [fetch_uniprot_data(accession) for accession in (PROTEIN_A_ID, PROTEIN_B_ID)]
    if not all(raw_entries):
        print("\n❌ 失败：未能获取一个或多个蛋白质的初始数据。")
        return

    protein_a_summary, protein_b_summary = map(summarize_protein_info, raw_entries)

    analysis_result = generate_hypothesis_with_gemini(protein_a_summary, protein_b_summary)
    if not analysis_result:
        # The Gemini call itself produced nothing usable.
        print("\n❌ 失败：未能从Gemini获取有效的分析结果。❌")
        return

    # Try to write the job file; the boolean result decides the final message.
    written = create_job_specification_json(
        analysis_result,
        protein_a_summary,
        protein_b_summary,
        OUTPUT_JSON_PATH,
    )
    if not written:
        print("\n❌ 失败：未能生成可用于AlphaFold的JSON任务文件。❌")
        print("   请检查上面的日志，查看Gemini的响应是否有效或是否存在文件写入权限问题。")
        return

    print("\n🎉 成功：可用于AlphaFold的JSON任务文件已生成！🎉")
    print("   下一步：使用 'biap_run_job.py' 脚本来执行此文件。")

# Entry point: run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

✅ Google API key configured successfully.
[*] Fetching data for P16615 from UniProt...
✔️  Successfully fetched data for P16615.
[*] Fetching data for O95140 from UniProt...
✔️  Successfully fetched data for O95140.

--- Calling Gemini API with model 'gemini-2.5-pro'... ---
--- Raw response received from Gemini API ---
```json
{
  "interaction_hypotheses": [
    {
      "hypothesis_rank": 1,
      "protein_A_interaction_domain": {
        "id": "P16615",
        "name": "Interaction with TMEM64 and PDIA3",
        "start": 788,
        "end": 1042
      },
      "protein_B_interaction_domain": {
        "id": "O95140",
        "name": "Part of a helix bundle domain, formed by helices from N-terminal and C-terminal regions",
        "start": 722,
        "end": 753
      },
      "confidence": {
        "category": "High",
        "score": 0.95
      },
      "rationale": "Strong evidence supports this hypothesis based on subcellular colocalization and function. Protein A (SERCA2) is on