# Annotate features for proteins.

## Step1 Annotate pLI and pLoF score for valid proteins. 

### Step1.1 Extract all protein molecules from a filtered LPPI

In [1]:
import pandas as pd

inter_file = 'lppi_with_valid_lnc.csv'
inter = pd.read_csv(inter_file)

# Concatenate two columns into a new Series and remove duplicates
molecule = pd.concat([inter['Node_i'], inter['Node_j']]).reset_index(drop=True)
molecule_df = pd.DataFrame(molecule, columns=['molecule'])
molecule_df = molecule_df.drop_duplicates()

protein_file = '../../data/LPPI/human/protein.csv'
proteins = pd.read_csv(protein_file)

proteins = proteins[proteins['protein_id'].isin(molecule_df['molecule'])]

# Export to CSV file
protein_file = 'proteins.csv'
proteins.to_csv(protein_file, index=False)


### Step1.2 Annotate pLI score for valid proteins.

In [None]:
import pandas as pd

valid_protein = pd.read_csv("proteins.csv", dtype=str)
pLoF = pd.read_csv("../../omics/protein/human/pLoF_v2.txt", sep='\t')

# Filtering required columns from pLoF
pLoF = pLoF[['gene', 'obs_lof', 'exp_lof', 'oe_lof', 'lof_z', 'pLI']]
#pLoF = pLoF[['gene', 'lof.obs', 'lof.exp', 'lof.oe', 'lof.z_score', 'lof.pLI']]

# Merging on 'protein' column from valid_protein and 'gene' column from pLoF
protein_pLI = pd.merge(valid_protein, pLoF, left_on='protein', right_on='gene', how='inner')
protein_pLI = protein_pLI.drop(columns=['gene', 'protein'])

# Extract rows containing NaN values in protein_pLI
na_score_protein = protein_pLI[protein_pLI.isna().any(axis=1)]
na_score_protein = na_score_protein[['protein_id']]

protein_pLI_cleaned = protein_pLI.dropna()
protein_invalid_score = valid_protein[~valid_protein['protein_id'].isin(protein_pLI_cleaned['protein_id'])]

pLI_means = protein_pLI_cleaned.groupby('protein_id').mean().reset_index()

# Save results
pLI_means.to_csv('protein_annotation.csv', index=False)
na_score_protein.to_csv('NA_score_protein.csv', index=False)  # Saving rows with NaN values
protein_invalid_score.to_csv('invalid_score_protein.csv', index=False)  # Saving rows with NaN values


### Step1.3 Convert oe_lof&lof_z to p-value and get log.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import poisson, norm

# Load the annotation
df = pd.read_csv("protein_annotation.csv")

# Calculate p-value for oe_lof using the Poisson distribution
df["oe_lof_pval"] = df.apply(lambda row: poisson.cdf(row['obs_lof'], row['exp_lof']), axis=1)

# Calculate p-value for lof_z using a two-tailed Z-test
df["lof_z_pval"] = df["lof_z"].apply(lambda x: 2 * (1 - norm.cdf(abs(x))) if pd.notna(x) else np.nan)

# Apply a log10 transformation to p-values with a small number adjustment
df["log_oe_lof_pval"] = df["oe_lof_pval"].apply(lambda x: np.log10(x + 1e-10))
df["log_lof_z_pval"] = df["lof_z_pval"].apply(lambda x: np.log10(x + 1e-10))
df["log_pLI"] = df["pLI"].apply(lambda x: np.log10(x) if pd.notna(x) and x > 0 else np.nan)

# Select columns to keep
df = df[['protein_id', "log_oe_lof_pval", "log_lof_z_pval", "log_pLI"]]

# Save the transformed annotation
df.to_csv("transformed_protein_annotation.csv", index=False)


### Step1.4 Delete interaction with protein which have NA pLI score.

In [2]:
import pandas as pd

inter = pd.read_csv("inter_with_valid_lnc.csv")
invalid_protein = pd.read_csv("invalid_score_protein.csv")
NA_protein = pd.read_csv("NA_score_protein.csv")

inter = inter[~inter['Node_i'].isin(invalid_protein['protein_id'])]
inter = inter[~inter['Node_j'].isin(invalid_protein['protein_id'])]

inter = inter[~inter['Node_i'].isin(NA_protein['protein_id'])]
inter = inter[~inter['Node_j'].isin(NA_protein['protein_id'])]

inter.columns = ['source', 'target', 'weight']
inter.to_csv("valid_inter.csv", index=False)

tissues = ['heart','lung','stomach']
for tissue in tissues:
	lnc = pd.read_csv(f"{tissue}_annotation.csv")
	lnc = lnc[lnc['lncRNA_id'].isin(inter['Node_i'])]

	lnc.to_csv(f"valid_{tissue}_annotation.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'inter_with_valid_lnc.csv'

GO数量+直系同源基因数量+表达量做特征

根据蛋白质名称获取ensembl id(from BED file)

In [3]:
import pandas as pd
import os
import re

# Set the directory containing ensembl data
ensembl_dir = "../../reference_lncRNA/human/bed/ensembl/"

# Read protein_id and protein_name information
proteins = pd.read_csv('./proteins.csv')

# Initialize remaining protein list
remained_protein = proteins[['protein_id', 'protein']].copy()
results = []

# Get genomic position by ensembl_id & gene_name
# **STEP 1: Extract the version number from BED file**
def extract_version(filename):
    match = re.search(r'GRCh38\.(\d+)\.bed', filename) # ensembl
    return int(match.group(1)) if match else -1

# **STEP 2: Get all BED files and sort them by version number**
bed_files = [f for f in os.listdir(ensembl_dir) if f.endswith(".bed")]
bed_files_sorted = sorted(bed_files, key=extract_version, reverse=True)  # Sort by version number in descending order

# **STEP 3: Iterate over sorted BED files**
for bed_file in bed_files_sorted:
    bed_path = os.path.join(ensembl_dir, bed_file)

    # Read ensembl BED file
    ensembl_bed = pd.read_csv(bed_path, sep='\t', header=None, 
                              names=['chr', 'start', 'end', 'gene_name', 'ensembl_id', 'strand'])

    # Match by gene_id
    ensembl_id_map = pd.merge(remained_protein, 
                               ensembl_bed[['gene_name', 'ensembl_id']], 
                               left_on='protein',
                               right_on='gene_name', how='inner')
    results.append(ensembl_id_map)
    remained_protein = remained_protein[~remained_protein['protein_id'].isin(ensembl_id_map['protein_id'])]

# Combine all results
pro_ens = pd.concat(results, ignore_index=True).drop_duplicates(subset=['protein_id'])

# Save remaining proteins without genomic positions
remained_protein.drop_duplicates().to_csv('pro_no_id.csv', index=False)

# Generate CSV files
pro_ens[['protein', 'protein_id', 'ensembl_id']].to_csv('pro_ens_msp.csv', index=False)
print("Processing complete. Results saved.")


Processing complete. Results saved.


获取GO数量 √

In [6]:
"""
Fetch GO annotations from Ensembl BioMart by Ensembl Gene IDs (from input column `ensembl_id`)
and produce TOTAL unique GO counts per protein_id.

Input CSV  (must contain columns): protein_id, protein, ensembl_id
Output CSV (columns): protein_id, go_count
"""

from __future__ import annotations
import io
import re
import time
from pathlib import Path
from typing import Dict, List

import pandas as pd
import requests

# ===================== User settings =====================
INPUT_CSV  = "pro_ens_msp.csv"          # must have columns: protein_id, protein, ensembl_id
OUTPUT_CSV = "pro_go_counts.csv"

BIOMART_URL = "https://www.ensembl.org/biomart/martservice"

# Datasets per species (extend as needed)
DATASET_FOR_PREFIX: Dict[str, str] = {
    "ENSG":   "hsapiens_gene_ensembl",   # human
    "ENSMUSG":"mmusculus_gene_ensembl",  # mouse
}

BATCH_SIZE = 200
TIMEOUT    = 60
RETRY      = 3
BACKOFF    = 0.6
# ========================================================

SESSION = requests.Session()  # reuse HTTP connections


def clean_ensembl_id(x: str) -> str:
    """Strip spaces and remove version suffix (e.g., ENSG... .16 -> ENSG...)."""
    if not isinstance(x, str):
        return ""
    x = x.strip()
    if not x:
        return ""
    return re.sub(r"\.\d+$", "", x)


def detect_dataset(ensembl_id: str) -> str | None:
    """Choose BioMart dataset by ID prefix (ENSG/ENSMUSG)."""
    for pref, ds in DATASET_FOR_PREFIX.items():
        if ensembl_id.startswith(pref):
            return ds
    return None


def build_query_xml(dataset: str, ensg_list: List[str]) -> str:
    """Build BioMart XML for TSV output: ensembl_gene_id, external_gene_name, go_id."""
    values_xml = "".join(f"<Value>{gid}</Value>" for gid in ensg_list if gid)
    return f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" datasetConfigVersion="0.6">
  <Dataset name="{dataset}" interface="default">
    <Filter name="ensembl_gene_id" excluded="0">
      {values_xml}
    </Filter>
    <Attribute name="ensembl_gene_id"/>
    <Attribute name="external_gene_name"/>
    <Attribute name="go_id"/>
  </Dataset>
</Query>
"""


def post_biomart(xml_query: str) -> str:
    """POST XML to BioMart and return TSV text with simple retry/backoff."""
    params = {"query": xml_query}
    last_exc = None
    for i in range(RETRY):
        try:
            r = SESSION.post(BIOMART_URL, data=params, timeout=TIMEOUT)
            if r.status_code == 200 and r.text is not None:
                return r.text
        except requests.RequestException as e:
            last_exc = e
        time.sleep(BACKOFF * (i + 1))
    if last_exc:
        raise last_exc
    raise RuntimeError("BioMart request failed")


# ---------- Load input ----------
df_in = pd.read_csv(INPUT_CSV, dtype=str)
required_cols = {"protein_id", "protein", "ensembl_id"}
missing = required_cols - set(df_in.columns)
if missing:
    raise ValueError(f"Input CSV missing required columns: {sorted(missing)}")

# Normalize Ensembl IDs (strip + remove version)
df_in["ensembl_id_clean"] = df_in["ensembl_id"].fillna("").astype(str).map(clean_ensembl_id)

# Build protein_id -> ensembl_id mapping
# If a protein_id appears multiple times with different non-empty Ensembl IDs,
# we keep the first non-empty and warn.
grp = df_in.groupby("protein_id", dropna=False)["ensembl_id_clean"].apply(
    lambda s: [x for x in s if x]
).rename("ensembl_ids_nonempty")

protein_to_ensembl: Dict[str, str] = {}
ambiguous = []
for pid, id_list in grp.items():
    if not id_list:
        protein_to_ensembl[pid] = ""  # no ID available
    else:
        uniq = sorted(set(id_list))
        protein_to_ensembl[pid] = uniq[0]
        if len(uniq) > 1:
            ambiguous.append((pid, uniq))

if ambiguous:
    print(f"[WARN] {len(ambiguous)} protein_id have multiple distinct Ensembl IDs. "
          f"Using the first after sorting. Example: {ambiguous[0]}")

# Unique Ensembl IDs to query
unique_ids = sorted({eid for eid in protein_to_ensembl.values() if eid})

# Group IDs by dataset
groups: Dict[str, List[str]] = {}
unrecognized = []
for eid in unique_ids:
    ds = detect_dataset(eid)
    if ds:
        groups.setdefault(ds, []).append(eid)
    else:
        unrecognized.append(eid)

if unrecognized:
    print(f"[WARN] Unrecognized Ensembl ID prefixes (not queried): {len(unrecognized)} "
          f"examples: {unrecognized[:5]}  (extend DATASET_FOR_PREFIX if needed)")

# ---------- Query BioMart ----------
parts = []
for dataset, id_list in groups.items():
    for i in range(0, len(id_list), BATCH_SIZE):
        batch = id_list[i:i + BATCH_SIZE]
        xml = build_query_xml(dataset, batch)
        tsv = post_biomart(xml)
        if not tsv or not tsv.strip():
            continue
        part = pd.read_csv(
            io.StringIO(tsv),
            sep="\t",
            header=None,
            names=["ensembl_id", "external_gene_name", "go_id"],
            dtype=str,
        )
        parts.append(part)

ann = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(
    columns=["ensembl_id", "external_gene_name", "go_id"], dtype=str
)
ann = ann.fillna("")
ann = ann[(ann["ensembl_id"] != "") & (ann["go_id"] != "")]

# Unique GO per Ensembl ID
ann_unique = ann.drop_duplicates(subset=["ensembl_id", "go_id"])
go_counts_by_eid = (
    ann_unique.groupby("ensembl_id")["go_id"].nunique().rename("go_count")
)

# ---------- Map back to protein_id ----------
out = pd.DataFrame({"protein_id": list(protein_to_ensembl.keys())})
out["ensembl_id"] = out["protein_id"].map(protein_to_ensembl)

out = out.merge(go_counts_by_eid, left_on="ensembl_id", right_index=True, how="left")
out["go_count"] = out["go_count"].fillna(0).astype(int)

# Only keep required columns
out = out[["protein_id", "go_count"]]

# Save
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"[DONE] Wrote {len(out)} rows to {OUTPUT_CSV}")
print(f"[INFO] Datasets used: {', '.join(groups.keys()) or 'None'} | "
      f"Unrecognized Ensembl IDs: {len(unrecognized)} | "
      f"Proteins without Ensembl ID: {sum(1 for v in protein_to_ensembl.values() if not v)}")


ConnectTimeout: HTTPSConnectionPool(host='www.ensembl.org', port=443): Max retries exceeded with url: /biomart/martservice (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fac311ebb10>, 'Connection to www.ensembl.org timed out. (connect timeout=60)'))

获取直系同源等位基因的数量   √

In [None]:
import requests
import pandas as pd
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration parameters
INPUT_FILE = "protein_name.txt"
OUTPUT_FILE = "all_species_ortholog_counts.csv"
THREADS = 5
MAX_RETRIES = 3
SLEEP_BETWEEN = 0.4
HEADERS = {"Content-Type": "application/json"}

def count_orthologs_by_symbol(symbol):
    url = f"https://rest.ensembl.org/homology/symbol/homo_sapiens/{symbol}?type=orthologues"
    for attempt in range(MAX_RETRIES):
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            if r.status_code == 200:
                data = r.json()
                homologies = data['data'][0].get('homologies', [])
                count = sum(1 for h in homologies if h['type'] == 'ortholog_one2one')
                return (symbol, count, "OK")
            elif r.status_code == 404:
                return (symbol, 0, "NotFound")
            else:
                print(f"[{symbol}] HTTP {r.status_code}")
        except Exception as e:
            print(f"[{symbol}] error: {e}")
        time.sleep(SLEEP_BETWEEN)
    return (symbol, 0, "Error")

def load_symbols():
    with open(INPUT_FILE) as f:
        return [line.strip() for line in f if line.strip()]

def load_existing_results():
    if not os.path.exists(OUTPUT_FILE):
        return set()
    df = pd.read_csv(OUTPUT_FILE)
    return set(df['GeneSymbol'].values)

def save_result(symbol, count, status):
    with open(OUTPUT_FILE, "a") as f:
        f.write(f"{symbol},{count},{status}\n")

def main():
    all_symbols = load_symbols()
    done_symbols = load_existing_results()
    symbols_to_query = [s for s in all_symbols if s not in done_symbols]

    print(f"\n Total genes: {len(all_symbols)}")
    print(f" Already processed: {len(done_symbols)}")
    print(f" Pending: {len(symbols_to_query)}")
    print(f" Starting concurrent queries with {THREADS} threads\n")

    with open(OUTPUT_FILE, "a") as f:
        if os.stat(OUTPUT_FILE).st_size == 0:
            f.write("GeneSymbol,OrthologCount,Status\n")

    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        future_to_symbol = {
            executor.submit(count_orthologs_by_symbol, symbol): symbol
            for symbol in symbols_to_query
        }

        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                result = future.result()
                save_result(*result)
                print(f" {result[0]} → {result[1]} orthologs")
            except Exception as exc:
                print(f" {symbol} exception: {exc}")
            time.sleep(SLEEP_BETWEEN)

    print(f"\n Query complete! Results saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


获取表达量

In [None]:
# save as: fetch_expression_gtex.py
# usage:
#   python fetch_expression_gtex.py --map mapping.csv --out-prefix expr
#
# Outputs:
#   expr.summary.csv  -> one row per input protein, with overall median & mean of tissue medians
#   expr.long.csv     -> per-tissue median TPM (long format) for each gene
#
# Input mapping.csv columns:
#   input_protein, symbol, uniprot_acc, ensembl_gene

import argparse
import time
import requests
import pandas as pd
from typing import Optional, Tuple, List, Dict, Any

GTEX_EXPR = "https://gtexportal.org/api/v2/gene/medianExpression"
HEADERS_JSON = {"Accept": "application/json"}

def request_with_retry(url: str, params: dict, headers: dict = None, max_try: int = 3, timeout: int = 60):
    """Simple retry wrapper for HTTP GET."""
    for i in range(max_try):
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code == 200:
            return r
        time.sleep(0.7 * (i + 1))
    return r

def strip_version(ensg: Optional[str]) -> Optional[str]:
    """Drop version suffix from Ensembl gene ID if present (e.g., ENSG... .12)."""
    if not ensg or not isinstance(ensg, str):
        return ensg
    return ensg.split(".")[0]

def gtex_expression_summary(ensembl_gene: str) -> Tuple[Optional[float], Optional[float], List[Dict[str, Any]]]:
    """
    Fetch GTEx v8 per-tissue median TPM for a gene, then compute:
      - overall_median_of_tissue_medians
      - overall_mean_of_tissue_medians
    Returns:
      (overall_median, overall_mean, details_list)
    details_list: [{"tissue": str, "median_tpm": float, "unit": str}, ...]
    """
    if not ensembl_gene:
        return None, None, []
    # Try with full ID first; fallback to non-version ID if necessary.
    ensg_full = ensembl_gene
    params = {"gencodeId": ensg_full, "format": "json"}
    r = request_with_retry(GTEX_EXPR, params=params, headers=HEADERS_JSON)
    rows = []
    if r.status_code == 200:
        rows = (r.json() or {}).get("data") or []
    if not rows:
        ensg_nover = strip_version(ensembl_gene)
        if ensg_nover and ensg_nover != ensg_full:
            params = {"gencodeId": ensg_nover, "format": "json"}
            r2 = request_with_retry(GTEX_EXPR, params=params, headers=HEADERS_JSON)
            if r2.status_code == 200:
                rows = (r2.json() or {}).get("data") or []

    if not rows:
        return None, None, []

    details = []
    for row in rows:
        details.append({
            "tissue": row.get("tissueSiteDetail"),
            "median_tpm": row.get("median"),
            "unit": row.get("unit")
        })

    medians = [d["median_tpm"] for d in details if isinstance(d.get("median_tpm"), (int, float))]
    if not medians:
        return None, None, details

    s = pd.Series(medians)
    overall_median = float(s.median())
    overall_mean = float(s.mean())
    return overall_median, overall_mean, details

def main():
    ap = argparse.ArgumentParser(description="Fetch GTEx expression (per-tissue median TPM) and compute overall stats.")
    ap.add_argument("--map", dest="mapping", required=True, help="Mapping CSV from step 1.")
    ap.add_argument("--out-prefix", dest="out_prefix", default="expr", help="Output file prefix.")
    args = ap.parse_args()

    m = pd.read_csv(args.mapping)
    req_cols = {"input_protein", "symbol", "ensembl_gene"}
    if not req_cols.issubset(set(m.columns)):
        raise ValueError("Mapping CSV must contain 'input_protein', 'symbol', 'ensembl_gene'.")

    # Deduplicate API calls by ensembl_gene
    unique_ensg = sorted(set([str(x) for x in m["ensembl_gene"].dropna().astype(str)]))
    ensg_to_summary = {}
    ensg_to_long = {}

    for ensg in unique_ensg:
        med, mean, details = gtex_expression_summary(ensg)
        ensg_to_summary[ensg] = (med, mean)
        ensg_to_long[ensg] = details
        time.sleep(0.12)

    # Build summary table (one row per input_protein)
    sum_rows = []
    for _, row in m.iterrows():
        ip = row.get("input_protein")
        sym = row.get("symbol")
        ensg = row.get("ensembl_gene")
        med, mean = (None, None)
        if pd.notna(ensg):
            med, mean = ensg_to_summary.get(str(ensg), (None, None))
        sum_rows.append({
            "input_protein": ip,
            "symbol": sym,
            "ensembl_gene": ensg,
            "gtex_overall_median_tpm": med,
            "gtex_overall_mean_tpm": mean
        })
    summary_df = pd.DataFrame(sum_rows)

    # Build long table (per tissue)
    long_rows = []
    for _, row in m.iterrows():
        ip = row.get("input_protein")
        sym = row.get("symbol")
        ensg = row.get("ensembl_gene")
        details = []
        if pd.notna(ensg):
            details = ensg_to_long.get(str(ensg), [])
        for d in details:
            long_rows.append({
                "input_protein": ip,
                "symbol": sym,
                "ensembl_gene": ensg,
                "tissue": d.get("tissue"),
                "median_tpm": d.get("median_tpm"),
                "unit": d.get("unit")
            })
    long_df = pd.DataFrame(long_rows)

    summary_path = f"{args.out_prefix}.summary.csv"
    long_path = f"{args.out_prefix}.long.csv"
    summary_df.to_csv(summary_path, index=False)
    long_df.to_csv(long_path, index=False)
    print(f"Done.\nSummary: {summary_path}\nLong: {long_path}")

if __name__ == "__main__":
    main()
