# Annotate features for proteins.

## Step1 Extract all proteins.

### Step1.1 Extract all protein molecules from a filtered LPPI.

In [1]:
import pandas as pd

inter_file = 'lppi_with_valid_lnc.csv'
inter = pd.read_csv(inter_file)

# Stack both endpoint columns into one Series, then keep each identifier once.
molecule = pd.concat([inter['Node_i'], inter['Node_j']], ignore_index=True)
molecule_df = molecule.to_frame(name='molecule').drop_duplicates()

# Full protein table; only rows whose protein_id occurs in the interaction
# file are retained.
protein_file = '../../data/LPPI/human/protein_updated.csv'
all_proteins = pd.read_csv(protein_file)
in_network = all_proteins['protein_id'].isin(molecule_df['molecule'])
proteins = all_proteins[in_network]

# Write the retained proteins to CSV (protein_file now names the output).
protein_file = 'proteins.csv'
proteins.to_csv(protein_file, index=False)


### Step1.2 Get Ensembl IDs for proteins from BED files.

In [2]:
import pandas as pd
import os
import re

# Folder holding the Ensembl BED files that will be searched.
ensembl_dir = "../../reference_lncRNA/human/bed/ensembl/"

# One merge result per BED file is collected here and concatenated afterwards.
results = list()

# Protein table (protein_id and protein columns are used below).
proteins = pd.read_csv('./proteins.csv')

# Working copy of the (protein_id, protein) pairs that have not been matched yet.
remained_protein = proteins.loc[:, ['protein_id', 'protein']].copy()

# Look up each protein's Ensembl ID by matching its gene name against the BED files
# **STEP 1: Extract the version number from BED file**
def extract_version(filename):
    """Return the release number embedded in an Ensembl BED file name.

    The name is searched for the pattern ``GRCh38.<digits>.bed``. The digits
    are returned as an ``int``; ``-1`` is returned when the pattern is absent,
    which places such files last in a descending sort.
    """
    found = re.search(r'GRCh38\.(\d+)\.bed', filename)
    if found is None:
        return -1
    return int(found.group(1))

# **STEP 2: Get all BED files and sort them by version number**
bed_files = [f for f in os.listdir(ensembl_dir) if f.endswith(".bed")]
# Newest release first, so each protein is matched against the most recent
# BED file that contains its gene name. Files without a recognisable version
# sort last (extract_version returns -1 for them).
bed_files_sorted = sorted(bed_files, key=extract_version, reverse=True)

# **STEP 3: Iterate over sorted BED files**
for bed_file in bed_files_sorted:
    # Nothing left to look up: remaining (older) files cannot add anything.
    if remained_protein.empty:
        break

    bed_path = os.path.join(ensembl_dir, bed_file)

    # Read ensembl BED file (no header row; six tab-separated columns are named here)
    ensembl_bed = pd.read_csv(bed_path, sep='\t', header=None,
                              names=['chr', 'start', 'end', 'gene_name', 'ensembl_id', 'strand'])

    # Match by gene name: the 'protein' column is joined to the BED 'gene_name' column
    ensembl_id_map = pd.merge(remained_protein,
                              ensembl_bed[['gene_name', 'ensembl_id']],
                              left_on='protein',
                              right_on='gene_name', how='inner')
    if not ensembl_id_map.empty:
        results.append(ensembl_id_map)

    # Proteins matched in this file are not searched for in older files.
    remained_protein = remained_protein[~remained_protein['protein_id'].isin(ensembl_id_map['protein_id'])]

# Combine all results. pd.concat raises ValueError on an empty list, so fall
# back to an empty frame when no BED file produced a match.
if results:
    # A gene name may match several BED rows; only the first merged row per
    # protein_id is kept.
    pro_ens = pd.concat(results, ignore_index=True).drop_duplicates(subset=['protein_id'])
else:
    pro_ens = pd.DataFrame(columns=['protein_id', 'protein', 'gene_name', 'ensembl_id'])

# Save remaining proteins for which no Ensembl ID was found
remained_protein.drop_duplicates().to_csv('pro_no_id.csv', index=False)

# Generate CSV files
pro_ens[['protein', 'protein_id', 'ensembl_id']].to_csv('pro_ens_map.csv', index=False)
print("Processing complete. Results saved.")


Processing complete. Results saved.


## Step2 Annotate features

### Step2.1 Annotate the number of GO terms.

In [4]:
import pandas as pd

import pandas as pd

def count_gene_go_terms(go_file):
    """Count the distinct GO term accessions recorded for every gene name.

    Parameters
    ----------
    go_file : path or file-like
        CSV with at least the columns ``"Gene name"`` and
        ``"GO term accession"``; all values are read as strings.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``gene_name`` and ``GO_Term_Count``, one row per gene
        name. Missing accessions are not counted.
    """
    annotations = pd.read_csv(go_file, dtype=str)

    # Reduce to unique (gene, accession) pairs before counting.
    pairs = annotations.loc[:, ["Gene name", "GO term accession"]].drop_duplicates()

    per_gene = pairs.groupby("Gene name")["GO term accession"].nunique()

    # Name the value column and the key column, then flatten to a frame.
    return (
        per_gene
        .rename("GO_Term_Count")
        .rename_axis("gene_name")
        .reset_index()
    )

go_file = "../../features/protein/human/go.csv"
gene_go_counts = count_gene_go_terms(go_file)

protein = pd.read_csv("proteins.csv", dtype=str)

# Attach the per-gene GO count to each protein through its gene name and keep
# only the protein identifier with its count. Proteins without a matching
# gene name are dropped by the inner join.
filtered_gene_go_term_counts = (
    protein
    .merge(gene_go_counts, left_on='protein', right_on='gene_name', how='inner')
    .loc[:, ['protein_id', 'GO_Term_Count']]
)

# Save the result to a CSV file
filtered_gene_go_term_counts.to_csv("protein_go_term_counts.csv", sep=",", index=False)


### Step2.2 Annotate the number of orthologs.

Step2.2.1 Get ortholog count using Ensembl REST API

In [None]:
import requests
import pandas as pd
import time
import os
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration parameters
# CSV read by load_symbols(); its 'protein' column supplies the gene symbols.
INPUT_FILE = "proteins.csv"
# CSV that save_result() appends to and load_existing_results() reads back.
OUTPUT_FILE = "all_species_ortholog_counts.csv"
# max_workers of the ThreadPoolExecutor used in main().
THREADS = 5
# Number of request attempts per symbol in count_orthologs_by_symbol().
MAX_RETRIES = 3
# Seconds slept after an unsuccessful attempt and after each handled result.
SLEEP_BETWEEN = 0.4
# Headers sent with every requests.get() call.
HEADERS = {"Content-Type": "application/json"}

def count_orthologs_by_symbol(symbol):
    """Count the one-to-one orthologues Ensembl reports for a human gene symbol.

    Parameters
    ----------
    symbol : str
        Gene symbol placed in the ``/homology/symbol/homo_sapiens/`` URL.

    Returns
    -------
    tuple or None
        ``(symbol, count)`` where ``count`` is the number of returned
        homologies whose ``type`` equals ``'ortholog_one2one'``. ``None`` when
        the server answers 404, when the response carries no ``data`` entry,
        or when all ``MAX_RETRIES`` attempts fail.
    """
    from urllib.parse import quote

    # Percent-encode the symbol so characters such as '/', '?', '#' or spaces
    # cannot change the request path or query string.
    encoded = quote(str(symbol), safe='')
    url = f"https://rest.ensembl.org/homology/symbol/homo_sapiens/{encoded}?type=orthologues"

    for attempt in range(MAX_RETRIES):
        delay = SLEEP_BETWEEN
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            if r.status_code == 200:
                entries = r.json().get('data') or []
                if not entries:
                    # Valid response without any entry: retrying cannot help.
                    return None
                homologies = entries[0].get('homologies', [])
                count = sum(1 for h in homologies if h.get('type') == 'ortholog_one2one')
                return (symbol, count)
            elif r.status_code == 404:
                return None
            elif r.status_code == 429:
                # Rate limited: wait at least as long as the server requests.
                print(f"[{symbol}] HTTP {r.status_code}")
                try:
                    delay = max(delay, float(r.headers.get("Retry-After", delay)))
                except (TypeError, ValueError):
                    pass
            else:
                print(f"[{symbol}] HTTP {r.status_code}")
        except Exception as e:
            print(f"[{symbol}] error: {e}")
        # No point in sleeping once the last attempt has been used up.
        if attempt < MAX_RETRIES - 1:
            time.sleep(delay)
    return None

def load_symbols(input_file=None):
    """Read the gene symbols to query from the ``protein`` column of a CSV.

    Parameters
    ----------
    input_file : str or path-like, optional
        CSV to read. Defaults to the module-level ``INPUT_FILE``.

    Returns
    -------
    list of str
        Non-empty ``protein`` values in first-seen order, each listed once.
        Repeated symbols are collapsed so that a gene is not queried (and
        written to the output) more than once.
    """
    path = INPUT_FILE if input_file is None else input_file
    with open(path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        symbols = (row['protein'] for row in reader if row.get('protein'))
        # dict.fromkeys removes duplicates while preserving order; the
        # generator is consumed here, while the file is still open.
        return list(dict.fromkeys(symbols))

def load_existing_results(output_file=None):
    """Return the set of gene names already present in the results CSV.

    Parameters
    ----------
    output_file : str or path-like, optional
        Results CSV to inspect. Defaults to the module-level ``OUTPUT_FILE``.

    Returns
    -------
    set
        Values of the ``gene_name`` column; an empty set when the file does
        not exist or is zero bytes long.
    """
    path = OUTPUT_FILE if output_file is None else output_file
    # A zero-byte file makes pd.read_csv raise EmptyDataError, so treat it the
    # same as a missing file.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return set()
    df = pd.read_csv(path)
    return set(df['gene_name'].values)

def save_result(symbol, count):
    """Append one ``symbol,count`` line to ``OUTPUT_FILE``."""
    with open(OUTPUT_FILE, "a") as out:
        # print() terminates the record with a single newline.
        print(f"{symbol},{count}", file=out)

def main():
    """Query ortholog counts for all pending gene symbols and append them to OUTPUT_FILE.

    Symbols already present in OUTPUT_FILE are skipped, so the function can be
    re-run to resume an interrupted job. Symbols for which
    count_orthologs_by_symbol() returns None are not written.
    """
    # Make sure the file has its header before it is read back; an empty file
    # would otherwise make the CSV reader in load_existing_results() fail.
    if not os.path.exists(OUTPUT_FILE) or os.stat(OUTPUT_FILE).st_size == 0:
        with open(OUTPUT_FILE, "w") as f:
            f.write("gene_name,ortholog_count\n")

    all_symbols = load_symbols()
    done_symbols = load_existing_results()
    # dict.fromkeys drops repeated symbols (keeping order) so that no gene is
    # requested and written twice.
    symbols_to_query = list(dict.fromkeys(s for s in all_symbols if s not in done_symbols))

    print(f"\n Total genes: {len(all_symbols)}")
    print(f" Already processed: {len(done_symbols)}")
    print(f" Pending: {len(symbols_to_query)}")
    print(f" Starting concurrent queries with {THREADS} threads\n")

    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        future_to_symbol = {
            executor.submit(count_orthologs_by_symbol, symbol): symbol
            for symbol in symbols_to_query
        }

        # Results are written from this single loop, so appends to the output
        # file never interleave.
        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                result = future.result()
                if result:
                    save_result(*result)
                    print(f" {result[0]} → {result[1]} orthologs")
            except Exception as exc:
                print(f" {symbol} exception: {exc}")
            # NOTE(review): all tasks are already submitted at this point, so
            # this pause appears to pace result handling rather than the
            # request rate - confirm the intent.
            time.sleep(SLEEP_BETWEEN)

    print(f"\n Query complete! Results saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


Step2.2.2 Annotate ortholog count

In [1]:
import pandas as pd

orthologs_counts = pd.read_csv('all_species_ortholog_counts.csv')
protein = pd.read_csv('proteins.csv')

# The counts file is written in append mode, so a gene name can occur more
# than once. Keep the most recently written row per gene; otherwise the merge
# below would emit several rows for the same protein.
orthologs_counts = (
    orthologs_counts[['gene_name', 'ortholog_count']]
    .drop_duplicates(subset='gene_name', keep='last')
)

# Attach the count to each protein through its gene name; proteins without a
# count are dropped by the inner join.
protein_orthologs_counts = pd.merge(protein, orthologs_counts, left_on='protein', right_on='gene_name', how='inner')
protein_orthologs_counts = protein_orthologs_counts[['protein_id', 'ortholog_count']].drop_duplicates()
protein_orthologs_counts.to_csv("protein_orthologs_counts.csv", index=False)

### Step2.3 Annotate the expression feature.

In [2]:
import pandas as pd

protein = pd.read_csv("pro_ens_map.csv", dtype=str)

for t in ['heart', 'lung', 'stomach']:
    raw_exp = pd.read_csv(f"../../features/protein/human/exp_{t}.tsv", sep="\t", dtype=str)

    # Only these three columns are carried forward.
    exp = raw_exp.loc[:, ['gene_id', 'TPM', 'pme_TPM']]

    # 1) keep rows whose gene_id starts with ENSG (missing ids are dropped)
    exp = exp.loc[exp['gene_id'].str.startswith('ENSG', na=False)]

    # 2) flag _PAR_Y entries and 3) derive the id without the _PAR_Y suffix,
    #    both computed from the original gene_id
    exp = exp.assign(
        is_par_y=exp["gene_id"].str.endswith("_PAR_Y"),
        gene_id_base=exp["gene_id"].str.replace(r"_PAR_Y$", "", regex=True),
    )

    # 4) one row per base id, preferring the non-PAR_Y entry
    #    (False sorts before True, and the first row is kept)
    exp = (
        exp
        .sort_values(["gene_id_base", "is_par_y"], ascending=[True, True])
        .drop_duplicates(subset=["gene_id_base"], keep="first")
    )

    # 5) strip the trailing ".<digits>" version from the base id
    exp = exp.assign(
        gene_id_base=exp["gene_id_base"].str.replace(r"\.\d+$", "", regex=True)
    )

    # 6) join expression values onto the protein -> Ensembl ID mapping
    protein_exp = protein.merge(
        exp[["gene_id_base", "pme_TPM"]],
        left_on="ensembl_id",
        right_on="gene_id_base",
        how="inner",
    )

    protein_exp = protein_exp[['protein_id', 'pme_TPM']].drop_duplicates()
    protein_exp.to_csv(f"protein_exp_{t}.csv", index=False)

    print(f"[{t}] saved protein_exp_{t}.csv, n={protein_exp.shape[0]}")


[heart] saved protein_exp_heart.csv, n=18264
[lung] saved protein_exp_lung.csv, n=18264
[stomach] saved protein_exp_stomach.csv, n=18264


## Step3 Merge annotation.

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


go_anno = pd.read_csv("protein_go_term_counts.csv")
homo_anno = pd.read_csv("protein_orthologs_counts.csv")

for t in ['heart', 'lung', 'stomach']:
    exp_anno = pd.read_csv(f"protein_exp_{t}.csv")

    # Keep only proteins that have all three annotations.
    protein_annotation = (
        exp_anno
        .merge(go_anno, on='protein_id', how="inner")
        .merge(homo_anno, on="protein_id", how='inner')
    )

    # First column is the identifier; every other column is a feature.
    ID_column = protein_annotation.iloc[:, [0]]
    feature_columns = protein_annotation.iloc[:, 1:]

    # Standardise each feature column with a scaler fitted on this tissue's table.
    scaler = StandardScaler()
    normalized_data = pd.DataFrame(
        scaler.fit_transform(feature_columns),
        columns=feature_columns.columns,
    )

    df_normalized = pd.concat([ID_column, normalized_data], axis=1)
    df_normalized.to_csv(f"protein_annotation_{t}.csv", index=False)

## Step4 Keep interactions whose proteins have valid annotations.

In [5]:
import pandas as pd

# Interaction table with the Node_i / Node_j endpoint columns.
inter = pd.read_csv("lppi_with_valid_lnc.csv")

# Proteins listed in the heart annotation file serve as the set of valid proteins.
valid_protein = pd.read_csv("protein_annotation_heart.csv")
valid_protein_set = set(valid_protein['protein_id'].tolist())

def check_valid_protein(node):
    """Tell whether an interaction endpoint passes the protein filter.

    Identifiers that do not start with 'p' are always accepted. Identifiers
    that start with 'p' are accepted only when they are members of
    ``valid_protein_set``.
    """
    starts_with_p = node.startswith('p')
    return (not starts_with_p) or node in valid_protein_set

# Keep a row only when both of its endpoints pass check_valid_protein.
node_i_ok = inter['Node_i'].apply(check_valid_protein)
node_j_ok = inter['Node_j'].apply(check_valid_protein)
inter = inter.loc[node_i_ok & node_j_ok]

tissues = ['heart', 'lung', 'stomach']
for tissue in tissues:
    lnc = pd.read_csv(f"{tissue}_annotation.csv")
    # NOTE(review): only Node_i is consulted here - presumably lncRNA ids never
    # appear in Node_j; confirm against the interaction file.
    in_network = lnc['lncRNA_id'].isin(inter['Node_i'])
    lnc = lnc[in_network]

    lnc.to_csv(f"valid_{tissue}_annotation.csv", index=False)

# Rename the two endpoint columns and add a constant edge weight of 1
# before writing the edge list.
inter = inter.set_axis(['source', 'target'], axis=1).assign(weight=1)
inter.to_csv("unweighted_inter.csv", index=False)
