# Annotate features for proteins.

## Step1 Extract all proteins.

### Step1.1 Extract all protein molecules from a filtered LPPI.

In [9]:
import pandas as pd

# Load the filtered interaction table; both endpoint columns are used below.
inter_file = 'inter_with_valid_lnc.csv'
inter = pd.read_csv(inter_file)

# Stack the two endpoint columns into a single Series, then keep each ID once.
molecule = pd.concat([inter['Node_i'], inter['Node_j']], ignore_index=True)
molecule_df = molecule.drop_duplicates().to_frame(name='molecule')

# Restrict the reference protein table to IDs that occur in the interactions.
proteins = pd.read_csv('../../data/LPPI/mouse/protein_updated.csv')
appears_in_interactions = proteins['protein_ID'].isin(molecule_df['molecule'])
proteins = proteins.loc[appears_in_interactions]

# Write the retained proteins next to the notebook.
protein_file = 'proteins.csv'
proteins.to_csv(protein_file, index=False)


## Step2 Annotate features

### Step2.1 Annotate the number of go terms.

In [11]:
import pandas as pd

def count_gene_go_terms(gaf_file):
    """Count the distinct GO terms annotated to each mouse gene symbol.

    Parameters
    ----------
    gaf_file : str or path-like
        Tab-separated GAF annotation file without a header row. It is read
        with ``comment="!"``, so "!" header lines are ignored.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Gene_Symbol`` and ``GO_Term_Count``, with one row per
        gene symbol found among the ``taxon:10090`` records.
    """
    gaf_field_names = [
        "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO_ID", "DB_Reference",
        "Evidence_Code", "With_or_From", "Aspect", "DB_Object_Name", "DB_Object_Synonym",
        "DB_Object_Type", "Taxon", "Date", "Assigned_By", "Annotation_Extension", "Gene_Product_Form_ID"
    ]

    # Only the first 15 fields are loaded; every value is kept as a string.
    annotations = pd.read_csv(
        gaf_file,
        sep="\t",
        comment="!",
        header=None,
        names=gaf_field_names,
        usecols=range(15),
        dtype=str,
    )

    # Keep mouse records (exact match on taxon:10090) and the two needed fields.
    is_mouse = annotations["Taxon"] == "taxon:10090"
    symbol_go_pairs = annotations.loc[is_mouse, ["DB_Object_Symbol", "GO_ID"]].drop_duplicates()

    # One row per symbol holding the number of distinct GO IDs.
    return (
        symbol_go_pairs.groupby("DB_Object_Symbol")["GO_ID"]
        .nunique()
        .reset_index()
        .rename(columns={"DB_Object_Symbol": "Gene_Symbol", "GO_ID": "GO_Term_Count"})
    )

gaf_file = "../../omics/protein/mouse/mgi.gaf"
gene_go_counts = count_gene_go_terms(gaf_file)

# Every column is read as a string so identifiers are not re-typed.
protein = pd.read_csv("proteins.csv", dtype=str)

# Inner join on gene symbol, then keep only the protein ID and its GO count.
filtered_gene_go_term_counts = protein.merge(
    gene_go_counts,
    how='inner',
    left_on='protein',
    right_on='Gene_Symbol',
).loc[:, ['protein_ID', 'GO_Term_Count']]

# Write the per-protein GO term counts as CSV.
filtered_gene_go_term_counts.to_csv("protein_go_term_counts.csv", index=False)


### Step2.2 Annotate the number of orthologs.

Step2.2.1 Extract gene name.

In [12]:
import pandas as pd

data = pd.read_csv("proteins.csv")

# Single-column frame holding the gene names from the 'protein' column.
gene = data.loc[:, ['protein']]

# One name per line, with no header and no index.
gene.to_csv("protein_name.txt", sep="\t", header=False, index=False)

Step2.2.2 Get ortholog count using Ensembl REST API

In [None]:
import requests
import pandas as pd
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration parameters
# INPUT_FILE: text file read line by line by load_symbols() (one symbol per line).
INPUT_FILE = "protein_name.txt"
# OUTPUT_FILE: CSV that save_result() appends to and load_existing_results() reads back.
OUTPUT_FILE = "all_species_ortholog_counts.csv"
# THREADS: max_workers of the ThreadPoolExecutor created in main().
THREADS = 5
# MAX_RETRIES: number of request attempts made per symbol before giving up.
MAX_RETRIES = 3
# SLEEP_BETWEEN: seconds passed to time.sleep() between attempts and between collected results.
SLEEP_BETWEEN = 0.4
# HEADERS: HTTP headers sent with every GET request.
HEADERS = {"Content-Type": "application/json"}

def count_orthologs_by_symbol(symbol):
    """Count one-to-one orthologs of a mouse gene via the Ensembl REST homology endpoint.

    Parameters
    ----------
    symbol : str
        Gene symbol looked up under species ``mus_musculus``.

    Returns
    -------
    tuple
        ``(symbol, count, status)``. ``status`` is ``"OK"`` for an HTTP 200
        response that could be parsed, ``"NotFound"`` for HTTP 404, and
        ``"Error"`` when all ``MAX_RETRIES`` attempts failed. ``count`` is the
        number of homologies whose type is ``ortholog_one2one``; it is always
        0 for ``"NotFound"`` and ``"Error"``.
    """
    from urllib.parse import quote

    # Percent-encode the symbol so characters such as "/", "?" or "#" cannot
    # change the path or query string of the request URL.
    encoded_symbol = quote(str(symbol), safe="")
    url = f"https://rest.ensembl.org/homology/symbol/mus_musculus/{encoded_symbol}?type=orthologues"
    for attempt in range(MAX_RETRIES):
        delay = SLEEP_BETWEEN
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            if r.status_code == 200:
                data = r.json()
                homologies = data['data'][0].get('homologies', [])
                count = sum(1 for h in homologies if h.get('type') == 'ortholog_one2one')
                return (symbol, count, "OK")
            elif r.status_code == 404:
                return (symbol, 0, "NotFound")
            else:
                print(f"[{symbol}] HTTP {r.status_code}")
                if r.status_code == 429:
                    # Rate limited: wait at least as long as the server asks,
                    # otherwise the remaining attempts are rejected as well.
                    try:
                        delay = max(delay, float(r.headers.get("Retry-After", delay)))
                    except (TypeError, ValueError):
                        pass  # unparsable header: fall back to the default pause
        except Exception as e:
            # Network failures and unexpected payloads are logged and retried.
            print(f"[{symbol}] error: {e}")
        # No pause after the final attempt; nothing is left to wait for.
        if attempt < MAX_RETRIES - 1:
            time.sleep(delay)
    return (symbol, 0, "Error")

def load_symbols():
    """Return the non-blank lines of INPUT_FILE, each stripped of surrounding whitespace."""
    symbols = []
    with open(INPUT_FILE) as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                symbols.append(cleaned)
    return symbols

def load_existing_results():
    """Return the set of values in the GeneSymbol column of OUTPUT_FILE.

    An empty set is returned when OUTPUT_FILE does not exist yet.
    """
    if os.path.exists(OUTPUT_FILE):
        recorded = pd.read_csv(OUTPUT_FILE)
        return set(recorded['GeneSymbol'].values)
    return set()

def save_result(symbol, count, status):
    """Append one ``symbol,count,status`` line to OUTPUT_FILE."""
    with open(OUTPUT_FILE, "a") as out:
        fields = [f"{symbol}", f"{count}", f"{status}"]
        out.write(",".join(fields) + "\n")

def main():
    """Query ortholog counts for every pending symbol and append them to OUTPUT_FILE.

    Symbols already present in OUTPUT_FILE are skipped, so an interrupted run
    can be resumed. Each pending symbol is submitted once to a thread pool of
    ``THREADS`` workers, and results are written as they complete.
    """
    all_symbols = load_symbols()
    done_symbols = load_existing_results()
    # dict.fromkeys de-duplicates while keeping first-seen order, so a symbol
    # listed several times in the input is queried and written only once.
    symbols_to_query = [s for s in dict.fromkeys(all_symbols) if s not in done_symbols]

    print(f"\n Total genes: {len(all_symbols)}")
    print(f" Already processed: {len(done_symbols)}")
    print(f" Pending: {len(symbols_to_query)}")
    print(f" Starting concurrent queries with {THREADS} threads\n")

    # Opening in append mode creates the file if needed; the header is written
    # only when the file is still empty.
    with open(OUTPUT_FILE, "a") as f:
        if os.stat(OUTPUT_FILE).st_size == 0:
            f.write("GeneSymbol,OrthologCount,Status\n")

    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        future_to_symbol = {
            executor.submit(count_orthologs_by_symbol, symbol): symbol
            for symbol in symbols_to_query
        }

        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                result = future.result()
                save_result(*result)
                print(f" {result[0]} → {result[1]} orthologs")
            except Exception as exc:
                print(f" {symbol} exception: {exc}")
            # NOTE: every task is already submitted at this point, so this
            # pause only paces result collection in the main thread.
            time.sleep(SLEEP_BETWEEN)

    print(f"\n Query complete! Results saved to: {OUTPUT_FILE}")

# Start the ortholog queries only when this code runs as the main module.
if __name__ == "__main__":
    main()


Step2.2.3 Annotate ortholog count

In [15]:
import pandas as pd

orthologs_counts = pd.read_csv('all_species_ortholog_counts.csv')
protein = pd.read_csv('proteins.csv')

# Keep only symbols whose query succeeded.
orthologs_counts = orthologs_counts[orthologs_counts['Status']=='OK']
orthologs_counts = orthologs_counts[['GeneSymbol','OrthologCount']]

# The results file is appended to across runs, so a symbol can occur more than
# once. Keep the most recent row per symbol; otherwise the inner merge below
# would emit one output row per duplicate.
orthologs_counts = orthologs_counts.drop_duplicates(subset='GeneSymbol', keep='last')

# Attach the ortholog count to each protein through its gene symbol.
protein_orthologs_counts = pd.merge(protein,orthologs_counts,left_on='protein',right_on='GeneSymbol',how='inner')
protein_orthologs_counts = protein_orthologs_counts[['protein_ID','OrthologCount']]
protein_orthologs_counts.to_csv("protein_orthologs_counts.csv",index=False)

### Step2.3 Annotate the expression feature.

Step2.3.1 Calculate the avg_TPM of each gene across 22 tissues.

In [16]:
import pandas as pd

exp_file = pd.read_csv("../../omics/protein/mouse/filtered_exp.csv")

# Strip leading/trailing whitespace from the column names before using them.
exp_file.columns = [name.strip() for name in exp_file.columns]

# First level: mean avg_TPM for every (gene symbol, anatomical structure) pair.
grouped_avg = (
    exp_file.groupby(['Gene Symbol', 'Anatomical Structure'])['avg_TPM']
    .mean()
    .reset_index(name='Mean_avg_TPM')
)

# Second level: median and mean of those per-structure means for each gene symbol.
final_stats = (
    grouped_avg.groupby('Gene Symbol')['Mean_avg_TPM']
    .agg(Median_avg_TPM='median', Mean_avg_TPM='mean')
    .reset_index()
)

final_stats.to_csv("exp.csv", index=False)


  exp_file = pd.read_csv("../../omics/protein/mouse/filtered_exp.csv")


Step2.3.2 Annotate the mean & median of expression across 22 tissues.

In [17]:

import pandas as pd
exp_file = pd.read_csv("exp.csv")
protein = pd.read_csv("proteins.csv", dtype=str)

# Inner join on gene symbol: proteins without expression statistics are dropped.
protein_exp = protein.merge(
    exp_file,
    how='inner',
    left_on='protein',
    right_on='Gene Symbol',
)

# Keep the protein ID and its two expression summary columns.
protein_exp = protein_exp.loc[:, ['protein_ID', 'Median_avg_TPM', 'Mean_avg_TPM']]
protein_exp.to_csv("protein_exp.csv", index=False)


## Step3 Merge annotation.

In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

exp_anno = pd.read_csv("protein_exp.csv")
go_anno = pd.read_csv("protein_go_term_counts.csv")
homo_anno = pd.read_csv("protein_orthologs_counts.csv")

# Inner joins: only proteins present in all three annotation tables survive.
protein_annotation = (
    exp_anno
    .merge(go_anno, on='protein_ID', how='inner')
    .merge(homo_anno, on='protein_ID', how='inner')
)

# The first column holds the identifier; every remaining column is a feature.
ID_column = protein_annotation.iloc[:, :1]
feature_columns = protein_annotation.iloc[:, 1:]

# Standardise the feature columns with StandardScaler.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(feature_columns)
normalized_features = pd.DataFrame(normalized_data, columns=feature_columns.columns)

# Put the identifier column back in front of the scaled features.
df_normalized = pd.concat([ID_column, normalized_features], axis=1)

df_normalized.to_csv("transformed_protein_annotation.csv", index=False)

### Step3.2 Keep interactions whose proteins have valid annotations.

In [19]:
import pandas as pd

inter = pd.read_csv("inter_with_valid_lnc.csv")
valid_protein = pd.read_csv("transformed_protein_annotation.csv")

# Set membership keeps the per-node lookup below cheap.
valid_protein_set = set(valid_protein['protein_ID'])

def check_valid_protein(node):
    """Return True unless ``node`` starts with 'p' and is missing from ``valid_protein_set``.

    Nodes whose ID does not start with 'p' are always kept.
    """
    if node.startswith('p'):
        return node in valid_protein_set
    return True  # Keep nodes that do not start with 'p'

# Keep an edge only when both of its endpoints pass the check.
inter = inter[inter['Node_i'].apply(check_valid_protein) & inter['Node_j'].apply(check_valid_protein)]

inter.to_csv("valid_inter.csv", index=False)

# Gather node IDs from BOTH endpoint columns: a lncRNA that only ever appears
# as Node_j is still part of the network and must keep its annotation row.
retained_nodes = set(inter['Node_i']) | set(inter['Node_j'])

tissues = ['heart', 'lung', 'brain']
for tissue in tissues:
    lnc = pd.read_csv(f"{tissue}_annotation.csv")
    lnc = lnc[lnc['lncRNA_ID'].isin(retained_nodes)]

    lnc.to_csv(f"valid_{tissue}_annotation.csv", index=False)


## Step4 Calculate edge weight.

In [20]:
import pandas as pd

# Read the input CSV file
inter = pd.read_csv("valid_inter.csv")

# Ensure (Node_i, Node_j) and (Node_j, Node_i) are considered the same edge
# Sort each pair so that the smaller node always comes first
inter[['Node_i', 'Node_j']] = inter[['Node_i', 'Node_j']].apply(lambda x: tuple(sorted(x)), axis=1, result_type='expand')

# Compute the weight (number of occurrences of each edge)
inter['weight'] = inter.groupby(['Node_i', 'Node_j']).transform('size')

# Remove duplicate edges, keeping only one occurrence per (Node_i, Node_j) pair
inter = inter.drop_duplicates(subset=['Node_i', 'Node_j'])

inter.columns = ['source', 'target', 'weight']

# Save the weighted edge list to a new CSV file
inter.to_csv('weighted_valid_inter.csv', index=False)

print("Weighted interaction file has been successfully saved!")


Weighted interaction file has been successfully saved!


In [23]:
import pandas as pd

# Load the filtered edge list.
df = pd.read_csv('valid_inter.csv')

# Classify each endpoint by ID prefix: IDs starting with 'l' are treated as
# lncRNAs, everything else as proteins.
df['Type1'] = df['Node_i'].apply(lambda x: 'lncRNA' if x.startswith('l') else 'Protein')
df['Type2'] = df['Node_j'].apply(lambda x: 'lncRNA' if x.startswith('l') else 'Protein')

# Label every edge. Mixed endpoints give 'LPI' and two proteins give 'PPI'.
# Two lncRNA endpoints get their own 'LLI' label instead of being counted as
# 'PPI', which previously raised a KeyError in the node tally below.
is_lnc_i = df['Type1'] == 'lncRNA'
is_lnc_j = df['Type2'] == 'lncRNA'
df['Edge Type'] = 'PPI'
df.loc[is_lnc_i != is_lnc_j, 'Edge Type'] = 'LPI'
df.loc[is_lnc_i & is_lnc_j, 'Edge Type'] = 'LLI'

# Distinct nodes per edge type and node type. The two standard edge types are
# pre-seeded so they are always reported, and in this order.
edge_node_unique_counts = {
    'LPI': {'lncRNA': set(), 'Protein': set()},
    'PPI': {'Protein': set()}
}

# Stack both endpoint columns into one long table and fill the sets group by
# group; this avoids a Python-level loop over every edge row.
endpoints = pd.concat(
    [
        df[['Edge Type', 'Type1', 'Node_i']].rename(columns={'Type1': 'node_type', 'Node_i': 'node'}),
        df[['Edge Type', 'Type2', 'Node_j']].rename(columns={'Type2': 'node_type', 'Node_j': 'node'}),
    ],
    ignore_index=True,
)
for (edge_type, node_type), nodes in endpoints.groupby(['Edge Type', 'node_type'])['node']:
    edge_node_unique_counts.setdefault(edge_type, {}).setdefault(node_type, set()).update(nodes)

# Number of edges of each type.
edge_counts = df['Edge Type'].value_counts()

# Print the edge counts, then the distinct-node counts per edge type.
print("边的统计：")
print(edge_counts)
print("\n每种边中每个节点的独特数量：")
for edge_type, nodes in edge_node_unique_counts.items():
    print(f"{edge_type}:")
    for node_type, node_set in nodes.items():
        print(f"  {node_type}: {len(node_set)}")


边的统计：
Edge Type
LPI    139935
PPI     76235
Name: count, dtype: int64

每种边中每个节点的独特数量：
LPI:
  lncRNA: 37853
  Protein: 180
PPI:
  Protein: 11498
