# Annotate features for proteins.

## Step1 Extract all protein. 

### Step1.1 Extract all protein molecules from a filtered LPPI.

In [28]:
import pandas as pd

# Example usage
inter_file = 'inter_with_valid_lnc.csv'
inter = pd.read_csv(inter_file)

# Concatenate two columns into a new Series and remove duplicates
molecule = pd.concat([inter['Node_i'], inter['Node_j']]).reset_index(drop=True)
molecule_df = pd.DataFrame(molecule, columns=['molecule'])
molecule_df = molecule_df.drop_duplicates()

protein_file = '../../data/LPPI/mouse/protein_updated.csv'
proteins = pd.read_csv(protein_file)

proteins = proteins[proteins['protein_ID'].isin(molecule_df['molecule'])]

# Export to CSV file
protein_file = 'proteins.csv'
proteins.to_csv(protein_file, index=False)


### Step1.2 Annotate MGI ID.

In [29]:
import pandas as pd

valid_protein = pd.read_csv("proteins.csv", dtype=str)
ID_map = pd.read_csv("../../protein/mouse/MGI_ID_mapping.csv")

protein_with_ID = pd.merge(valid_protein, ID_map, left_on='protein', right_on='Marker Symbol',how="inner")

protein_with_ID = protein_with_ID[['protein','protein_ID','MGI ID']]
protein_with_ID.to_csv('protein_MGI_ID.csv', index=False)

invalid_protein = valid_protein[~valid_protein['protein_ID'].isin(protein_with_ID['protein_ID'])]

invalid_protein.to_csv("no_MGI_ID_protein.csv", index=False)


## Step2 Annotate features

### Step2.1 Annotate the number of go terms.

In [30]:
import pandas as pd

def count_gene_go_terms(gaf_file):
    # 读取 GAF 文件（以制表符分隔）
    columns = [
        "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO_ID", "DB_Reference", 
        "Evidence_Code", "With_or_From", "Aspect", "DB_Object_Name", "DB_Object_Synonym",
        "DB_Object_Type", "Taxon", "Date", "Assigned_By", "Annotation_Extension","Gene_Product_Form_ID"
    ]
    
    # 只读取前 15 列（防止 GAF 版本不同导致列数问题）
    df = pd.read_csv(gaf_file, sep="\t", comment="!", header=None, names=columns, usecols=range(15), dtype=str)
    
    # 过滤掉非小鼠数据（taxon:10090）
    df = df[df["Taxon"] == "taxon:10090"]

    # 只保留基因 ID、基因名称和 GO term
    df_filtered = df[["DB_Object_Symbol", "GO_ID"]].drop_duplicates()
    
    # 统计每个基因的 GO term 数量
    gene_go_counts = df_filtered.groupby(["DB_Object_Symbol"])["GO_ID"].nunique().reset_index()
    
    # 重命名列
    gene_go_counts.columns = ["Gene_Symbol", "GO_Term_Count"]
    
    return gene_go_counts

gaf_file = "../../protein/mouse/mgi.gaf"  
gene_go_counts = count_gene_go_terms(gaf_file)

protein = pd.read_csv("proteins.csv", dtype=str)

filtered_gene_go_term_counts = pd.merge(protein, gene_go_counts, left_on='protein', right_on='Gene_Symbol', how='inner')
filtered_gene_go_term_counts = filtered_gene_go_term_counts[['protein_ID','GO_Term_Count']]
# 将结果保存到文件
filtered_gene_go_term_counts.to_csv("protein_go_term_counts.csv", sep=",", index=False)



### Step2.2 Annotate the number of orthologs.

Step2.2.1 Extract gene name.

In [32]:
import pandas as pd

data = pd.read_csv("proteins.csv")

gene = data[['protein']]

gene.to_csv("protein_name.txt", sep="\t",index=False, header=None)

Step2.2.2 Get ortholog count using Ensembl REST API

In [33]:
import requests
import pandas as pd
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# 配置参数
INPUT_FILE = "protein_name.txt"
OUTPUT_FILE = "all_species_ortholog_counts.csv"
THREADS = 5
MAX_RETRIES = 3
SLEEP_BETWEEN = 0.4
HEADERS = {"Content-Type": "application/json"}

def count_orthologs_by_symbol(symbol):
    url = f"https://rest.ensembl.org/homology/symbol/mus_musculus/{symbol}?type=orthologues"
    for attempt in range(MAX_RETRIES):
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            if r.status_code == 200:
                data = r.json()
                homologies = data['data'][0].get('homologies', [])
                count = sum(1 for h in homologies if h['type'] == 'ortholog_one2one')
                return (symbol, count, "OK")
            elif r.status_code == 404:
                return (symbol, 0, "NotFound")
            else:
                print(f"[{symbol}] HTTP {r.status_code}")
        except Exception as e:
            print(f"[{symbol}] error: {e}")
        time.sleep(SLEEP_BETWEEN)
    return (symbol, 0, "Error")

def load_symbols():
    with open(INPUT_FILE) as f:
        return [line.strip() for line in f if line.strip()]

def load_existing_results():
    if not os.path.exists(OUTPUT_FILE):
        return set()
    df = pd.read_csv(OUTPUT_FILE)
    return set(df['GeneSymbol'].values)

def save_result(symbol, count, status):
    with open(OUTPUT_FILE, "a") as f:
        f.write(f"{symbol},{count},{status}\n")

def main():
    all_symbols = load_symbols()
    done_symbols = load_existing_results()
    symbols_to_query = [s for s in all_symbols if s not in done_symbols]

    print(f"\n🧬 总基因数：{len(all_symbols)}")
    print(f"✅ 已处理：{len(done_symbols)}")
    print(f"🔍 待处理：{len(symbols_to_query)}")
    print(f"🚀 启动并发查询，线程数：{THREADS}\n")

    with open(OUTPUT_FILE, "a") as f:
        if os.stat(OUTPUT_FILE).st_size == 0:
            f.write("GeneSymbol,OrthologCount,Status\n")

    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        future_to_symbol = {
            executor.submit(count_orthologs_by_symbol, symbol): symbol
            for symbol in symbols_to_query
        }

        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                result = future.result()
                save_result(*result)
                print(f"✅ {result[0]} → {result[1]} orthologs")
            except Exception as exc:
                print(f"❌ {symbol} exception: {exc}")
            time.sleep(SLEEP_BETWEEN)

    print(f"\n🎉 查询完成！结果保存在：{OUTPUT_FILE}")

if __name__ == "__main__":
    main()



🧬 总基因数：12023
✅ 已处理：11289
🔍 待处理：734
🚀 启动并发查询，线程数：5

✅ Erbin → 184 orthologs
✅ Bmal1 → 179 orthologs
✅ Eloc → 149 orthologs
✅ H2az1 → 149 orthologs
✅ Patj → 166 orthologs
[H2af] HTTP 400
✅ H4c16 → 12 orthologs
✅ Zfp24 → 107 orthologs
[H2af] HTTP 400
✅ Eloa → 183 orthologs
[H2af] HTTP 400
✅ H2af → 0 orthologs
✅ Skp1 → 149 orthologs
✅ Pea15b-ps → 0 orthologs
✅ Nherf4 → 143 orthologs
✅ Nherf1 → 139 orthologs
✅ Nherf2 → 138 orthologs
✅ Kmt5c → 165 orthologs
✅ H1f7 → 64 orthologs
✅ Sem1 → 144 orthologs
✅ Kmt5b → 188 orthologs
✅ Macroh2a3 → 0 orthologs
✅ Mesd → 194 orthologs
✅ Grk2 → 131 orthologs
✅ Diaph2 → 181 orthologs
✅ Plppr4 → 146 orthologs
✅ H1f4 → 86 orthologs
✅ Prag1 → 196 orthologs
✅ H2bc21 → 86 orthologs
✅ H4c14 → 136 orthologs
✅ Adgrv1 → 159 orthologs
✅ Nsd2 → 176 orthologs
✅ Ift25 → 124 orthologs
✅ Epb41l1 → 184 orthologs
✅ Ufd1 → 181 orthologs
✅ Vpreb1a → 3 orthologs
✅ Tamalin → 181 orthologs
✅ Septin4 → 176 orthologs
✅ Abraxas2 → 194 orthologs
✅ Kat14 → 196 orthologs
✅ Myorg → 

Step2.2.3 Annotate ortholog count

In [34]:
import pandas as pd

orthologs_counts = pd.read_csv('all_species_ortholog_counts.csv')
protein = pd.read_csv('proteins.csv')

orthologs_counts = orthologs_counts[orthologs_counts['Status']=='OK']
orthologs_counts = orthologs_counts[['GeneSymbol','OrthologCount']]
protein_orthologs_counts = pd.merge(protein,orthologs_counts,left_on='protein',right_on='GeneSymbol',how='inner')
protein_orthologs_counts = protein_orthologs_counts[['protein_ID','OrthologCount']]
protein_orthologs_counts.to_csv("protein_orthologs_counts.csv",index=False)

### Step2.3 Annotate the expression feature.

Step2.3.1 Calculate avg_TPM of gene at 22 tissues.

In [22]:
import pandas as pd

exp_file = pd.read_csv("../../protein/mouse/filtered_exp.csv")

# 确保列名正确（避免大小写或空格问题）
exp_file.columns = exp_file.columns.str.strip()

# 先按 `Gene Symbol` 和 `Anatomical Structure` 计算 `avg_TPM` 均值
grouped_avg = exp_file.groupby(['Gene Symbol', 'Anatomical Structure'])['avg_TPM'].mean().reset_index()
grouped_avg.rename(columns={'avg_TPM': 'Mean_avg_TPM'}, inplace=True)

# 再按 `Gene Symbol` 计算这些均值的 中位数 和 均值
final_stats = grouped_avg.groupby('Gene Symbol')['Mean_avg_TPM'].agg(['median', 'mean']).reset_index()
final_stats.rename(columns={'median': 'Median_avg_TPM', 'mean': 'Mean_avg_TPM'}, inplace=True)

final_stats.to_csv("exp.csv",index=False)

  exp_file = pd.read_csv("../../protein/mouse/filtered_exp.csv")


step2.3.2 Annotate the mean&media of expression at 22 tissues.

In [35]:

import pandas as pd
exp_file = pd.read_csv("exp.csv")
protein = pd.read_csv("proteins.csv", dtype=str)
protein_exp = pd.merge(protein, exp_file, left_on='protein', right_on='Gene Symbol', how='inner')

# 将结果保存到文件
protein_exp = protein_exp[['protein_ID','Median_avg_TPM','Mean_avg_TPM']]
protein_exp.to_csv("protein_exp.csv", sep=",", index=False)


## Step3 Merge annotation.

In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

exp_anno = pd.read_csv("protein_exp.csv")
go_anno = pd.read_csv("protein_go_term_counts.csv")
homo_anno = pd.read_csv("protein_orthologs_counts.csv")

protein_annotation = pd.merge(exp_anno, go_anno, on='protein_ID', how="inner")
protein_annotation = protein_annotation.merge(homo_anno, on="protein_ID", how='inner')

first_column = protein_annotation.iloc[:, [0]]
other_columns = protein_annotation.iloc[:, 1:]

scaler = StandardScaler()
normalized_data = scaler.fit_transform(other_columns)

df_normalized = pd.concat([first_column, pd.DataFrame(normalized_data, columns=other_columns.columns)], axis=1)

df_normalized.to_csv("transformed_protein_annotation.csv",index=False)

### Step3.2 Get inter with valid protein.

In [3]:
import pandas as pd

inter = pd.read_csv("inter_with_valid_lnc.csv")
valid_protein = pd.read_csv("transformed_protein_annotation.csv")

valid_protein_set = set(valid_protein['protein_ID'])

# 定义函数检查以'p'开头的节点是否在valid_protein中
def check_valid_protein(node):
    if node.startswith('p'):
        return node in valid_protein_set
    return True  # 若不以'p'开头，保留该节点

# 逐行检查Node_i和Node_j是否有效
inter = inter[inter['Node_i'].apply(check_valid_protein) & inter['Node_j'].apply(check_valid_protein)]

inter.to_csv("valid_inter.csv", index=False)

#cell_lines = ['CH12.LX','ES-E14','MEL', 'ac']
tissues = ['brain']
for tissue in tissues:
	lnc = pd.read_csv(f"{tissue}_annotation.csv")
	lnc = lnc[lnc['lncRNA_ID'].isin(inter['Node_i'])]

	lnc.to_csv(f"valid_{tissue}_annotation.csv", index=False)

## Step4 Calculate edge wight.

In [3]:
import pandas as pd

# Read the input CSV file
inter = pd.read_csv("valid_inter.csv")

# Ensure (Node_i, Node_j) and (Node_j, Node_i) are considered the same edge
# Sort each pair so that the smaller node always comes first
inter[['Node_i', 'Node_j']] = inter[['Node_i', 'Node_j']].apply(lambda x: tuple(sorted(x)), axis=1, result_type='expand')

# Compute the weight (number of occurrences of each edge)
inter['weight'] = inter.groupby(['Node_i', 'Node_j']).transform('size')

# Remove duplicate edges, keeping only one occurrence per (Node_i, Node_j) pair
inter = inter.drop_duplicates(subset=['Node_i', 'Node_j'])

inter.columns = ['source', 'target', 'weight']

# Save the weighted edge list to a new CSV file
inter.to_csv('weighted_valid_inter.csv', index=False)

print("Weighted interaction file has been successfully saved!")


Weighted interaction file has been successfully saved!


In [39]:
import pandas as pd 

lppi = pd.read_csv("valid_inter.csv")

# 根据第一列前缀是否为 'l' 来划分 LPI 和 PPI
LPI = lppi[lppi.iloc[:, 0].str.startswith('l')]
PPI = lppi[~lppi.iloc[:, 0].str.startswith('l')]

# 统计LPI中lncRNA、protein数量和边数量
lncRNA_count_LPI = LPI.iloc[:, 0].nunique()
protein_count_LPI = LPI.iloc[:, 1].nunique()
edges_count_LPI = len(LPI)

# 统计PPI中protein数量和边数量（两列均为protein）
protein_count_PPI = pd.unique(PPI.iloc[:, [0, 1]].values.ravel()).size
edges_count_PPI = len(PPI)

# 输出统计结果
print("LPI统计结果：")
print(f"lncRNA数量：{lncRNA_count_LPI}")
print(f"protein数量：{protein_count_LPI}")
print(f"边的数量：{edges_count_LPI}")

print("\nPPI统计结果：")
print(f"protein数量：{protein_count_PPI}")
print(f"边的数量：{edges_count_PPI}")


LPI统计结果：
lncRNA数量：37658
protein数量：177
边的数量：135107

PPI统计结果：
protein数量：11489
边的数量：76095


In [1]:
import pandas as pd

# 读取 CSV 文件
# 假设 CSV 中前两列为节点（即边）
df = pd.read_csv('weighted_valid_inter.csv')

# 统计每个节点的度
node_degrees = pd.concat([df['source'], df['target']]).value_counts()

# 分别提取 l 开头和 p 开头的节点
l_nodes = node_degrees[node_degrees.index.str.startswith('l')]
p_nodes = node_degrees[node_degrees.index.str.startswith('p')]

# 计算 l 开头节点的度的范围和均值
l_degree_range = (l_nodes.min(), l_nodes.max())
l_degree_mean = l_nodes.mean()

# 计算 p 开头节点的度的范围和均值
p_degree_range = (p_nodes.min(), p_nodes.max())
p_degree_mean = p_nodes.mean()

# 打印结果
print(f"l 开头节点度的范围：{l_degree_range}")
print(f"l 开头节点度的均值：{l_degree_mean}")

print(f"\np 开头节点度的范围：{p_degree_range}")
print(f"p 开头节点度的均值：{p_degree_mean}")


l 开头节点度的范围：(1, 93)
l 开头节点度的均值：3.019411546019438

p 开头节点度的范围：(1, 24508)
p 开头节点度的均值：21.73330434782609
