# Annotate features for proteins.

## Step1 Annotate pLI and pLoF scores for valid proteins.

### Step1.1 Extract all protein molecules from a filtered LPPI

In [3]:
import pandas as pd

# Gather every node ID that appears on either side of a filtered interaction.
inter_file = 'inter_with_valid_lnc.csv'
inter = pd.read_csv(inter_file)

# Stack both endpoint columns into a single Series, then keep each ID once.
molecule = pd.concat([inter['Node_i'], inter['Node_j']], ignore_index=True)
molecule_df = molecule.to_frame(name='molecule').drop_duplicates()

# Restrict the reference protein table to IDs present in the interaction list.
all_protein_file = '../../data/LPPI/human/ensembl/protein.csv'
proteins = pd.read_csv(all_protein_file)
in_network = proteins['protein_ID'].isin(molecule_df['molecule'])
proteins = proteins[in_network]

# Write the retained proteins to a CSV file.
protein_file = 'proteins.csv'
proteins.to_csv(protein_file, index=False)


### Step1.2 Annotate pLI score for valid proteins.

In [4]:
import pandas as pd

# Load the protein list (every column read as a string) and the tab-separated
# pLoF table.
valid_protein = pd.read_csv("proteins.csv", dtype=str)
pLoF = pd.read_csv("../../protein/human/pLoF.txt", sep='\t')

# Keep only the gene key and the score columns used below.
pLoF = pLoF[['gene','oe_lof_upper', 'obs_lof', 'exp_lof', 'oe_lof', 'lof_z', 'pLI']]

# Match valid_protein['protein'] against pLoF['gene'], then drop both key columns.
# NOTE(review): the inner join silently discards proteins that have no pLoF
# record, so they never reach NA_score_protein.csv - confirm this is intended.
protein_pLI = pd.merge(valid_protein, pLoF, left_on='protein', right_on='gene', how='inner')
protein_pLI = protein_pLI.drop(columns=['gene', 'protein'])

# Collect the protein_ID of every merged row that has at least one NaN value.
na_score_protein = protein_pLI[protein_pLI.isna().any(axis=1)]
na_score_protein = na_score_protein[['protein_ID']]

# Rows with complete scores only.
protein_pLI_cleaned = protein_pLI.dropna()

# Average the scores per protein_ID. numeric_only=True restricts the mean to
# numeric columns: valid_protein is loaded with dtype=str, and pandas >= 2.0
# raises a TypeError when asked to average a string column.
pLI_means = (
    protein_pLI_cleaned
    .groupby('protein_ID')
    .mean(numeric_only=True)
    .reset_index()
)

# Save results
pLI_means.to_csv('protein_annotation.csv', index=False)
na_score_protein.to_csv('NA_score_protein.csv', index=False)  # Saving rows with NaN values


### Step1.3 Convert oe_lof and lof_z to p-values and log10-transform them.

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import poisson, norm

# Load the per-protein scores from protein_annotation.csv.
df = pd.read_csv("protein_annotation.csv")

# Lower-tail Poisson probability P(X <= obs_lof) with mean exp_lof, evaluated
# on whole columns at once instead of one Python call per row.
df["oe_lof_pval"] = poisson.cdf(df["obs_lof"], df["exp_lof"])

# Two-tailed p-value for lof_z. norm.sf returns the upper tail directly, which
# avoids the cancellation in 1 - cdf for large |z|; NaN inputs stay NaN.
df["lof_z_pval"] = 2 * norm.sf(df["lof_z"].abs())

# log10 transform; the 1e-10 offset keeps a zero p-value finite (it maps to -10).
df["log_oe_lof_pval"] = np.log10(df["oe_lof_pval"] + 1e-10)
df["log_lof_z_pval"] = np.log10(df["lof_z_pval"] + 1e-10)

# pLI gets no offset: non-positive or missing values become NaN instead.
df["log_pLI"] = np.log10(df["pLI"].where(df["pLI"] > 0))

# Select columns to keep
df = df[['protein_ID', "log_oe_lof_pval", "log_lof_z_pval", "log_pLI"]]

# Save the transformed dataset
df.to_csv("transformed_protein_annotation.csv", index=False)


### Step1.4 Delete interactions involving proteins that have an NA pLI score.

In [2]:
import pandas as pd

inter = pd.read_csv("inter_with_valid_lnc.csv")
invalid_protein = pd.read_csv("NA_score_protein.csv")

# Discard an interaction when either endpoint is listed in NA_score_protein.csv.
invalid_ids = invalid_protein['protein_ID']
touches_invalid = inter['Node_i'].isin(invalid_ids) | inter['Node_j'].isin(invalid_ids)
inter = inter[~touches_invalid]

inter.to_csv("valid_inter.csv", index=False)

# Alternative sample set kept for reference: cell_lines = ['HeLa','HepG2','K562', 'ac']
tissues = ['heart','lung','stomach']

# Keep only annotation rows whose lncRNA_ID still occurs in Node_i of the
# filtered interactions, one output file per tissue.
kept_node_i = inter['Node_i']
for tissue in tissues:
    lnc = pd.read_csv(f"{tissue}_annotation.csv")
    still_present = lnc['lncRNA_ID'].isin(kept_node_i)
    lnc = lnc[still_present]
    lnc.to_csv(f"valid_{tissue}_annotation.csv", index=False)

## Step2 Calculate edge weight.

In [3]:
import pandas as pd

# Read the filtered interaction list.
inter = pd.read_csv("valid_inter.csv")

# Treat (Node_i, Node_j) and (Node_j, Node_i) as the same undirected edge by
# placing the smaller ID first. Whole-column comparisons replace the per-row
# Python sort, which is slow on large interaction tables.
in_order = inter['Node_i'] <= inter['Node_j']
edges = pd.DataFrame({
    'source': inter['Node_i'].where(in_order, inter['Node_j']),
    'target': inter['Node_j'].where(in_order, inter['Node_i']),
})

# Weight = number of rows sharing the same canonical edge. sort=False keeps the
# edges in order of first appearance, and the output columns are named
# explicitly rather than overwritten by position.
inter = (
    edges
    .groupby(['source', 'target'], sort=False)
    .size()
    .reset_index(name='weight')
)

# Save the weighted edge list to a new CSV file
inter.to_csv('weighted_valid_inter.csv', index=False)

print("Weighted interaction file has been successfully saved!")


Weighted interaction file has been successfully saved!


In [None]:
import pandas as pd 

lppi = pd.read_csv("valid_inter.csv")

# Split the table into LPI and PPI by whether the first column starts with 'l'.
starts_with_l = lppi.iloc[:, 0].str.startswith('l')
LPI = lppi[starts_with_l]
PPI = lppi[~starts_with_l]

# LPI: distinct values in the first column (lncRNA), distinct values in the
# second column (protein), and the number of rows (edges).
lncRNA_count_LPI = LPI.iloc[:, 0].nunique()
protein_count_LPI = LPI.iloc[:, 1].nunique()
edges_count_LPI = len(LPI)

# PPI: distinct values pooled across both columns, and the number of rows.
ppi_nodes = PPI.iloc[:, [0, 1]].to_numpy().ravel()
protein_count_PPI = len(pd.unique(ppi_nodes))
edges_count_PPI = len(PPI)

# Print the summary; the "\n" prefix leaves a blank line before the PPI section.
report_lines = [
    "LPI统计结果：",
    f"lncRNA数量：{lncRNA_count_LPI}",
    f"protein数量：{protein_count_LPI}",
    f"边的数量：{edges_count_LPI}",
    "\nPPI统计结果：",
    f"protein数量：{protein_count_PPI}",
    f"边的数量：{edges_count_PPI}",
]
print("\n".join(report_lines))


LPI统计结果：
lncRNA数量：51899
protein数量：3165
边的数量：480546

PPI统计结果：
protein数量：17749
边的数量：1130619
