#### Filter the BioGRID data to obtain human and mouse PPI data.

In [1]:
# Human
import pandas as pd
import numpy as np
from pathlib import Path

# ======================= paths ======================
human_PPI_file = '../../data/raw/BIOGRID-ORGANISM-Homo_sapiens-4.4.248.tab3.txt'
human_edges_out = '../../data/PPI/human/ppi.csv'
human_proteins_out = '../../data/PPI/human/protein_in_ppi.csv'
# ====================================================

# Create the output folders up front so to_csv does not fail on a fresh checkout.
for out_path in (human_edges_out, human_proteins_out):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Read TAB3 as strings
df = pd.read_csv(human_PPI_file, sep='\t', dtype=str, low_memory=False)

# --- (1) Keep physical interactions only ---
# Experimental System Type column should be 'physical' (case-insensitive)
df = df[df['Experimental System Type'].str.lower() == 'physical']

# --- (2) Keep only rows where BOTH interactors are proteins ---
# In TAB3, UniProt evidence exists if Swiss-Prot or TrEMBL accession is present (not '-')
a_has_protein = (df['SWISS-PROT Accessions Interactor A'].fillna('-') != '-') | \
                (df['TREMBL Accessions Interactor A'].fillna('-') != '-')
b_has_protein = (df['SWISS-PROT Accessions Interactor B'].fillna('-') != '-') | \
                (df['TREMBL Accessions Interactor B'].fillna('-') != '-')
df = df[a_has_protein & b_has_protein]

# --- (3) Keep species = human-human ---
df = df[
    (df['Organism Name Interactor A'] == 'Homo sapiens') &
    (df['Organism Name Interactor B'] == 'Homo sapiens')
]

# --- (4) Pick readable symbols as endpoints ---
edges = df[['Official Symbol Interactor A', 'Official Symbol Interactor B']].copy()
edges.columns = ['Protein A', 'Protein B']

# Clean the symbols once, BEFORE anything is saved, so the edge file and the
# protein list contain exactly the same names (previously only the protein
# list was stripped, and missing symbols turned into the literal text 'nan').
edges = edges.dropna()
for col in ('Protein A', 'Protein B'):
    edges[col] = edges[col].str.strip()
edges = edges[(edges['Protein A'] != '') & (edges['Protein B'] != '')]

# --- (5) Remove self-loops ---
edges = edges[edges['Protein A'] != edges['Protein B']]

# --- (6) Save PPI edges (symbol-symbol) ---
# Repeated rows are kept on purpose: one row per retained interaction record.
edges.to_csv(human_edges_out, index=False)

# --- (7) Build unique protein list observed in PPI and save ---
proteins = pd.DataFrame({
    'protein': pd.concat([edges['Protein A'], edges['Protein B']], ignore_index=True)
})
proteins.drop_duplicates().to_csv(human_proteins_out, index=False)

print(f"[DONE] Human PPI edges: {human_edges_out}")
print(f"[DONE] Human proteins in PPI: {human_proteins_out}")

[DONE] Human PPI edges: ../../data/PPI/human/ppi.csv
[DONE] Human proteins in PPI: ../../data/PPI/human/protein_in_ppi.csv


In [2]:
# Mouse

import pandas as pd
import numpy as np  # NOTE: not referenced anywhere in this cell
from pathlib import Path

# ==================== User paths ====================
mouse_PPI_file = '../../data/raw/BIOGRID-ORGANISM-Mus_musculus-4.4.248.tab3.txt'
mouse_edges_out = '../../data/PPI/mouse/ppi.csv'
mouse_proteins_out = '../../data/PPI/mouse/protein_in_ppi.csv'
# ====================================================

# Create the output folders up front so to_csv does not fail on a fresh checkout.
for out_path in (mouse_edges_out, mouse_proteins_out):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Read TAB3 as strings
df = pd.read_csv(mouse_PPI_file, sep='\t', dtype=str, low_memory=False)

# --- (1) Keep physical interactions only ---
# Experimental System Type column should be 'physical' (case-insensitive)
df = df[df['Experimental System Type'].str.lower() == 'physical']

# --- (2) Keep only rows where BOTH interactors are proteins (Swiss-Prot or TrEMBL present) ---
# An accession cell counts as present when it is neither missing nor '-'
a_has_protein = (df['SWISS-PROT Accessions Interactor A'].fillna('-') != '-') | \
                (df['TREMBL Accessions Interactor A'].fillna('-') != '-')
b_has_protein = (df['SWISS-PROT Accessions Interactor B'].fillna('-') != '-') | \
                (df['TREMBL Accessions Interactor B'].fillna('-') != '-')
df = df[a_has_protein & b_has_protein]

# --- (3) Keep species = mouse-mouse ---
df = df[
    (df['Organism Name Interactor A'] == 'Mus musculus') &
    (df['Organism Name Interactor B'] == 'Mus musculus')
]

# --- (4) Build symbol-symbol edges ---
edges = df[['Official Symbol Interactor A', 'Official Symbol Interactor B']].copy()
edges.columns = ['Protein A', 'Protein B']

# Clean the symbols once, BEFORE anything is saved, so the edge file and the
# protein list contain exactly the same names (previously only the protein
# list was stripped, and missing symbols turned into the literal text 'nan').
edges = edges.dropna()
for col in ('Protein A', 'Protein B'):
    edges[col] = edges[col].str.strip()
edges = edges[(edges['Protein A'] != '') & (edges['Protein B'] != '')]

# --- (5) Remove self-loops ---
edges = edges[edges['Protein A'] != edges['Protein B']]

# --- (6) Save PPI edges ---
# Repeated rows are kept on purpose: one row per retained interaction record.
edges.to_csv(mouse_edges_out, index=False)

# --- (7) Save unique proteins ---
proteins = pd.DataFrame({
    'protein': pd.concat([edges['Protein A'], edges['Protein B']], ignore_index=True)
})
proteins.drop_duplicates().to_csv(mouse_proteins_out, index=False)

print(f"[DONE] Mouse PPI edges: {mouse_edges_out}")
print(f"[DONE] Mouse proteins in PPI: {mouse_proteins_out}")


[DONE] Mouse PPI edges: ../../data/PPI/mouse/ppi.csv
[DONE] Mouse proteins in PPI: ../../data/PPI/mouse/protein_in_ppi.csv


##### Construct an undirected weighted graph from the PPI data.

In [3]:
# Human
import pandas as pd

# Where the edge list is read from and where the weighted edge list goes
input_file = "../../data/PPI/human/ppi.csv"   # PPI file
output_file = "../../data/PPI/human/ppi_weighted.csv"

# Edge list with the two endpoint columns 'Protein A' and 'Protein B'
ppi = pd.read_csv(input_file)

# Put every edge into a canonical endpoint order (smaller name first) so that
# (A, B) and (B, A) end up in the same group.
endpoints = ppi[['Protein A', 'Protein B']]
ppi['protein_a'] = endpoints.min(axis=1)
ppi['protein_b'] = endpoints.max(axis=1)

# Edge weight = number of rows that share the same canonical endpoint pair
pair_counts = ppi.groupby(['protein_a', 'protein_b']).size()
edge_weights = pair_counts.rename('edge_weight').reset_index()

# Write the weighted edge list (protein_a, protein_b, edge_weight)
edge_weights.to_csv(output_file, index=False)

print(f"Weighted PPI network saved to {output_file}")


Weighted PPI network saved to ../../data/PPI/human/ppi_weighted.csv


In [4]:
# Mouse
import pandas as pd

# Where the edge list is read from and where the weighted edge list goes
input_file = "../../data/PPI/mouse/ppi.csv"   # PPI file
output_file = "../../data/PPI/mouse/ppi_weighted.csv"

# Edge list with the two endpoint columns 'Protein A' and 'Protein B'
ppi = pd.read_csv(input_file)

# Put every edge into a canonical endpoint order (smaller name first) so that
# (A, B) and (B, A) end up in the same group.
endpoints = ppi[['Protein A', 'Protein B']]
ppi['protein_a'] = endpoints.min(axis=1)
ppi['protein_b'] = endpoints.max(axis=1)

# Edge weight = number of rows that share the same canonical endpoint pair
pair_counts = ppi.groupby(['protein_a', 'protein_b']).size()
edge_weights = pair_counts.rename('edge_weight').reset_index()

# Write the weighted edge list (protein_a, protein_b, edge_weight)
edge_weights.to_csv(output_file, index=False)

print(f"Weighted PPI network saved to {output_file}")


Weighted PPI network saved to ../../data/PPI/mouse/ppi_weighted.csv


#### Merge the PPI and LPI networks to generate the LPPI network.

In [5]:
# Human
import pandas as pd

from pathlib import Path

# File paths
lpi_file = '../../data/LPI/human/lpi_weighted.csv'       # LPI edges (this cell uses 'lncRNA_id', 'protein', 'edge_weight')
lpi_protein_file = '../../data/LPI/human/protein.csv'    # LPI protein file (must contain 'protein' column)
ppi_file = '../../data/PPI/human/ppi_weighted.csv'       # Weighted PPI edges ('protein_a', 'protein_b', 'edge_weight')
ppi_protein_file = '../../data/PPI/human/protein_in_ppi.csv'  # PPI protein list (must contain 'protein' column)
output_file = '../../data/LPPI/human/lppi.csv'
protein_map_file = '../../data/LPPI/human/protein.csv'

# Create the output folder so the first run does not fail in to_csv.
for out_path in (output_file, protein_map_file):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Load input data
lpi_data = pd.read_csv(lpi_file)
lpi_proteins = pd.read_csv(lpi_protein_file)
ppi_data = pd.read_csv(ppi_file)
ppi_proteins = pd.read_csv(ppi_protein_file)

# --- 1) Collect all unique protein names from PPI and LPI ---
lpi_proteins = lpi_proteins[['protein']]
all_proteins = pd.concat([ppi_proteins[['protein']], lpi_proteins], ignore_index=True)
# Filter ROWS on the column itself. Indexing the frame with a boolean
# DataFrame (frame[frame != ""]) only blanks matching cells to NaN and keeps
# the row, which would later yield a bogus "pnan" ID.
all_proteins = all_proteins.dropna(subset=['protein'])
all_proteins = all_proteins[all_proteins['protein'].astype(str).str.strip() != ""]
all_proteins = all_proteins.drop_duplicates(subset='protein')

# --- 2) Assign IDs in the format "p<name>" ---
all_proteins['protein_id'] = "p" + all_proteins['protein'].astype(str)

# Save unified protein mapping
all_proteins.to_csv(protein_map_file, index=False)

protein_map = all_proteins[['protein', 'protein_id']]

# --- 3) Map new IDs back to PPI ---
ppi_data = ppi_data.merge(protein_map, left_on='protein_a', right_on='protein', how='left')
ppi_data = ppi_data.rename(columns={'protein_id': 'Node_i'}).drop(columns=['protein', 'protein_a'])

ppi_data = ppi_data.merge(protein_map, left_on='protein_b', right_on='protein', how='left')
ppi_data = ppi_data.rename(columns={'protein_id': 'Node_j'}).drop(columns=['protein', 'protein_b'])

# --- 4) Map new IDs back to LPI ---
lpi_data = lpi_data.merge(protein_map, on='protein', how='left')

# --- 5) Build edge tables in format: Node_i, Node_j, edge_weight ---
lpi_data = lpi_data[['lncRNA_id', 'protein_id', 'edge_weight']].rename(
    columns={'lncRNA_id': 'Node_i', 'protein_id': 'Node_j'}
)

# --- 6) Merge LPI and PPI edges into final LPPI file ---
lppi = pd.concat([lpi_data, ppi_data], ignore_index=True)

# The joins above are left joins, so a name missing from the mapping leaves an
# empty endpoint. Report it instead of letting it pass unnoticed.
missing_endpoints = int(lppi[['Node_i', 'Node_j']].isna().any(axis=1).sum())
if missing_endpoints:
    print(f"[WARN] {missing_endpoints} LPPI edges have a missing endpoint ID")

lppi.to_csv(output_file, index=False)

print(f"Processing complete! Final LPPI file saved to {output_file}")
print(f"Unified protein mapping saved to {protein_map_file}")


Processing complete! Final LPPI file saved to ../../data/LPPI/human/lppi.csv
Unified protein mapping saved to ../../data/LPPI/human/protein.csv


In [6]:
# Mouse
import pandas as pd

from pathlib import Path

# File paths
lpi_file = '../../data/LPI/mouse/lpi_weighted.csv'       # LPI edges (this cell uses 'lncRNA_id', 'protein', 'edge_weight')
lpi_protein_file = '../../data/LPI/mouse/protein.csv'    # LPI protein file (must contain 'protein' column)
ppi_file = '../../data/PPI/mouse/ppi_weighted.csv'       # Weighted PPI edges ('protein_a', 'protein_b', 'edge_weight')
ppi_protein_file = '../../data/PPI/mouse/protein_in_ppi.csv'  # PPI protein list (must contain 'protein' column)
output_file = '../../data/LPPI/mouse/lppi.csv'
protein_map_file = '../../data/LPPI/mouse/protein.csv'

# Create the output folder so the first run does not fail in to_csv.
for out_path in (output_file, protein_map_file):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Load input data
lpi_data = pd.read_csv(lpi_file)
lpi_proteins = pd.read_csv(lpi_protein_file)
ppi_data = pd.read_csv(ppi_file)
ppi_proteins = pd.read_csv(ppi_protein_file)

# --- 1) Collect all unique protein names from PPI and LPI ---
lpi_proteins = lpi_proteins[['protein']]
all_proteins = pd.concat([ppi_proteins[['protein']], lpi_proteins], ignore_index=True)
# Filter ROWS on the column itself. Indexing the frame with a boolean
# DataFrame (frame[frame != ""]) only blanks matching cells to NaN and keeps
# the row, which would later yield a bogus "pnan" ID.
all_proteins = all_proteins.dropna(subset=['protein'])
all_proteins = all_proteins[all_proteins['protein'].astype(str).str.strip() != ""]
all_proteins = all_proteins.drop_duplicates(subset='protein')

# --- 2) Assign IDs in the format "p<name>" ---
all_proteins['protein_id'] = "p" + all_proteins['protein'].astype(str)

# Save unified protein mapping
all_proteins.to_csv(protein_map_file, index=False)

protein_map = all_proteins[['protein', 'protein_id']]

# --- 3) Map new IDs back to PPI ---
ppi_data = ppi_data.merge(protein_map, left_on='protein_a', right_on='protein', how='left')
ppi_data = ppi_data.rename(columns={'protein_id': 'Node_i'}).drop(columns=['protein', 'protein_a'])

ppi_data = ppi_data.merge(protein_map, left_on='protein_b', right_on='protein', how='left')
ppi_data = ppi_data.rename(columns={'protein_id': 'Node_j'}).drop(columns=['protein', 'protein_b'])

# --- 4) Map new IDs back to LPI ---
lpi_data = lpi_data.merge(protein_map, on='protein', how='left')

# --- 5) Build edge tables in format: Node_i, Node_j, edge_weight ---
lpi_data = lpi_data[['lncRNA_id', 'protein_id', 'edge_weight']].rename(
    columns={'lncRNA_id': 'Node_i', 'protein_id': 'Node_j'}
)

# --- 6) Merge LPI and PPI edges into final LPPI file ---
lppi = pd.concat([lpi_data, ppi_data], ignore_index=True)

# The joins above are left joins, so a name missing from the mapping leaves an
# empty endpoint. Report it instead of letting it pass unnoticed.
missing_endpoints = int(lppi[['Node_i', 'Node_j']].isna().any(axis=1).sum())
if missing_endpoints:
    print(f"[WARN] {missing_endpoints} LPPI edges have a missing endpoint ID")

lppi.to_csv(output_file, index=False)

print(f"Processing complete! Final LPPI file saved to {output_file}")
print(f"Unified protein mapping saved to {protein_map_file}")


Processing complete! Final LPPI file saved to ../../data/LPPI/mouse/lppi.csv
Unified protein mapping saved to ../../data/LPPI/mouse/protein.csv


#### Fix protein names
right_human_protein.csv and right_mouse_protein.csv map raw protein names to their corrected names.

In [7]:
# Human
import pandas as pd

# === Configuration file paths ===
mapping_path = "right_human_protein.csv"
protein_path = "../../data/LPPI/human/protein.csv"
lppi_path = "../../data/LPPI/human/lppi.csv"

# === Read mapping table ===
# Read as a headerless two-column CSV: raw name, corrected (std) name.
# NOTE(review): the mouse cell reads its mapping file WITH a header row;
# confirm that the two mapping files really use different layouts.
df_map = pd.read_csv(mapping_path, header=None, names=["raw", "std"], dtype=str)
# Rows with a blank cell cannot be used for renaming and would make the
# "p" + name concatenation below fail with a TypeError, so drop them.
df_map = df_map.dropna(subset=["raw", "std"])
symbol_map = dict(zip(df_map["raw"], df_map["std"]))

# === Process protein file ===
df_protein = pd.read_csv(protein_path, dtype=str)
# Names without a mapping entry keep their current spelling.
df_protein["NewSymbol"] = df_protein["protein"].map(symbol_map).fillna(df_protein["protein"])
df_protein["NewProteinID"] = "p" + df_protein["NewSymbol"]

# Save the updated protein file
# Several raw names may collapse onto one corrected name, hence drop_duplicates.
update_protein = (
    df_protein[["NewSymbol", "NewProteinID"]]
    .rename(columns={"NewSymbol": "protein", "NewProteinID": "protein_id"})
    .drop_duplicates()
)
update_protein.to_csv("../../data/LPPI/human/protein_updated.csv", index=False)
print("Saved: protein_updated.csv")

# === Process LPPI file ===
df_lppi = pd.read_csv(lppi_path)

# Build protein ID map: p+raw → p+std
protein_id_map = {"p" + raw: "p" + std for raw, std in symbol_map.items()}
# Replace protein node IDs in LPPI using protein_id_map (p+raw → p+std)
def replace_protein_id(value):
    """Return the corrected node ID for *value*, or *value* itself if unmapped.

    Every key of ``protein_id_map`` starts with "p", so a plain lookup already
    leaves all other IDs untouched. Non-string values (for example a NaN left
    by an empty CSV cell) are returned unchanged instead of raising
    ``AttributeError`` on ``startswith``.
    """
    if not isinstance(value, str):
        return value
    return protein_id_map.get(value, value)

# Pass both endpoint columns through replace_protein_id
for node_col in ("Node_i", "Node_j"):
    df_lppi[node_col] = df_lppi[node_col].apply(replace_protein_id)

# Write the renamed edge list
# NOTE(review): if several raw names share one corrected name, the renamed
# table may contain repeated or self-referencing edges; nothing here merges
# them. Confirm whether downstream code expects that.
df_lppi.to_csv("../../data/LPPI/human/lppi_updated.csv", index=False)
print("Saved: lppi_updated.csv (includes LPI + PPI)")


Saved: protein_updated.csv
Saved: lppi_updated.csv (includes LPI + PPI)


In [8]:
# Mouse
import pandas as pd

# === Configuration file paths ===
mapping_path = "right_mouse_protein.csv"
protein_path = "../../data/LPPI/mouse/protein.csv"
lppi_path = "../../data/LPPI/mouse/lppi.csv"

# === Read mapping table ===
# Read with a header row; the columns 'raw' and 'std' are used below.
# NOTE(review): the human cell reads its mapping file WITHOUT a header row;
# confirm that the two mapping files really use different layouts.
df_map = pd.read_csv(mapping_path, dtype=str)
# Rows with a blank cell cannot be used for renaming and would make the
# "p" + name concatenation below fail with a TypeError, so drop them.
df_map = df_map.dropna(subset=["raw", "std"])
symbol_map = dict(zip(df_map["raw"], df_map["std"]))

# === Process protein file ===
df_protein = pd.read_csv(protein_path, dtype=str)
# Names without a mapping entry keep their current spelling.
df_protein["NewSymbol"] = df_protein["protein"].map(symbol_map).fillna(df_protein["protein"])
df_protein["NewProteinID"] = "p" + df_protein["NewSymbol"]

# Save the updated protein file
# Several raw names may collapse onto one corrected name, hence drop_duplicates.
update_protein = (
    df_protein[["NewSymbol", "NewProteinID"]]
    .rename(columns={"NewSymbol": "protein", "NewProteinID": "protein_id"})
    .drop_duplicates()
)
update_protein.to_csv("../../data/LPPI/mouse/protein_updated.csv", index=False)
print("Saved: protein_updated.csv")

# === Process LPPI file ===
df_lppi = pd.read_csv(lppi_path)

# Build protein ID map: p+raw → p+std
protein_id_map = {"p" + raw: "p" + std for raw, std in symbol_map.items()}

# Replace protein node IDs in LPPI using protein_id_map (p+raw → p+std)
def replace_protein_id(value):
    """Return the corrected node ID for *value*, or *value* itself if unmapped.

    Every key of ``protein_id_map`` starts with "p", so a plain lookup already
    leaves all other IDs untouched. Non-string values (for example a NaN left
    by an empty CSV cell) are returned unchanged instead of raising
    ``AttributeError`` on ``startswith``.
    """
    if not isinstance(value, str):
        return value
    return protein_id_map.get(value, value)

# Pass both endpoint columns through replace_protein_id
for node_col in ("Node_i", "Node_j"):
    df_lppi[node_col] = df_lppi[node_col].apply(replace_protein_id)

# Write the renamed edge list
# NOTE(review): if several raw names share one corrected name, the renamed
# table may contain repeated or self-referencing edges; nothing here merges
# them. Confirm whether downstream code expects that.
df_lppi.to_csv("../../data/LPPI/mouse/lppi_updated.csv", index=False)
print("Saved: lppi_updated.csv (includes LPI + PPI)")


Saved: protein_updated.csv
Saved: lppi_updated.csv (includes LPI + PPI)
