## Get Ensembl IDs of essential lncRNAs

In [None]:
# Human
import os
import re
import pandas as pd

# Directory that is scanned for the human Ensembl BED files
ensembl_dir = "../../reference_lncRNA/human/bed/ensembl/"

# ------------------------------------------------------------
# BED utilities: extract version number and sort by version
# ------------------------------------------------------------
def extract_version(filename: str) -> int:
    """
    Return the Ensembl release number encoded in a BED filename.

    The name must end in ``GRCh38.<digits>.bed``; the digits are returned
    as an int. Names that do not follow this pattern yield -1.
    """
    found = re.search(r'GRCh38\.(\d+)\.bed$', filename)
    if found is None:
        return -1
    return int(found.group(1))

def list_bed_files_sorted(bed_dir: str):
    """
    List all BED files in directory and sort them by version
    in descending order (newer versions first)
    """
    bed_files = [f for f in os.listdir(bed_dir) if f.endswith(".bed")]
    return sorted(bed_files, key=extract_version, reverse=True)

def read_ensembl_bed(bed_path: str) -> pd.DataFrame:
    """
    Load a tab-separated Ensembl BED file into a DataFrame.

    Only the first six columns are kept and renamed to
    chr, bed_start, bed_end, gene_name, ensembl_id, bed_strand.
    Lines starting with '#' are ignored. Start/end values that are not
    numeric become NaN, and rows lacking chr, start or end are dropped.

    Raises ValueError when the file has fewer than six columns.
    """
    column_names = ["chr", "bed_start", "bed_end", "gene_name", "ensembl_id", "bed_strand"]

    # The first column is forced to str so chromosome labels are never parsed as numbers
    raw = pd.read_csv(bed_path, sep="\t", header=None, comment="#", dtype={0: str})
    if raw.shape[1] < len(column_names):
        raise ValueError(f"{bed_path} has fewer than 6 columns")

    table = raw.iloc[:, :len(column_names)].copy()
    table.columns = column_names

    for coord in ("bed_start", "bed_end"):
        table[coord] = pd.to_numeric(table[coord], errors="coerce")

    return table.dropna(subset=["chr", "bed_start", "bed_end"])

# ------------------------------------------------------------
# Mapping function
# ------------------------------------------------------------
def map_geneid_from_beds(need_df: pd.DataFrame, bed_dir: str, *, min_overlap_ratio: float = 1.0) -> pd.DataFrame:
    """
    Assign an Ensembl gene ID to each row of ``need_df``.

    Three passes are run; every pass only touches rows that are still
    unmapped, so an earlier hit is never overwritten:

      1. "direct": ``gene_id`` already starts with "ENSG" and is copied.
      2. "name":   ``gene_name_single`` equals a gene name in a BED file.
      3. "coord":  a BED gene on the same chromosome and strand overlaps
                   at least ``min_overlap_ratio`` of the query interval.

    BED files are visited in the order returned by
    ``list_bed_files_sorted`` (highest version first).

    Parameters
    ----------
    need_df : DataFrame with the columns gene_id, gene_name_single,
        chr, start, end and strand.
    bed_dir : directory containing the Ensembl BED files.
    min_overlap_ratio : minimum value of overlap_len / query_len for a
        coordinate match, in (0, 1]. The default 1.0 keeps the original
        behaviour: the query must lie entirely inside the Ensembl gene.

    Returns
    -------
    A copy of ``need_df`` with two extra columns: ``ensembl_id`` and
    ``match_type`` ("direct", "name", "coord", or NA when unmapped).

    Raises
    ------
    ValueError if ``min_overlap_ratio`` is outside (0, 1].
    """
    if not 0 < min_overlap_ratio <= 1:
        raise ValueError("min_overlap_ratio must be in the interval (0, 1]")

    out = need_df.copy()
    out["ensembl_id"] = pd.NA  # final ensembl id
    out["match_type"] = pd.NA  # direct/name/coord

    bed_files = list_bed_files_sorted(bed_dir)

    # ============================================================
    # First pass: if gene_id is already ENSG -> directly assign
    # ============================================================
    mask_direct = out["gene_id"].notna() & out["gene_id"].astype(str).str.startswith("ENSG")
    out.loc[mask_direct, "ensembl_id"] = out.loc[mask_direct, "gene_id"]
    out.loc[mask_direct, "match_type"] = "direct"

    # ============================================================
    # Second pass: gene_name matching
    # ============================================================
    for bed_file in bed_files:
        pending = out["ensembl_id"].isna().to_numpy()
        if not pending.any():
            break

        bed = read_ensembl_bed(os.path.join(bed_dir, bed_file))

        # One ID per gene name: the first occurrence in the file wins
        name_to_id = (
            bed.dropna(subset=["gene_name", "ensembl_id"])
               .drop_duplicates(subset="gene_name")
               .set_index("gene_name")["ensembl_id"]
        )

        # Positional boolean mask, so the assignment does not depend on
        # the labels of out.index
        candidate_ids = out["gene_name_single"].map(name_to_id)
        hit = pending & candidate_ids.notna().to_numpy()
        if hit.any():
            out.loc[hit, "ensembl_id"] = candidate_ids.to_numpy()[hit]
            out.loc[hit, "match_type"] = "name"

    # ============================================================
    # Third pass: coordinate overlap matching (only remaining NA)
    # Condition:
    #   - same chr
    #   - same strand
    #   - overlap_len / query_len >= min_overlap_ratio
    # ============================================================
    for bed_file in bed_files:
        pending = out["ensembl_id"].isna()
        if not pending.any():
            break

        bed = read_ensembl_bed(os.path.join(bed_dir, bed_file))
        bed = bed.assign(span=bed["bed_end"] - bed["bed_start"])

        # Group once per file for fast (chr, strand) lookup
        by_chr_strand = {
            key: grp for key, grp in bed.groupby(["chr", "bed_strand"], sort=False)
        }

        queries = out.loc[pending, ["chr", "start", "end", "strand"]]
        for idx, chr_, start, end, strand in queries.itertuples(index=True, name=None):
            if strand not in {"+", "-"}:
                continue

            candidates = by_chr_strand.get((chr_, strand))
            if candidates is None:
                continue

            query_start = int(start)
            query_end = int(end)
            query_len = query_end - query_start
            if query_len <= 0:
                continue

            # min(query_end, bed_end) - max(query_start, bed_start), vectorised;
            # the value is <= 0 for genes that do not overlap the query
            overlap_len = (
                candidates["bed_end"].clip(upper=query_end)
                - candidates["bed_start"].clip(lower=query_start)
            )
            overlap_ratio = overlap_len / query_len

            keep = (overlap_len > 0) & (overlap_ratio >= min_overlap_ratio)
            if not keep.any():
                continue

            # Choose best hit:
            #  1) max overlap_ratio
            #  2) if tie -> shortest Ensembl gene span
            hits = candidates.loc[keep].assign(overlap_ratio=overlap_ratio[keep])
            best = hits.sort_values(["overlap_ratio", "span"], ascending=[False, True]).iloc[0]

            out.at[idx, "ensembl_id"] = best["ensembl_id"]
            out.at[idx, "match_type"] = "coord"

    return out


# ------------------------------------------------------------
# Main
# ------------------------------------------------------------
all_lnc = pd.read_csv("../../data/LPI/human/lncRNA.csv")

# Remove invalid gene_name entries (missing, "-" or empty).
# .copy() detaches the filtered frame so the column assignment below
# does not raise SettingWithCopyWarning.
# NOTE(review): rows with an unusable gene_name are dropped even when their
# gene_id is already an Ensembl ID - confirm that this is intended.
valid_name = all_lnc["gene_name"].notna() & ~all_lnc["gene_name"].isin(["-", ""])
all_lnc = all_lnc[valid_name].copy()

# Split multiple gene names (";"-separated) into one row per name
all_lnc["gene_name_single"] = all_lnc["gene_name"].astype(str).str.split(";")
need = all_lnc.explode("gene_name_single", ignore_index=True)

need["gene_name_single"] = need["gene_name_single"].astype(str).str.strip()
need = need[need["gene_name_single"] != ""]

# Load BED coordinates. The chromosome column is read as str, as
# read_ensembl_bed does, so chromosome labels compare equal during
# coordinate matching.
lnc_bed = pd.read_csv(
    "../../process/construct_lppi/human_lncRNA_0-based.bed",
    sep="\t", header=None, dtype={0: str}
)
lnc_bed.columns = ["chr", "start", "end", "identifier", "score", "strand"]
lnc_bed = lnc_bed[["chr", "start", "end", "identifier", "strand"]]

# Keep only lncRNAs that have coordinates
need = need.merge(lnc_bed, on="identifier", how="inner")

need = need[["identifier", "gene_name_single", "gene_id", "chr", "start", "end", "strand"]].copy()
mapped = map_geneid_from_beds(need, ensembl_dir)

# Output mapped Ensembl IDs (unique identifier / ensembl_id pairs)
out_ids = "mapped_ensembl_ids_human.csv"
mapped.dropna(subset=["ensembl_id"])[["identifier", "ensembl_id"]].drop_duplicates().sort_values(
    by=["identifier", "ensembl_id"]
).to_csv(out_ids, index=False)

# Output unmatched genes
out_unmatched = "unmatched_genes_human.csv"
mapped[mapped["ensembl_id"].isna()][
    ["identifier", "gene_name_single", "gene_id", "chr", "start", "end", "strand"]
].drop_duplicates().to_csv(out_unmatched, index=False)

print(f"mapped IDs -> {out_ids}")
print(f"unmatched -> {out_unmatched}")

# Number of rows per matching strategy; NA counts the unmapped rows
print("\nMatch summary:")
print(mapped["match_type"].value_counts(dropna=False))



mapped IDs -> mapped_ensembl_ids_human.csv
unmatched -> unmatched_genes_human.csv

Match summary:
match_type
name      29550
direct    11409
<NA>       6469
coord      6403
Name: count, dtype: int64


In [None]:
# Mouse
import os
import re
import pandas as pd

# Directory that is scanned for the mouse Ensembl BED files
ensembl_dir = "../../reference_lncRNA/mouse/bed/ensembl/"

# ------------------------------------------------------------
# BED utilities: extract version number and sort by version
# ------------------------------------------------------------
def extract_version(filename: str) -> int:
    """
    Return the Ensembl release number encoded in a BED filename.

    The name must end in ``GRCm38.<digits>.bed``; the digits are returned
    as an int. Names that do not follow this pattern yield -1.
    """
    found = re.search(r'GRCm38\.(\d+)\.bed$', filename)
    if found is None:
        return -1
    return int(found.group(1))

def list_bed_files_sorted(bed_dir: str):
    """
    List only GRCm38 BED files and sort by version (desc).
    """
    bed_files = [
        f for f in os.listdir(bed_dir)
        if re.match(r"Mus_musculus\.GRCm38\.\d+\.bed$", f)
    ]
    return sorted(bed_files, key=extract_version, reverse=True)

def read_ensembl_bed(bed_path: str) -> pd.DataFrame:
    """
    Load a tab-separated Ensembl BED file into a DataFrame.

    Only the first six columns are kept and renamed to
    chr, bed_start, bed_end, gene_name, ensembl_id, bed_strand.
    Lines starting with '#' are ignored. Start/end values that are not
    numeric become NaN, and rows lacking chr, start or end are dropped.

    Raises ValueError when the file has fewer than six columns.
    """
    column_names = ["chr", "bed_start", "bed_end", "gene_name", "ensembl_id", "bed_strand"]

    # The first column is forced to str so chromosome labels are never parsed as numbers
    raw = pd.read_csv(bed_path, sep="\t", header=None, comment="#", dtype={0: str})
    if raw.shape[1] < len(column_names):
        raise ValueError(f"{bed_path} has fewer than 6 columns")

    table = raw.iloc[:, :len(column_names)].copy()
    table.columns = column_names

    for coord in ("bed_start", "bed_end"):
        table[coord] = pd.to_numeric(table[coord], errors="coerce")

    return table.dropna(subset=["chr", "bed_start", "bed_end"])

# ------------------------------------------------------------
# Mapping function
# ------------------------------------------------------------
def map_geneid_from_beds(need_df: pd.DataFrame, bed_dir: str, *, min_overlap_ratio: float = 1.0) -> pd.DataFrame:
    """
    Assign an Ensembl gene ID to each row of ``need_df``.

    Three passes are run; every pass only touches rows that are still
    unmapped, so an earlier hit is never overwritten:

      1. "direct": ``gene_id`` already starts with "ENSMUSG" and is copied.
      2. "name":   ``gene_name_single`` equals a gene name in a BED file.
      3. "coord":  a BED gene on the same chromosome and strand overlaps
                   at least ``min_overlap_ratio`` of the query interval.

    BED files are visited in the order returned by
    ``list_bed_files_sorted`` (highest version first).

    Parameters
    ----------
    need_df : DataFrame with the columns gene_id, gene_name_single,
        chr, start, end and strand.
    bed_dir : directory containing the Ensembl BED files.
    min_overlap_ratio : minimum value of overlap_len / query_len for a
        coordinate match, in (0, 1]. The default 1.0 keeps the original
        behaviour: the query must lie entirely inside the Ensembl gene.

    Returns
    -------
    A copy of ``need_df`` with two extra columns: ``ensembl_id`` and
    ``match_type`` ("direct", "name", "coord", or NA when unmapped).

    Raises
    ------
    ValueError if ``min_overlap_ratio`` is outside (0, 1].
    """
    if not 0 < min_overlap_ratio <= 1:
        raise ValueError("min_overlap_ratio must be in the interval (0, 1]")

    out = need_df.copy()
    out["ensembl_id"] = pd.NA  # final ensembl id
    out["match_type"] = pd.NA  # direct/name/coord

    bed_files = list_bed_files_sorted(bed_dir)

    # ============================================================
    # First pass: direct mapping if gene_id is ENSMUSG
    # ============================================================
    mask_direct = out["gene_id"].notna() & out["gene_id"].astype(str).str.startswith("ENSMUSG")
    out.loc[mask_direct, "ensembl_id"] = out.loc[mask_direct, "gene_id"]
    out.loc[mask_direct, "match_type"] = "direct"

    # ============================================================
    # Second pass: name matching across BED files
    # ============================================================
    for bed_file in bed_files:
        pending = out["ensembl_id"].isna().to_numpy()
        if not pending.any():
            break

        bed = read_ensembl_bed(os.path.join(bed_dir, bed_file))

        # One ID per gene name: the first occurrence in the file wins
        name_to_id = (
            bed.dropna(subset=["gene_name", "ensembl_id"])
               .drop_duplicates(subset="gene_name")
               .set_index("gene_name")["ensembl_id"]
        )

        # Positional boolean mask, so the assignment does not depend on
        # the labels of out.index
        candidate_ids = out["gene_name_single"].map(name_to_id)
        hit = pending & candidate_ids.notna().to_numpy()
        if hit.any():
            out.loc[hit, "ensembl_id"] = candidate_ids.to_numpy()[hit]
            out.loc[hit, "match_type"] = "name"

    # ============================================================
    # Third pass: coordinate overlap matching (remaining NA only)
    # Condition:
    #   - same chr
    #   - same strand
    #   - overlap_len / query_len >= min_overlap_ratio
    # ============================================================
    for bed_file in bed_files:
        pending = out["ensembl_id"].isna()
        if not pending.any():
            break

        bed = read_ensembl_bed(os.path.join(bed_dir, bed_file))
        bed = bed.assign(span=bed["bed_end"] - bed["bed_start"])

        # Group once per file for fast (chr, strand) lookup
        by_chr_strand = {
            key: grp for key, grp in bed.groupby(["chr", "bed_strand"], sort=False)
        }

        queries = out.loc[pending, ["chr", "start", "end", "strand"]]
        for idx, chr_, start, end, strand in queries.itertuples(index=True, name=None):
            if strand not in {"+", "-"}:
                continue

            candidates = by_chr_strand.get((chr_, strand))
            if candidates is None:
                continue

            query_start = int(start)
            query_end = int(end)
            query_len = query_end - query_start
            if query_len <= 0:
                continue

            # min(query_end, bed_end) - max(query_start, bed_start), vectorised;
            # the value is <= 0 for genes that do not overlap the query
            overlap_len = (
                candidates["bed_end"].clip(upper=query_end)
                - candidates["bed_start"].clip(lower=query_start)
            )
            overlap_ratio = overlap_len / query_len

            keep = (overlap_len > 0) & (overlap_ratio >= min_overlap_ratio)
            if not keep.any():
                continue

            # Choose best hit:
            #  1) max overlap_ratio
            #  2) if tie -> shortest Ensembl gene span
            hits = candidates.loc[keep].assign(overlap_ratio=overlap_ratio[keep])
            best = hits.sort_values(["overlap_ratio", "span"], ascending=[False, True]).iloc[0]

            out.at[idx, "ensembl_id"] = best["ensembl_id"]
            out.at[idx, "match_type"] = "coord"

    return out


# ------------------------------------------------------------
# Main
# ------------------------------------------------------------
all_lnc = pd.read_csv("../../data/LPI/mouse/lncRNA.csv")

# Remove invalid gene_name entries (missing, "-" or empty).
# .copy() detaches the filtered frame so the column assignment below
# does not raise SettingWithCopyWarning.
# NOTE(review): rows with an unusable gene_name are dropped even when their
# gene_id is already an Ensembl ID - confirm that this is intended.
valid_name = all_lnc["gene_name"].notna() & ~all_lnc["gene_name"].isin(["-", ""])
all_lnc = all_lnc[valid_name].copy()

# Split multiple gene names (";"-separated) into one row per name
all_lnc["gene_name_single"] = all_lnc["gene_name"].astype(str).str.split(";")
need = all_lnc.explode("gene_name_single", ignore_index=True)

need["gene_name_single"] = need["gene_name_single"].astype(str).str.strip()
need = need[need["gene_name_single"] != ""]

# Load BED coordinates for lncRNAs. The chromosome column is read as str,
# as read_ensembl_bed does, so chromosome labels compare equal during
# coordinate matching.
lnc_bed = pd.read_csv(
    "../../process/construct_lppi/mouse_lncRNA_0-based.bed",
    sep="\t", header=None, dtype={0: str}
)
lnc_bed.columns = ["chr", "start", "end", "identifier", "score", "strand"]
lnc_bed = lnc_bed[["chr", "start", "end", "identifier", "strand"]]

# Keep only lncRNAs that have coordinates
need = need.merge(lnc_bed, on="identifier", how="inner")

need = need[["identifier", "gene_name_single", "gene_id", "chr", "start", "end", "strand"]].copy()
mapped = map_geneid_from_beds(need, ensembl_dir)

# Output mapped Ensembl IDs (unique identifier / ensembl_id pairs)
out_ids = "mapped_ensembl_ids_mouse.csv"
mapped.dropna(subset=["ensembl_id"])[["identifier", "ensembl_id"]].drop_duplicates().sort_values(
    by=["identifier", "ensembl_id"]
).to_csv(out_ids, index=False)

# Output unmatched genes
out_unmatched = "unmatched_genes_mouse.csv"
mapped[mapped["ensembl_id"].isna()][
    ["identifier", "gene_name_single", "gene_id", "chr", "start", "end", "strand"]
].drop_duplicates().to_csv(out_unmatched, index=False)

print(f"mapped IDs -> {out_ids}")
print(f"unmatched -> {out_unmatched}")

# Number of rows per matching strategy; NA counts the unmapped rows
print("\nMatch summary:")
print(mapped["match_type"].value_counts(dropna=False))



mapped IDs -> mapped_ensembl_ids_mouse.csv
unmatched -> unmatched_genes_mouse.csv

Match summary:
match_type
coord     14041
name       9121
<NA>       7858
direct     7164
Name: count, dtype: int64


In [1]:
import pandas as pd

# Percentage cutoff that is part of the input file names (BC_top{k}pct_...)
k = 40
species = 'human'

# lnc_mapping links lncRNA_id to member_id;
# ensembl_mapping links identifier to ensembl_id
lnc_mapping = pd.read_csv(f"../../data/LPI/{species}/lncRNA_mapping.csv")
ensembl_mapping = pd.read_csv(f"mapped_ensembl_ids_{species}.csv")

for tissue in ['heart', 'lung', 'stomach', 'common']:
    # Headerless one-column file of essential lncRNA ids
    ess_lnc = pd.read_csv(
        f"../ess_number/filtered/{species}/BC_top{k}pct_{species}_{tissue}_esslnc.csv",
        header=None, names=['lncRNA_id']
    )

    # lncRNA_id -> member_id -> ensembl_id; ids without a mapping are dropped
    ess_lnc = (
        ess_lnc
        .merge(lnc_mapping, on='lncRNA_id', how='inner')
        .merge(ensembl_mapping, left_on='member_id', right_on='identifier', how='inner')
    )

    # One unique Ensembl ID per line, no header row
    # (header=False is the documented way to omit it)
    ess_lnc[['ensembl_id']].drop_duplicates().to_csv(
        f"{species}_{tissue}.txt",
        index=False, header=False
    )


In [2]:
import pandas as pd

# Percentage cutoff that is part of the input file names (BC_top{k}pct_...)
k = 60
species = 'mouse'

# lnc_mapping links lncRNA_id to member_id;
# ensembl_mapping links identifier to ensembl_id
lnc_mapping = pd.read_csv(f"../../data/LPI/{species}/lncRNA_mapping.csv")
ensembl_mapping = pd.read_csv(f"mapped_ensembl_ids_{species}.csv")

for tissue in ['heart', 'lung', 'brain', 'common']:
    # Headerless one-column file of essential lncRNA ids
    ess_lnc = pd.read_csv(
        f"../ess_number/filtered/{species}/BC_top{k}pct_{species}_{tissue}_esslnc.csv",
        header=None, names=['lncRNA_id']
    )

    # lncRNA_id -> member_id -> ensembl_id; ids without a mapping are dropped
    ess_lnc = (
        ess_lnc
        .merge(lnc_mapping, on='lncRNA_id', how='inner')
        .merge(ensembl_mapping, left_on='member_id', right_on='identifier', how='inner')
    )

    # One unique Ensembl ID per line, no header row
    # (header=False is the documented way to omit it)
    ess_lnc[['ensembl_id']].drop_duplicates().to_csv(
        f"{species}_{tissue}.txt",
        index=False, header=False
    )


### Statistical results of GO terms (BP, CC, MF)

In [4]:
import pandas as pd

species = 'mouse'
item = (
    ['heart', 'lung', 'brain', 'common']
    if species == "mouse"
    else ['heart', 'lung', 'stomach', 'common']
)

# GO categories to report, in output order
go_categories = ['GOTERM_BP_DIRECT', 'GOTERM_CC_DIRECT', 'GOTERM_MF_DIRECT']
wanted_columns = ['Category', 'Term', 'P-Value', 'Fold Enrichment', 'FDR']

for i in item:
    # Read the DAVID chart for this tissue and keep the columns of interest
    chart = pd.read_csv(f'./DAVID_chart/{species}_{i}.csv')[wanted_columns]

    # Keep significant (FDR < 0.05) and enriched (Fold Enrichment > 1) terms
    enriched = chart[(chart['FDR'] < 0.05) & (chart['Fold Enrichment'] > 1)]

    # For each category, take the 5 terms with the smallest p-value
    top_terms = [
        enriched[enriched['Category'] == category].sort_values('P-Value').head(5)
        for category in go_categories
    ]
    df_go = pd.concat(top_terms, ignore_index=True)
    df_go.to_csv(f"./filtered_go/{species}_{i}.csv", index=False)



In [5]:
import pandas as pd
import os

# Filtered GO tables to combine; the file stem is "<species>_<tissue>"
csv_files = [
    "./filtered_go/human_heart.csv",
    "./filtered_go/human_lung.csv",
    "./filtered_go/human_stomach.csv",
    "./filtered_go/human_common.csv",
    "./filtered_go/mouse_heart.csv",
    "./filtered_go/mouse_lung.csv",
    "./filtered_go/mouse_brain.csv",
    "./filtered_go/mouse_common.csv",
]

output_excel_file = "go_data.xlsx"

all_dfs = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    # Recover species and tissue from the file stem, e.g. "human_heart"
    file_name = os.path.basename(csv_file)
    base, _extension = os.path.splitext(file_name)
    species, tissue = base.split("_", 1)

    # Metadata goes in front of the GO columns
    df.insert(0, "Species", species)
    df.insert(1, "Tissue", tissue)
    all_dfs.append(df)

# Stack every table and write the result to a single sheet named "GO"
merged_df = pd.concat(all_dfs, ignore_index=True)
merged_df.to_excel(output_excel_file, sheet_name="GO", index=False)

print(f"✅ Merged {len(csv_files)} CSV files into one sheet in {output_excel_file}")


✅ Merged 8 CSV files into one sheet in go_data.xlsx
