### Calculate GIC score

In [1]:
# Human GIC scoring with transcript dedup (by identical sequence)

import math
import pandas as pd
from tqdm import tqdm

# ----------------------- input / output locations (human) -----------------------
# Inputs
mapping_file = '../../data/LPI/human/lncRNA_mapping.csv'  # CSV with lncRNA_id and member_id columns
lnc_trans_path = './human/filtered_lnc_trans.csv'         # identifier, transcript_id pairs (read without a header row)
MFE_path = './human/trans_MFE.csv'                        # CSV with transcript_id and MFE columns
seq_path = './human/transcript_sequences.fasta'           # transcript sequences in FASTA format

# Output
sorted_GIC_path = './human/sorted_GIC_score.csv'
# ---------------------------------------------------------------------------------


# ---------------- nucleotide triplets ----------------
# Lower-case alphabet; sequences are lower-cased before counting.
BASES = list('atcg')
# All 64 ordered triplets, first base varying slowest.
TRIPLETS = [first + second + third
            for first in BASES
            for second in BASES
            for third in BASES]
mers = dict()  # shared scratch dict that stat3mer fills with raw triplet counts


# ---------------- I/O helpers ----------------
def read_mapping(filename):
    """Load the lncRNA membership table and group it by lncRNA.

    The CSV must provide ``lncRNA_id`` and ``member_id`` columns; every cell
    is read as a string.

    Returns:
        dict mapping each ``lncRNA_id`` to the list of its ``member_id``
        values.
    """
    table = pd.read_csv(filename, dtype=str)
    members_by_lnc = table.groupby('lncRNA_id')['member_id']
    return {lnc_id: list(members) for lnc_id, members in members_by_lnc}


def read_lnc_trans(filename):
    """Load identifier/transcript pairs and group transcripts by identifier.

    The file is parsed as a header-less CSV whose two columns are named
    ``identifier`` and ``transcript_id`` after loading; every cell is read
    as a string.

    Returns:
        dict mapping each identifier to the list of its transcript ids.
    """
    pairs = pd.read_csv(filename, dtype=str, header=None)
    pairs.columns = ['identifier', 'transcript_id']
    transcripts_by_identifier = pairs.groupby('identifier')['transcript_id']
    return {ident: list(tids) for ident, tids in transcripts_by_identifier}


def read_trans_mfe(filename):
    """Load the per-transcript MFE table.

    The CSV must provide ``transcript_id`` (read as string) and ``MFE``
    columns. If a transcript id occurs more than once, the last row wins.

    Returns:
        dict mapping transcript id to its MFE value.
    """
    table = pd.read_csv(filename, dtype={'transcript_id': str})
    transcript_ids = table['transcript_id'].astype(str).tolist()
    mfe_values = table['MFE'].tolist()
    return dict(zip(transcript_ids, mfe_values))


def read_fasta(filename):
    """Parse a FASTA file into a dict of header -> sequence.

    The key is the whole header line after ``>`` (surrounding whitespace
    removed). Sequence lines are stripped and concatenated without
    separators. Lines that appear before the first header are ignored.
    """
    sequences = {}
    current_id = None
    pieces = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not raw_line.startswith('>'):
                pieces.append(text)
                continue
            # A new record starts: store the one collected so far, if any.
            if current_id is not None:
                sequences[current_id] = ''.join(pieces)
            current_id, pieces = text[1:], []
    # Store the final record, which has no following header to trigger it.
    if current_id is not None:
        sequences[current_id] = ''.join(pieces)
    return sequences


# -------------- sequence windows & 3-mer freq --------------
def slidingWindow(seq, l, win, step=1):
    """Cut ``seq`` into fragments of ``win`` characters taken every ``step``.

    Args:
        seq: sequence to cut.
        l: length of ``seq`` as supplied by the caller.
        win: window size.
        step: distance between the starts of consecutive windows.

    Returns:
        A list of fragments. When ``(l - win)`` is not a multiple of
        ``step`` the last ``win`` characters are appended as one extra
        fragment so the end of the sequence is covered. When ``win >= l``
        the list holds ``seq`` itself as its only element.
    """
    length = l
    remainder = (length - win) % step
    if win >= length:
        # Previously the bare string was returned here, giving callers a str
        # in this case and a list otherwise; always return a list.
        return [seq]
    fragments = [seq[start:start + win]
                 for start in range(0, len(seq) - win + 1, step)]
    if remainder > 0:
        fragments.append(seq[(length - win):])
    return fragments

def stat3mer(seq, l):
    """Return the relative frequency of every triplet of TRIPLETS in ``seq``.

    Overlapping triplets are counted (window 3, step 1) and each count is
    divided by ``l - 2``, the number of triplet positions.

    Args:
        seq: sequence to analyse; only triplets listed in TRIPLETS are
            reported, so it should use the same case as TRIPLETS.
        l: length of ``seq`` (expected to equal ``len(seq)``).

    Returns:
        dict with one entry per triplet in TRIPLETS. All frequencies are
        0.0 when ``l`` is below 3; such input used to raise
        ZeroDivisionError (``l == 2``) or divide by a negative count.

    Side effect:
        The module-level ``mers`` dict is overwritten with the raw counts,
        as before.
    """
    from collections import Counter

    num3mer = l - 2
    if num3mer > 0:
        # One pass over the sequence instead of one full scan per triplet.
        counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    else:
        counts = {}

    freq = {}
    for triplet in TRIPLETS:
        mers[triplet] = counts.get(triplet, 0)
        freq[triplet] = mers[triplet] / float(num3mer) if num3mer > 0 else 0.0
    return freq


# -------------- main --------------
if __name__ == '__main__':
    # Pipeline: load inputs -> collapse transcripts with identical sequence ->
    # compute per-transcript features -> average them per lncRNA -> apply the
    # logistic model -> write scores sorted from highest to lowest.

    # 1) load mappings
    lnc_to_identifiers = read_mapping(mapping_file)              # lncRNA_id -> [identifier...]
    id_to_transcripts = read_lnc_trans(lnc_trans_path)           # identifier -> [transcript_id...]
    trans_to_mfe = read_trans_mfe(MFE_path)                      # transcript_id -> MFE
    trans_seq_raw = read_fasta(seq_path)                         # transcript_id -> sequence (raw)

    # 2) transcript dedup by identical sequence. Sequences are compared after
    #    removing CR/LF characters and lower-casing; the first transcript seen
    #    with a given sequence becomes the representative of that sequence.
    seq_to_rep = {}                  # normalized_seq -> representative transcript_id
    rep_seq = {}                     # representative transcript_id -> normalized_seq
    trans_to_rep = {}                # transcript_id -> representative transcript_id

    for tid, raw in tqdm(trans_seq_raw.items(), desc="Deduplicating transcripts by sequence"):
        norm = raw.replace('\n', '').replace('\r', '').lower()
        if norm not in seq_to_rep:
            seq_to_rep[norm] = tid
            rep_seq[tid] = norm
        trans_to_rep[tid] = seq_to_rep[norm]

    # MFE per representative, built in a single pass: the representative is
    # the first member of its group, so its own MFE is recorded when present;
    # otherwise the first duplicate (in FASTA order) that has an MFE is used.
    # This replaces a full scan of trans_to_rep for every representative
    # lacking an MFE.
    rep_to_mfe = {}
    for tid, rep in trans_to_rep.items():
        if rep not in rep_to_mfe and tid in trans_to_mfe:
            rep_to_mfe[rep] = trans_to_mfe[tid]

    # 3) compute features for representative transcripts only (those that have MFE)
    LRMODEL_FEATURES_7 = ['intercept', 'length', 'mfe/L', 'cga', 'gcg', 'tcg', 'acg', 'tca']
    LRMODEL_COEFS_7 = [0.7417, 2.612e-04, 4.295, 48.66, 15.64, 76.23, -1.113, -60.29]
    LRMODEL_7 = dict(zip(LRMODEL_FEATURES_7, LRMODEL_COEFS_7))

    features_trans = {}  # rep_tid -> feature dict
    for rep_tid, norm_seq in tqdm(rep_seq.items(), desc="Computing transcript features"):
        mfe_val = rep_to_mfe.get(rep_tid)
        if mfe_val is None:
            # cannot compute without MFE
            continue

        L = len(norm_seq)
        f = {'trans': rep_tid, 'length': L, 'mfe/L': float(mfe_val) / L if L > 0 else 0.0}
        freq = stat3mer(norm_seq, L)
        for t in TRIPLETS:
            f[t] = freq[t]
        features_trans[rep_tid] = f

    # 4) aggregate to lncRNA level (dedup transcripts by sequence within each lncRNA)
    averaged_features = ['length', 'mfe/L'] + TRIPLETS
    lncRNA_GIC_score = {}
    for lnc_id, identifiers in tqdm(lnc_to_identifiers.items(), desc="Aggregating to lncRNA"):
        # collect all transcripts from all identifiers
        all_transcripts = []
        for ident in identifiers:
            all_transcripts.extend(id_to_transcripts.get(ident, []))

        if not all_transcripts:
            continue

        # map to representative (sequence dedup) and keep unique reps
        rep_set = {trans_to_rep.get(tid, None) for tid in all_transcripts}
        rep_set.discard(None)

        # keep only reps with computed features (i.e., have sequence + MFE).
        # Sorted so the floating-point sums below are accumulated in the same
        # order on every run; plain set iteration order varies between runs.
        usable_reps = sorted(r for r in rep_set if r in features_trans)
        if not usable_reps:
            continue

        # average features across usable representative transcripts
        agg = {name: 0.0 for name in averaged_features}
        for rep_tid in usable_reps:
            trf = features_trans[rep_tid]
            for name in averaged_features:
                agg[name] += trf[name]

        n = float(len(usable_reps))
        for name in averaged_features:
            agg[name] /= n

        # logistic regression (7-feature model); terms are added in the order
        # of LRMODEL_FEATURES_7, starting from the intercept
        tmp = LRMODEL_7['intercept']
        for name in LRMODEL_FEATURES_7[1:]:
            tmp += LRMODEL_7[name] * agg[name]
        try:
            odds = math.exp(tmp)
            gic = odds / (odds + 1.0)
        except OverflowError:
            # exp() overflows for very large tmp; the logistic value is 1 there
            gic = 1.0
        lncRNA_GIC_score[lnc_id] = gic

    # 5) export sorted scores (highest first); rows whose score is NaN are removed
    lncRNA_scores_df_sorted = (pd.Series(lncRNA_GIC_score, name='GIC_score')
                                 .rename_axis('lncRNA_id')
                                 .reset_index()
                                 .sort_values('GIC_score', ascending=False))
    lncRNA_scores_df_sorted = lncRNA_scores_df_sorted.dropna(subset=["GIC_score"])
    lncRNA_scores_df_sorted.to_csv(sorted_GIC_path, index=False)

    print("GIC scores have been calculated and saved successfully.")
    print(f"Output file is located at: {sorted_GIC_path}")


Deduplicating transcripts by sequence: 100%|██████████| 163300/163300 [00:00<00:00, 321733.69it/s]
Computing transcript features: 100%|██████████| 154676/154676 [02:13<00:00, 1155.55it/s]
Aggregating to lncRNA: 100%|██████████| 35375/35375 [00:01<00:00, 24004.10it/s]

GIC scores have been calculated and saved successfully.
Output file is located at: ./human/sorted_GIC_score.csv





In [2]:
# Mouse GIC scoring with transcript dedup (by identical sequence)

import math
import pandas as pd
from tqdm import tqdm

# ----------------------- input / output locations (mouse) -----------------------
# Inputs
mapping_file = '../../data/LPI/mouse/lncRNA_mapping.csv'  # CSV with lncRNA_id and member_id columns
lnc_trans_path = './mouse/filtered_lnc_trans.csv'         # identifier, transcript_id pairs (read without a header row)
MFE_path = './mouse/trans_MFE.csv'                        # CSV with transcript_id and MFE columns
seq_path = './mouse/transcript_sequences.fasta'           # transcript sequences in FASTA format

# Output
sorted_GIC_path = './mouse/sorted_GIC_score.csv'
# ---------------------------------------------------------------------------------


# ---------------- nucleotide triplets ----------------
# Lower-case alphabet; sequences are lower-cased before counting.
BASES = list('atcg')
# All 64 ordered triplets, first base varying slowest.
TRIPLETS = [first + second + third
            for first in BASES
            for second in BASES
            for third in BASES]
mers = dict()  # shared scratch dict that stat3mer fills with raw triplet counts


# ---------------- I/O helpers ----------------
def read_mapping(filename):
    """Load the lncRNA membership table and group it by lncRNA.

    The CSV must provide ``lncRNA_id`` and ``member_id`` columns; every cell
    is read as a string.

    Returns:
        dict mapping each ``lncRNA_id`` to the list of its ``member_id``
        values.
    """
    table = pd.read_csv(filename, dtype=str)
    members_by_lnc = table.groupby('lncRNA_id')['member_id']
    return {lnc_id: list(members) for lnc_id, members in members_by_lnc}


def read_lnc_trans(filename):
    """Load identifier/transcript pairs and group transcripts by identifier.

    The file is parsed as a header-less CSV whose two columns are named
    ``identifier`` and ``transcript_id`` after loading; every cell is read
    as a string.

    Returns:
        dict mapping each identifier to the list of its transcript ids.
    """
    pairs = pd.read_csv(filename, dtype=str, header=None)
    pairs.columns = ['identifier', 'transcript_id']
    transcripts_by_identifier = pairs.groupby('identifier')['transcript_id']
    return {ident: list(tids) for ident, tids in transcripts_by_identifier}


def read_trans_mfe(filename):
    """Load the per-transcript MFE table.

    The CSV must provide ``transcript_id`` (read as string) and ``MFE``
    columns. If a transcript id occurs more than once, the last row wins.

    Returns:
        dict mapping transcript id to its MFE value.
    """
    table = pd.read_csv(filename, dtype={'transcript_id': str})
    transcript_ids = table['transcript_id'].astype(str).tolist()
    mfe_values = table['MFE'].tolist()
    return dict(zip(transcript_ids, mfe_values))


def read_fasta(filename):
    """Parse a FASTA file into a dict of header -> sequence.

    The key is the whole header line after ``>`` (surrounding whitespace
    removed). Sequence lines are stripped and concatenated without
    separators. Lines that appear before the first header are ignored.
    """
    sequences = {}
    current_id = None
    pieces = []
    with open(filename, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not raw_line.startswith('>'):
                pieces.append(text)
                continue
            # A new record starts: store the one collected so far, if any.
            if current_id is not None:
                sequences[current_id] = ''.join(pieces)
            current_id, pieces = text[1:], []
    # Store the final record, which has no following header to trigger it.
    if current_id is not None:
        sequences[current_id] = ''.join(pieces)
    return sequences


# -------------- sequence windows & 3-mer freq --------------
def slidingWindow(seq, l, win, step=1):
    """Cut ``seq`` into fragments of ``win`` characters taken every ``step``.

    Args:
        seq: sequence to cut.
        l: length of ``seq`` as supplied by the caller.
        win: window size.
        step: distance between the starts of consecutive windows.

    Returns:
        A list of fragments. When ``(l - win)`` is not a multiple of
        ``step`` the last ``win`` characters are appended as one extra
        fragment so the end of the sequence is covered. When ``win >= l``
        the list holds ``seq`` itself as its only element.
    """
    length = l
    remainder = (length - win) % step
    if win >= length:
        # Previously the bare string was returned here, giving callers a str
        # in this case and a list otherwise; always return a list.
        return [seq]
    fragments = [seq[start:start + win]
                 for start in range(0, len(seq) - win + 1, step)]
    if remainder > 0:
        fragments.append(seq[(length - win):])
    return fragments

def stat3mer(seq, l):
    """Return the relative frequency of every triplet of TRIPLETS in ``seq``.

    Overlapping triplets are counted (window 3, step 1) and each count is
    divided by ``l - 2``, the number of triplet positions.

    Args:
        seq: sequence to analyse; only triplets listed in TRIPLETS are
            reported, so it should use the same case as TRIPLETS.
        l: length of ``seq`` (expected to equal ``len(seq)``).

    Returns:
        dict with one entry per triplet in TRIPLETS. All frequencies are
        0.0 when ``l`` is below 3; such input used to raise
        ZeroDivisionError (``l == 2``) or divide by a negative count.

    Side effect:
        The module-level ``mers`` dict is overwritten with the raw counts,
        as before.
    """
    from collections import Counter

    num3mer = l - 2
    if num3mer > 0:
        # One pass over the sequence instead of one full scan per triplet.
        counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    else:
        counts = {}

    freq = {}
    for triplet in TRIPLETS:
        mers[triplet] = counts.get(triplet, 0)
        freq[triplet] = mers[triplet] / float(num3mer) if num3mer > 0 else 0.0
    return freq


# -------------- main --------------
if __name__ == '__main__':
    # Pipeline: load inputs -> collapse transcripts with identical sequence ->
    # compute per-transcript features -> average them per lncRNA -> apply the
    # logistic model -> write scores sorted from highest to lowest.

    # 1) load mappings
    lnc_to_identifiers = read_mapping(mapping_file)              # lncRNA_id -> [identifier...]
    id_to_transcripts = read_lnc_trans(lnc_trans_path)           # identifier -> [transcript_id...]
    trans_to_mfe = read_trans_mfe(MFE_path)                      # transcript_id -> MFE
    trans_seq_raw = read_fasta(seq_path)                         # transcript_id -> sequence (raw)

    # 2) transcript dedup by identical sequence. Sequences are compared after
    #    removing CR/LF characters and lower-casing; the first transcript seen
    #    with a given sequence becomes the representative of that sequence.
    seq_to_rep = {}                  # normalized_seq -> representative transcript_id
    rep_seq = {}                     # representative transcript_id -> normalized_seq
    trans_to_rep = {}                # transcript_id -> representative transcript_id

    for tid, raw in tqdm(trans_seq_raw.items(), desc="Deduplicating transcripts by sequence"):
        norm = raw.replace('\n', '').replace('\r', '').lower()
        if norm not in seq_to_rep:
            seq_to_rep[norm] = tid
            rep_seq[tid] = norm
        trans_to_rep[tid] = seq_to_rep[norm]

    # MFE per representative, built in a single pass: the representative is
    # the first member of its group, so its own MFE is recorded when present;
    # otherwise the first duplicate (in FASTA order) that has an MFE is used.
    # This replaces a full scan of trans_to_rep for every representative
    # lacking an MFE.
    rep_to_mfe = {}
    for tid, rep in trans_to_rep.items():
        if rep not in rep_to_mfe and tid in trans_to_mfe:
            rep_to_mfe[rep] = trans_to_mfe[tid]

    # 3) compute features for representative transcripts only (those that have MFE)
    LRMODEL_FEATURES_7 = ['intercept', 'length', 'mfe/L', 'cga', 'gcg', 'tcg', 'acg', 'tca']
    LRMODEL_COEFS_7 = [0.7417, 2.612e-04, 4.295, 48.66, 15.64, 76.23, -1.113, -60.29]
    LRMODEL_7 = dict(zip(LRMODEL_FEATURES_7, LRMODEL_COEFS_7))

    features_trans = {}  # rep_tid -> feature dict
    for rep_tid, norm_seq in tqdm(rep_seq.items(), desc="Computing transcript features"):
        mfe_val = rep_to_mfe.get(rep_tid)
        if mfe_val is None:
            # cannot compute without MFE
            continue

        L = len(norm_seq)
        f = {'trans': rep_tid, 'length': L, 'mfe/L': float(mfe_val) / L if L > 0 else 0.0}
        freq = stat3mer(norm_seq, L)
        for t in TRIPLETS:
            f[t] = freq[t]
        features_trans[rep_tid] = f

    # 4) aggregate to lncRNA level (dedup transcripts by sequence within each lncRNA)
    averaged_features = ['length', 'mfe/L'] + TRIPLETS
    lncRNA_GIC_score = {}
    for lnc_id, identifiers in tqdm(lnc_to_identifiers.items(), desc="Aggregating to lncRNA"):
        # collect all transcripts from all identifiers
        all_transcripts = []
        for ident in identifiers:
            all_transcripts.extend(id_to_transcripts.get(ident, []))

        if not all_transcripts:
            continue

        # map to representative (sequence dedup) and keep unique reps
        rep_set = {trans_to_rep.get(tid, None) for tid in all_transcripts}
        rep_set.discard(None)

        # keep only reps with computed features (i.e., have sequence + MFE).
        # Sorted so the floating-point sums below are accumulated in the same
        # order on every run; plain set iteration order varies between runs.
        usable_reps = sorted(r for r in rep_set if r in features_trans)
        if not usable_reps:
            continue

        # average features across usable representative transcripts
        agg = {name: 0.0 for name in averaged_features}
        for rep_tid in usable_reps:
            trf = features_trans[rep_tid]
            for name in averaged_features:
                agg[name] += trf[name]

        n = float(len(usable_reps))
        for name in averaged_features:
            agg[name] /= n

        # logistic regression (7-feature model); terms are added in the order
        # of LRMODEL_FEATURES_7, starting from the intercept
        tmp = LRMODEL_7['intercept']
        for name in LRMODEL_FEATURES_7[1:]:
            tmp += LRMODEL_7[name] * agg[name]
        try:
            odds = math.exp(tmp)
            gic = odds / (odds + 1.0)
        except OverflowError:
            # exp() overflows for very large tmp; the logistic value is 1 there
            gic = 1.0
        lncRNA_GIC_score[lnc_id] = gic

    # 5) export sorted scores (highest first)
    lncRNA_scores_df_sorted = (pd.Series(lncRNA_GIC_score, name='GIC_score')
                                 .rename_axis('lncRNA_id')
                                 .reset_index()
                                 .sort_values('GIC_score', ascending=False))
    # Remove rows whose score is NaN, as the human cell does. sort_values puts
    # NaN last, and the benchmark cell later takes the LAST rows of this file
    # as non-essential samples, so NaN rows would otherwise be selected.
    lncRNA_scores_df_sorted = lncRNA_scores_df_sorted.dropna(subset=["GIC_score"])
    lncRNA_scores_df_sorted.to_csv(sorted_GIC_path, index=False)

    print("GIC scores have been calculated and saved successfully.")
    print(f"Output file is located at: {sorted_GIC_path}")


Deduplicating transcripts by sequence: 100%|██████████| 56380/56380 [00:00<00:00, 362299.75it/s]
Computing transcript features: 100%|██████████| 52895/52895 [01:07<00:00, 782.14it/s] 
Aggregating to lncRNA: 100%|██████████| 29025/29025 [00:00<00:00, 42749.39it/s]


GIC scores have been calculated and saved successfully.
Output file is located at: ./mouse/sorted_GIC_score.csv


### Get essential lncRNA samples and non-essential lncRNA samples

In [3]:
import pandas as pd

# Load the raw essential-lncRNA table and keep only the columns used later on.
esslnc = pd.read_csv("../../data/raw/esslnc.csv")
esslnc = esslnc.loc[:, [
    "Noncode_id", "target", "gene_name", "ensembl_id", "Organism",
    "cancer_related", "disease_related", "vivo", "vitro",
]]

# One table per species, selected on the exact value of the Organism column.
human_esslnc = esslnc.loc[esslnc['Organism'].eq('Human')]
mouse_esslnc = esslnc.loc[esslnc['Organism'].eq('Mouse')]

# Write each species table to its benchmark folder, without the index column.
human_esslnc.to_csv('../../data/benchmark/human/human_esslnc.csv', index=False)
mouse_esslnc.to_csv('../../data/benchmark/mouse/mouse_esslnc.csv', index=False)

In [4]:
# Human
import pandas as pd

# Build the human benchmark: essential lncRNA nodes (positives) and an equally
# sized set of the lowest-scoring unlabelled nodes (negatives).

# Read the lncRNA data
ess_lnc = pd.read_csv("../../data/benchmark/human/human_esslnc.csv")
lncRNA = pd.read_csv("../../data/LPI/human/lncRNA.csv")
lncRNA_mapping = pd.read_csv("../../data/LPI/human/lncRNA_mapping.csv")
annotation = pd.read_csv("../../annotate/human/valid_heart_annotation.csv")

# Keep only the part of each Noncode_id before the first '.'
ess_lnc['Noncode_id'] = ess_lnc['Noncode_id'].str.split('.').str[0]

# Lookup sets built once from the essential database. Missing values are
# dropped so that a missing id/name can never match a missing database cell
# (Series.isin treats NaN as equal to NaN). Taking the snapshot here also
# keeps is_essential usable after `ess_lnc` is rebound further down.
ess_id_set = set(ess_lnc['Noncode_id'].dropna()) | set(ess_lnc['ensembl_id'].dropna())
ess_name_set = set(ess_lnc['gene_name'].dropna()) | set(ess_lnc['target'].dropna())


def is_essential(id, name):
    """Return 1 when ``id`` or ``name`` occurs in the essential database, else 0.

    ``id`` is looked up among the Noncode_id and ensembl_id values, ``name``
    among the gene_name and target values. Missing values and the '-'
    placeholder never match.
    """
    # NOTE: the parameter name `id` shadows the builtin; kept for compatibility.
    if pd.notna(id) and id != '-' and id in ess_id_set:
        return 1
    if pd.notna(name) and name != '-' and name in ess_name_set:
        return 1
    return 0


lncRNA['essential'] = [is_essential(gene_id, gene_name)
                       for gene_id, gene_name in zip(lncRNA['gene_id'], lncRNA['gene_name'])]
ess_lnc = lncRNA[lncRNA['essential'] == 1]

# Map essential members to their lncRNA_id and keep only annotated ones.
ess_lnc_id = lncRNA_mapping[lncRNA_mapping['member_id'].isin(ess_lnc['identifier'])].copy()
anno_ess_lnc = ess_lnc_id[ess_lnc_id['lncRNA_id'].isin(annotation['lncRNA_id'])]
anno_ess_lnc = anno_ess_lnc[['lncRNA_id']].drop_duplicates()
anno_ess_lnc.to_csv('../../data/benchmark/human/ess_lnc.csv', index=False)

lnc_GIC = pd.read_csv("./human/sorted_GIC_score.csv")

# Candidates for negatives: annotated lncRNAs that are not labelled essential.
unlabel_lnc = lnc_GIC[~lnc_GIC['lncRNA_id'].isin(anno_ess_lnc['lncRNA_id'])]
unlabel_lnc = unlabel_lnc[unlabel_lnc['lncRNA_id'].isin(annotation['lncRNA_id'])]
# tail() below must see valid scores ordered from highest to lowest. Enforce
# that here rather than relying on the file; the stable sort leaves an
# already sorted file in its original order.
unlabel_lnc = (unlabel_lnc.dropna(subset=['GIC_score'])
                          .sort_values('GIC_score', ascending=False, kind='mergesort'))

ess_counts = anno_ess_lnc.shape[0]

# As many lowest-scoring candidates as there are essential lncRNAs.
noness_lnc = unlabel_lnc.tail(ess_counts)[['lncRNA_id']].reset_index(drop=True)

noness_lnc.to_csv("../../data/benchmark/human/noness_lnc.csv", index=False)

In [11]:
# Mouse
import pandas as pd

# Build the mouse benchmark: essential lncRNA nodes (positives) and an equally
# sized set of the lowest-scoring unlabelled nodes (negatives).

# Read the lncRNA data
ess_lnc = pd.read_csv("../../data/benchmark/mouse/mouse_esslnc.csv")
lncRNA = pd.read_csv("../../data/LPI/mouse/lncRNA.csv")
lncRNA_mapping = pd.read_csv("../../data/LPI/mouse/lncRNA_mapping.csv")
annotation = pd.read_csv("../../annotate/mouse/valid_heart_annotation.csv")

# Keep only the part of each Noncode_id before the first '.'
ess_lnc['Noncode_id'] = ess_lnc['Noncode_id'].str.split('.').str[0]

# Lookup sets built once from the essential database. Missing values are
# dropped so that a missing id/name can never match a missing database cell
# (Series.isin treats NaN as equal to NaN). Taking the snapshot here also
# keeps is_essential usable after `ess_lnc` is rebound further down.
ess_id_set = set(ess_lnc['Noncode_id'].dropna()) | set(ess_lnc['ensembl_id'].dropna())
ess_name_set = set(ess_lnc['gene_name'].dropna()) | set(ess_lnc['target'].dropna())


def is_essential(id, name):
    """Return 1 when ``id`` or ``name`` occurs in the essential database, else 0.

    ``id`` is looked up among the Noncode_id and ensembl_id values, ``name``
    among the gene_name and target values. Missing values and the '-'
    placeholder never match.
    """
    # NOTE: the parameter name `id` shadows the builtin; kept for compatibility.
    if pd.notna(id) and id != '-' and id in ess_id_set:
        return 1
    if pd.notna(name) and name != '-' and name in ess_name_set:
        return 1
    return 0


lncRNA['essential'] = [is_essential(gene_id, gene_name)
                       for gene_id, gene_name in zip(lncRNA['gene_id'], lncRNA['gene_name'])]
ess_lnc = lncRNA[lncRNA['essential'] == 1]

# Map essential members to their lncRNA_id and keep only annotated ones.
ess_lnc_id = lncRNA_mapping[lncRNA_mapping['member_id'].isin(ess_lnc['identifier'])].copy()
anno_ess_lnc = ess_lnc_id[ess_lnc_id['lncRNA_id'].isin(annotation['lncRNA_id'])]
anno_ess_lnc = anno_ess_lnc[['lncRNA_id']].drop_duplicates()
anno_ess_lnc.to_csv('../../data/benchmark/mouse/ess_lnc.csv', index=False)

lnc_GIC = pd.read_csv("./mouse/sorted_GIC_score.csv")

# Candidates for negatives: annotated lncRNAs that are not labelled essential.
unlabel_lnc = lnc_GIC[~lnc_GIC['lncRNA_id'].isin(anno_ess_lnc['lncRNA_id'])]
unlabel_lnc = unlabel_lnc[unlabel_lnc['lncRNA_id'].isin(annotation['lncRNA_id'])]
# tail() below must see valid scores ordered from highest to lowest. Rows
# with a missing score would sit at the end of the file and be picked as
# negatives, so drop them; the stable sort leaves an already sorted file in
# its original order.
unlabel_lnc = (unlabel_lnc.dropna(subset=['GIC_score'])
                          .sort_values('GIC_score', ascending=False, kind='mergesort'))

ess_counts = anno_ess_lnc.shape[0]

# As many lowest-scoring candidates as there are essential lncRNAs.
noness_lnc = unlabel_lnc.tail(ess_counts)[['lncRNA_id']].reset_index(drop=True)

noness_lnc.to_csv("../../data/benchmark/mouse/noness_lnc.csv", index=False)

In [6]:
import pandas as pd

# ==========================================
# 1. Data Loading
# ==========================================
print("Loading data...")
# Essential-lncRNA records used as the reference for matching
ess_lnc_db = pd.read_csv("../../data/benchmark/human/human_esslnc.csv")

# Per-member lncRNA records (gene_id and gene_name are read from this table)
lncRNA = pd.read_csv("../../data/LPI/human/lncRNA.csv")

# member_id -> lncRNA_id (unified node) pairs
lncRNA_mapping = pd.read_csv("../../data/LPI/human/lncRNA_mapping.csv")

annotation = pd.read_csv("../../annotate/human/valid_heart_annotation.csv")

# Tab-separated BED file without a header row; column names are assigned here
bed = pd.read_csv("../../data/LPI/human/lncRNA_dedup.bed", sep='\t', header=None)
bed.columns = ['chr', 'start', 'end', 'lncRNA_id', 'score', 'strand']

# ==========================================
# 2. Build Hash Set for Fast Lookup
# ==========================================
# Keep only the part of each Noncode_id before the first '.'
ess_lnc_db['Noncode_id'] = ess_lnc_db['Noncode_id'].str.split('.').str.get(0)

# Identifier-type keys (Noncode and Ensembl ids), missing values excluded
valid_ess_ids = set(ess_lnc_db['Noncode_id'].dropna()).union(
    ess_lnc_db['ensembl_id'].dropna())

# Name-type keys (gene names and targets), missing values excluded
valid_ess_names = set(ess_lnc_db['gene_name'].dropna()).union(
    ess_lnc_db['target'].dropna())

# ==========================================
# 3. Identify Evidence (Member Level)
# ==========================================
print("Matching identifiers...")

# Function to return the specific matching ID as evidence
def get_evidence(row):
    """Return the value of ``row`` that matches the essential database.

    ``row['gene_id']`` is checked against ``valid_ess_ids`` first, then
    ``row['gene_name']`` against ``valid_ess_names``. The '-' placeholder is
    never accepted as a match, consistent with ``is_essential`` and
    ``join_unique`` in this file, so a '-' present in the database cannot
    mark unrelated records as essential.

    Returns:
        The matching gene_id or gene_name, or None when neither matches.
    """
    gene_id = row['gene_id']
    if gene_id != '-' and gene_id in valid_ess_ids:
        return gene_id
    gene_name = row['gene_name']
    if gene_name != '-' and gene_name in valid_ess_names:
        return gene_name
    return None

# Attach the unified node id to every member record; members without a
# mapping entry are dropped (inner join on identifier == member_id).
merged_df = pd.merge(
    lncRNA,
    lncRNA_mapping,
    how='inner',
    left_on='identifier',
    right_on='member_id',
)

# Evidence per member row: the matching id/name, or None when nothing matches
merged_df['match_evidence'] = merged_df.apply(get_evidence, axis=1)

# ==========================================
# 4. Aggregation (Node Level)
# ==========================================
print("Aggregating nodes...")

# Helper function: Remove duplicates, sort, join with semicolons, and exclude empty values
def join_unique(series):
    """Join the distinct values of ``series`` with ';' in sorted order.

    Missing values are skipped, every value is converted to ``str``, and the
    '-' placeholder is left out. Returns '' when nothing remains.
    """
    distinct = set()
    for value in series.dropna():
        text = str(value)
        if text != '-':
            distinct.add(text)
    return ';'.join(sorted(distinct))

def join_evidence(series):
    """Join the distinct non-missing values of ``series`` with ';', sorted.

    Values are converted to ``str`` first. Returns '' when every value is
    missing.
    """
    present = series.dropna()
    return ';'.join(sorted({str(item) for item in present}))

# Collapse member rows into one row per unified node (lncRNA_id)
grouped_nodes = (
    merged_df
    .groupby('lncRNA_id')
    .agg(
        gene_id=('gene_id', join_unique),                 # all member ids
        gene_name=('gene_name', join_unique),             # all member symbols
        match_evidence=('match_evidence', join_evidence)  # all matched evidence values
    )
    .reset_index()
)

# Essential nodes are those with at least one piece of evidence ...
has_evidence = grouped_nodes['match_evidence'].ne('')
ess_nodes_final = grouped_nodes.loc[has_evidence].copy()
# ... that also appear in the annotation table
is_annotated = ess_nodes_final['lncRNA_id'].isin(annotation['lncRNA_id'])
ess_nodes_final = ess_nodes_final.loc[is_annotated]

# ==========================================
# 5. Format Final Table
# ==========================================
print("Formatting output table...")

# Build a "chr:start-end:strand" label for every row of the BED table
# (uses the chr, start, end and strand columns assigned to `bed` above)
bed['Genomic_Coordinates'] = [
    f"{chrom}:{begin}-{stop}:{strand}"
    for chrom, begin, stop, strand in zip(bed['chr'], bed['start'], bed['end'], bed['strand'])
]

# Add the coordinate label to each essential node; nodes absent from the BED
# table are dropped by the inner join
final_table = pd.merge(
    ess_nodes_final,
    bed[['lncRNA_id', 'Genomic_Coordinates']],
    on='lncRNA_id',
    how='inner',
)

# Column titles used in the supplementary table
final_table = final_table.rename(columns={
    'lncRNA_id': 'Unified_Node_ID',
    'gene_id': 'Constituent_Gene_IDs',
    'gene_name': 'Constituent_Gene_Symbols',
    'match_evidence': 'Matched_Evidence_ID',
})

# Final column order
final_table = final_table[[
    'Unified_Node_ID',
    'Genomic_Coordinates',
    'Constituent_Gene_IDs',
    'Constituent_Gene_Symbols',
    'Matched_Evidence_ID',
]]

# ==========================================
# 6. Export to Excel
# ==========================================
output_path = "../../data/benchmark/human/Supplementary_Table_S8.xlsx"
final_table.to_excel(output_path, index=False)

print(f"Success! Generated {output_path}")
print(f"Total Essential Nodes Identified: {len(final_table)}")

Loading data...
Matching identifiers...
Aggregating nodes...
Formatting output table...
Success! Generated ../../data/benchmark/human/Supplementary_Table_S8.xlsx
Total Essential Nodes Identified: 1372


In [7]:
import pandas as pd

# ==========================================
# 1. Data Loading
# ==========================================
print("Loading data...")
# Essential-lncRNA records used as the reference for matching
ess_lnc_db = pd.read_csv("../../data/benchmark/mouse/mouse_esslnc.csv")

# Per-member lncRNA records (gene_id and gene_name are read from this table)
lncRNA = pd.read_csv("../../data/LPI/mouse/lncRNA.csv")

# member_id -> lncRNA_id (unified node) pairs
lncRNA_mapping = pd.read_csv("../../data/LPI/mouse/lncRNA_mapping.csv")

annotation = pd.read_csv("../../annotate/mouse/valid_heart_annotation.csv")

# Tab-separated BED file without a header row; column names are assigned here
bed = pd.read_csv("../../data/LPI/mouse/lncRNA_dedup.bed", sep='\t', header=None)
bed.columns = ['chr', 'start', 'end', 'lncRNA_id', 'score', 'strand']

# ==========================================
# 2. Build Hash Set for Fast Lookup
# ==========================================
# Keep only the part of each Noncode_id before the first '.'
ess_lnc_db['Noncode_id'] = ess_lnc_db['Noncode_id'].str.split('.').str.get(0)

# Identifier-type keys (Noncode and Ensembl ids), missing values excluded
valid_ess_ids = set(ess_lnc_db['Noncode_id'].dropna()).union(
    ess_lnc_db['ensembl_id'].dropna())

# Name-type keys (gene names and targets), missing values excluded
valid_ess_names = set(ess_lnc_db['gene_name'].dropna()).union(
    ess_lnc_db['target'].dropna())

# ==========================================
# 3. Identify Evidence (Member Level)
# ==========================================
print("Matching identifiers...")

# Function to return the specific matching ID as evidence
def get_evidence(row):
    """Return the value of ``row`` that matches the essential database.

    ``row['gene_id']`` is checked against ``valid_ess_ids`` first, then
    ``row['gene_name']`` against ``valid_ess_names``. The '-' placeholder is
    never accepted as a match, consistent with ``is_essential`` and
    ``join_unique`` in this file, so a '-' present in the database cannot
    mark unrelated records as essential.

    Returns:
        The matching gene_id or gene_name, or None when neither matches.
    """
    gene_id = row['gene_id']
    if gene_id != '-' and gene_id in valid_ess_ids:
        return gene_id
    gene_name = row['gene_name']
    if gene_name != '-' and gene_name in valid_ess_names:
        return gene_name
    return None

# Attach the unified node id to every member record; members without a
# mapping entry are dropped (inner join on identifier == member_id).
merged_df = pd.merge(
    lncRNA,
    lncRNA_mapping,
    how='inner',
    left_on='identifier',
    right_on='member_id',
)

# Evidence per member row: the matching id/name, or None when nothing matches
merged_df['match_evidence'] = merged_df.apply(get_evidence, axis=1)

# ==========================================
# 4. Aggregation (Node Level)
# ==========================================
print("Aggregating nodes...")

# Helper function: Remove duplicates, sort, join with semicolons, and exclude empty values
def join_unique(series):
    """Join the distinct values of ``series`` with ';' in sorted order.

    Missing values are skipped, every value is converted to ``str``, and the
    '-' placeholder is left out. Returns '' when nothing remains.
    """
    distinct = set()
    for value in series.dropna():
        text = str(value)
        if text != '-':
            distinct.add(text)
    return ';'.join(sorted(distinct))

def join_evidence(series):
    """Join the distinct non-missing values of ``series`` with ';', sorted.

    Values are converted to ``str`` first. Returns '' when every value is
    missing.
    """
    present = series.dropna()
    return ';'.join(sorted({str(item) for item in present}))

# Collapse member rows into one row per unified node (lncRNA_id)
grouped_nodes = (
    merged_df
    .groupby('lncRNA_id')
    .agg(
        gene_id=('gene_id', join_unique),                 # all member ids
        gene_name=('gene_name', join_unique),             # all member symbols
        match_evidence=('match_evidence', join_evidence)  # all matched evidence values
    )
    .reset_index()
)

# Essential nodes are those with at least one piece of evidence ...
has_evidence = grouped_nodes['match_evidence'].ne('')
ess_nodes_final = grouped_nodes.loc[has_evidence].copy()
# ... that also appear in the annotation table
is_annotated = ess_nodes_final['lncRNA_id'].isin(annotation['lncRNA_id'])
ess_nodes_final = ess_nodes_final.loc[is_annotated]

# ==========================================
# 5. Format Final Table
# ==========================================
print("Formatting output table...")

# Build a "chr:start-end:strand" label for every row of the BED table
# (uses the chr, start, end and strand columns assigned to `bed` above)
bed['Genomic_Coordinates'] = [
    f"{chrom}:{begin}-{stop}:{strand}"
    for chrom, begin, stop, strand in zip(bed['chr'], bed['start'], bed['end'], bed['strand'])
]

# Add the coordinate label to each essential node; nodes absent from the BED
# table are dropped by the inner join
final_table = pd.merge(
    ess_nodes_final,
    bed[['lncRNA_id', 'Genomic_Coordinates']],
    on='lncRNA_id',
    how='inner',
)

# Column titles used in the supplementary table
final_table = final_table.rename(columns={
    'lncRNA_id': 'Unified_Node_ID',
    'gene_id': 'Constituent_Gene_IDs',
    'gene_name': 'Constituent_Gene_Symbols',
    'match_evidence': 'Matched_Evidence_ID',
})

# Final column order
final_table = final_table[[
    'Unified_Node_ID',
    'Genomic_Coordinates',
    'Constituent_Gene_IDs',
    'Constituent_Gene_Symbols',
    'Matched_Evidence_ID',
]]

# ==========================================
# 6. Export to Excel
# ==========================================
output_path = "../../data/benchmark/mouse/Supplementary_Table_S9.xlsx"
final_table.to_excel(output_path, index=False)

print(f"Success! Generated {output_path}")
print(f"Total Essential Nodes Identified: {len(final_table)}")

Loading data...
Matching identifiers...
Aggregating nodes...
Formatting output table...
Success! Generated ../../data/benchmark/mouse/Supplementary_Table_S9.xlsx
Total Essential Nodes Identified: 24
