## Obtaining essential lncRNA genes in different tissues.

In [None]:
import pandas as pd

# Define tissues and model names
human_tissues = ['heart','lung','stomach']
mouse_tissues = ['heart','lung','brain']
model_names = ['SVM', 'MLP']

# The lncRNA annotation table does not depend on the tissue, so it is loaded
# once instead of being re-read on every loop iteration.
lnc = pd.read_csv("../../data/LPI/human/lncRNA.csv")
lnc = lnc[['lncRNA_ID', 'gene_id', 'symbol','chr','start','end','strand']]

for tissue in human_tissues:
    # One annotated table of predicted-essential lncRNAs per model
    all_predictions_ess = []

    for model in model_names:
        # Every column is read as a string, so the label is compared to '1'
        prediction = pd.read_csv(f"../../results/human/{model}_predictions_{tissue}.csv", dtype='str')

        # Keep only the IDs of lncRNAs predicted as essential (Pre_Label == '1')
        prediction_ess = prediction.loc[prediction['Pre_Label'] == '1', ['lncRNA_ID']]

        # Attach the annotation columns and save the per-model result
        prediction_ess = pd.merge(prediction_ess, lnc, on='lncRNA_ID', how="inner")
        prediction_ess.to_csv(f"../../results/human/{model}_{tissue}_ess.csv", index=False)

        all_predictions_ess.append(prediction_ess)

    # Intersection: keep the rows of the first model whose lncRNA_ID was also
    # predicted essential by every other model. The previous code called
    # DataFrame.merge without assigning its result, so the saved file was just
    # the first model's table. Filtering with isin (rather than merging) also
    # avoids duplicated, suffixed annotation columns, because every per-model
    # table carries the same annotation columns.
    intersection_ess = all_predictions_ess[0]
    for model_ess in all_predictions_ess[1:]:
        shared = intersection_ess['lncRNA_ID'].isin(model_ess['lncRNA_ID'])
        intersection_ess = intersection_ess[shared]

    # Save the result to a CSV file
    intersection_ess.to_csv(f"../../results/human/{tissue}_essential_genes.csv", index=False)

    print(f"Intersection essential genes for {tissue} saved successfully.")


Intersection essential genes for heart saved successfully.
Intersection essential genes for lung saved successfully.
Intersection essential genes for stomach saved successfully.


## Union

In [13]:
import pandas as pd

# Define tissues
mouse_tissues = ['heart', 'lung', 'brain']
human_tissues = ['heart', 'lung', 'stomach']

# Read every tissue's essential-gene table first, then concatenate once.
# Growing a frame with concat inside a loop re-copies the accumulated rows on
# each iteration, and starting from an empty pd.DataFrame() relies on concat
# behaviour for empty entries that pandas has deprecated.
tissue_frames = [
    pd.read_csv(f"../../results/mouse/{tissue}_essential_genes.csv")
    for tissue in mouse_tissues
]

# Union of all tissues: stack the tables and drop rows that are identical in
# every column.
combined_df = pd.concat(tissue_frames, ignore_index=True).drop_duplicates()

# Save the union result to CSV
combined_df.to_csv("../../results/mouse/mouse_essential_genes_union.csv", index=False)


## Convert the union CSV to BED format (CSV2bed)

In [14]:
import pandas as pd

# Function to convert CSV to valid 6-column BED format
def convert_csv_to_bed(csv_file_path, bed_file_path):
    """Write the rows of a CSV annotation table as a 6-column BED file.

    The output columns are, in order: chr, start, end, lncRNA_ID (BED name),
    a constant score of 0, and strand. The file is tab-separated and has
    neither a header row nor an index column. Start and end are cast to int
    and otherwise written unchanged (no coordinate shift is applied).

    Parameters:
    csv_file_path (str): Path of the CSV file to read.
    bed_file_path (str): Path of the BED file to write.

    Raises:
    ValueError: If any of 'chr', 'start', 'end', 'lncRNA_ID' or 'strand'
        is not a column of the CSV file.
    """
    table = pd.read_csv(csv_file_path)

    # Refuse to continue unless every column needed for the BED file exists
    needed = {'chr', 'start', 'end', 'lncRNA_ID', 'strand'}
    if not needed.issubset(table.columns):
        raise ValueError("Missing required columns in CSV.")

    # Dict insertion order fixes the BED column order
    bed = pd.DataFrame({
        'chr': table['chr'],
        'start': table['start'].astype(int),
        'end': table['end'].astype(int),
        'name': table['lncRNA_ID'],
        'score': 0,
        'strand': table['strand'],
    })

    bed.to_csv(bed_file_path, sep='\t', header=False, index=False)

# Example usage: convert the mouse union table (read from the results
# directory) into a BED file. The output path has no directory component, so
# the BED file is written to the current working directory.
csv_file_path = '../../results/mouse/mouse_essential_genes_union.csv'
bed_file_path = 'mouse_essential_genes_union.bed'
convert_csv_to_bed(csv_file_path, bed_file_path)


### Run get_overlap.sh to find duplicate essential lncRNA genes.

In [15]:
import pandas as pd
from collections import defaultdict

# Input file paths
csv_file = '../../results/mouse/mouse_essential_genes_union.csv'
overlap_file = 'mouse_overlapping_genes.txt'
output_file = 'deduplicated_mouse_essential_genes.csv'

# Load the original annotation table
df = pd.read_csv(csv_file)
df['length'] = df['end'] - df['start']  # Calculate gene length for selecting representatives

# Load the overlapping gene pairs (fully overlapping based on BEDTools results).
# The file has no header; its two whitespace-separated columns are named A and B.
# The separator is a raw string: '\s' in a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
merge_pairs = pd.read_csv(overlap_file, sep=r'\s+', header=None, names=['A', 'B'])

# === Build union-find structure (disjoint set) to group overlapping genes ===
# Maps each node to its parent; a root is a node that is its own parent.
parent = {}

def find(x):
    """Return the root (group leader) of the set containing x.

    Unknown nodes are registered as their own root on first use. After the
    root is found, every node on the path from x to the root is re-pointed
    directly at the root (path compression).

    The walk is iterative: union() attaches roots without any rank
    heuristic, so parent chains can grow as long as the number of pairs,
    and a recursive walk over such a chain exceeds Python's recursion limit.
    """
    parent.setdefault(x, x)

    # First pass: follow parent links up to the root
    root = x
    while parent[root] != root:
        root = parent[root]

    # Second pass: point every node on the walked path straight at the root
    while parent[x] != root:
        next_node = parent[x]
        parent[x] = root
        x = next_node

    return root

def union(x, y):
    """Merge the sets containing x and y; the root of x's set becomes the leader."""
    root_x = find(x)
    parent[find(y)] = root_x

# Apply union for all overlapping pairs
for a, b in zip(merge_pairs['A'], merge_pairs['B']):
    union(a, b)

# Group all genes by their leader node in the union-find structure
groups = defaultdict(set)
for gene in set(merge_pairs['A']).union(merge_pairs['B']):
    groups[find(gene)].add(gene)

# === Determine representative gene per group: longest one ===
merge_map = {}  # representative lncRNA_ID → sorted list of merged lncRNA_IDs
for group in groups.values():
    group_df = df[df['lncRNA_ID'].isin(group)]
    if group_df.empty:
        # None of these IDs is in the annotation table, so there is no row to
        # pick as representative (idxmax would raise on an empty selection).
        print(f"Warning: overlap group not found in {csv_file}, skipped: {sorted(group)}")
        continue
    rep_id = group_df.loc[group_df['length'].idxmax(), 'lncRNA_ID']  # select longest gene
    # Sorted so that the Merged_IDs column is identical from run to run
    # (iteration order of a set of strings is not stable across interpreter runs).
    merge_map[rep_id] = sorted(group - {rep_id})

# === Build final output ===
# Every group member that is not the representative is dropped. Using only
# column B of the pair file for this is not enough: a shorter gene that only
# ever appears in column A would stay in the output next to its representative.
all_rep_ids = set(merge_map)
merged_ids = {gene for members in merge_map.values() for gene in members}
retained_ids = set(df['lncRNA_ID']) - merged_ids
final_ids = retained_ids.union(all_rep_ids)

# Filter the dataframe
df_merged = df[df['lncRNA_ID'].isin(final_ids)].copy()

# Add a column showing which IDs were merged into each representative
# (empty string for genes that are not a group representative)
df_merged['Merged_IDs'] = df_merged['lncRNA_ID'].apply(lambda x: ';'.join(merge_map.get(x, [])))

# Drop the temporary length column
df_merged = df_merged.drop(columns='length')

# Save the result
df_merged.to_csv(output_file, index=False)
print(f"✅ Merge completed using longest gene per group. Output saved to: {output_file}")


✅ Merge completed using longest gene per group. Output saved to: deduplicated_mouse_essential_genes.csv


## Obtaining essential lncRNA genes in different tissues.

In [18]:
import pandas as pd

# === Step 1: Rebuild the representative → merged-IDs mapping ===
# The deduplicated table is expected to provide the columns lncRNA_ID and
# Merged_IDs (a ';'-separated list, empty/missing when nothing was merged).
merged_df = pd.read_csv("deduplicated_mouse_essential_genes.csv")

# Only rows with a non-blank Merged_IDs value become representatives
merge_map = {
    row['lncRNA_ID']: row['Merged_IDs'].split(';')
    for _, row in merged_df.iterrows()
    if pd.notna(row.get('Merged_IDs')) and row['Merged_IDs'].strip()
}

# Reverse lookup: every group member (the representative included) → representative
reverse_map = {}
for rep, others in merge_map.items():
    for member in (rep, *others):
        reverse_map[member] = rep

# === Step 2: Load the gene list of one tissue ===
# Point this at the tissue file you want to process
tissue_df = pd.read_csv("../../results/mouse/brain_essential_genes.csv")
lnc_ids = tissue_df['lncRNA_ID']

# === Step 3: Swap each ID for its representative (IDs without a group stay as they are) ===
representative_ids = lnc_ids.apply(lambda gene: reverse_map.get(gene, gene))

# === Step 4: Keep each representative once and write a header-less one-column file ===
unique_reps = representative_ids.drop_duplicates().to_frame(name='lncRNA_ID')
unique_reps.to_csv("deduplicated_mouse_brain_essential_genes.csv", index=False, header=None)


------ Transcript-level mapping (abandoned) ------

In [29]:
import pandas as pd
import os
import re
from tqdm import tqdm

# Locations of the reference annotations: a directory of Ensembl GTF releases
# plus the LncBook and NONCODE (v5, v6) lncRNA GTF files
ENSEMBL_GTF_DIR = "../../data/reference_lncRNA/human/gtf/ensembl/"
LNCBOOK_GTF = "../../data/reference_lncRNA/human/gtf/lncRNA_LncBookv2.0_GRCh38.gtf"
NONCODE_V5_GTF = "../../data/reference_lncRNA/human/gtf/NONCODEv5_human_hg38_lncRNA.gtf"
NONCODE_V6_GTF = "../../data/reference_lncRNA/human/gtf/NONCODEv6_human_hg38_lncRNA.gtf"

# Essential lncRNAs of the tissue; used to build two lookup tables that both
# resolve to the lncRNA_ID: one keyed by gene_id, one keyed by symbol
lnc_df = pd.read_csv('../../results/human/heart_essential_genes.csv')
gene_id_to_lnc = {}
symbol_to_lnc = {}
for _, record in lnc_df.iterrows():
    gene_id_to_lnc[record['gene_id']] = record['lncRNA_ID']
    symbol_to_lnc[record['symbol']] = record['lncRNA_ID']

def process_gtf(input_path, fout, gene_id_to_lnc, symbol_to_lnc, use_symbol=False, found_ids=None):
    """Copy the transcript/exon records of mapped genes from one GTF file to fout.

    Each 'transcript' or 'exon' line whose gene can be mapped to a lncRNA_ID
    is written to fout with its gene_id attribute replaced by that lncRNA_ID.
    A gene is mapped through its gene_id (text after the first '.' is
    dropped) and, when that fails and use_symbol is true, through its
    gene_name. Previously the gene_name lookup was only attempted for lines
    without any gene_id attribute, so it was skipped for every line that
    carried an unmapped gene_id.

    Parameters:
    input_path (str): Path of the GTF file to scan.
    fout: Writable text file object that receives the matching lines.
    gene_id_to_lnc (dict): gene_id → lncRNA_ID.
    symbol_to_lnc (dict): gene symbol → lncRNA_ID.
    use_symbol (bool): Whether to fall back to the gene_name attribute.
    found_ids (set, optional): Set that collects every lncRNA_ID written so
        far; it is updated in place. When omitted, the notebook-level set
        all_found_lncRNA_IDs is used, as before.

    Returns:
    tuple: (gene_id_to_lnc, symbol_to_lnc) restricted to the entries whose
    lncRNA_ID is not in found_ids.
    """
    if found_ids is None:
        # Backward-compatible default: the accumulator shared by all calls in the notebook
        found_ids = all_found_lncRNA_IDs

    with open(input_path) as f:
        for line in tqdm(f, desc=f"Parsing {os.path.basename(input_path)}", unit=" lines"):
            if line.startswith('#') or not line.strip():
                continue
            segments = line.strip().split('\t')
            if len(segments) < 9 or segments[2] not in {'transcript', 'exon'}:
                continue
            attributes = segments[8]

            lncRNA_id = None
            gene_id_match = re.search(r'gene_id "([^"]+)"', attributes)
            if gene_id_match:
                gene_id = gene_id_match.group(1).split('.')[0]  # Remove version number if present
                lncRNA_id = gene_id_to_lnc.get(gene_id)

            # Symbol fallback: tried whenever the gene_id did not resolve
            if lncRNA_id is None and use_symbol:
                gene_name_match = re.search(r'gene_name "([^"]+)"', attributes)
                if gene_name_match:
                    lncRNA_id = symbol_to_lnc.get(gene_name_match.group(1))

            if lncRNA_id:
                found_ids.add(lncRNA_id)
                segments[8] = re.sub(r'gene_id "[^"]+"', f'gene_id "{lncRNA_id}"', segments[8])
                fout.write('\t'.join(segments) + '\n')

    # Hand back only the entries that still need to be looked up elsewhere
    return {k: v for k, v in gene_id_to_lnc.items() if v not in found_ids}, \
           {k: v for k, v in symbol_to_lnc.items() if v not in found_ids}

# Accumulates every lncRNA_ID for which at least one GTF record was written
all_found_lncRNA_IDs = set()

# Work on copies of the lookup tables. process_gtf returns the entries that
# are still unresolved; rebinding the notebook-level dictionaries to that
# shrinking remainder made a second run of this cell start from the leftovers
# while all_found_lncRNA_IDs was reset, producing a wrong GTF and missing list.
remaining_gene_ids = dict(gene_id_to_lnc)
remaining_symbols = dict(symbol_to_lnc)

ensembl_release = re.compile(r'GRCh38\.(\d+)\.gtf')

with open("human_heart.gtf", 'w') as fout:
    # Process specific GTF files
    for gtf_path in [LNCBOOK_GTF, NONCODE_V6_GTF, NONCODE_V5_GTF]:
        remaining_gene_ids, remaining_symbols = process_gtf(
            gtf_path, fout, remaining_gene_ids, remaining_symbols, use_symbol=False
        )

    # Collect the Ensembl files together with their release number. Files
    # whose name carries no release number are skipped instead of crashing
    # the sort with an AttributeError on a failed regex match.
    versioned_files = []
    for file_name in os.listdir(ENSEMBL_GTF_DIR):
        if not file_name.endswith('.gtf'):
            continue
        release_match = ensembl_release.search(file_name)
        if release_match is None:
            print(f"Skipping {file_name}: no Ensembl release number in the file name.")
            continue
        versioned_files.append((int(release_match.group(1)), file_name))

    # Process Ensembl files in version order (newest release first)
    ensembl_files = [
        file_name
        for _, file_name in sorted(versioned_files, key=lambda item: item[0], reverse=True)
    ]
    for ef in ensembl_files:
        remaining_gene_ids, remaining_symbols = process_gtf(
            os.path.join(ENSEMBL_GTF_DIR, ef), fout, remaining_gene_ids, remaining_symbols, use_symbol=True
        )

# Save missing genes
missing_lnc_ids = set(lnc_df['lncRNA_ID']) - all_found_lncRNA_IDs
missing_df = lnc_df[lnc_df['lncRNA_ID'].isin(missing_lnc_ids)]
missing_df.to_csv('missing_lncRNAs_human_heart.csv', index=False)

print("Processing complete. Check 'missing_lncRNAs_human_heart.csv' for lncRNAs without transcripts.")


Parsing lncRNA_LncBookv2.0_GRCh38.gtf: 1412552 lines [00:10, 135137.41 lines/s]
Parsing NONCODEv6_human_hg38_lncRNA.gtf: 608746 lines [00:04, 128774.67 lines/s]
Parsing NONCODEv5_human_hg38_lncRNA.gtf: 601456 lines [00:04, 135874.02 lines/s]
Parsing Homo_sapiens.GRCh38.113.gtf: 4114455 lines [00:25, 161683.06 lines/s]
Parsing Homo_sapiens.GRCh38.112.gtf: 3464559 lines [00:17, 201826.56 lines/s]
Parsing Homo_sapiens.GRCh38.111.gtf: 3424902 lines [00:18, 189566.35 lines/s]
Parsing Homo_sapiens.GRCh38.110.gtf: 3421627 lines [00:16, 206360.74 lines/s]
Parsing Homo_sapiens.GRCh38.109.gtf: 3420366 lines [00:19, 179730.78 lines/s]
Parsing Homo_sapiens.GRCh38.108.gtf: 3409311 lines [00:16, 208457.69 lines/s]
Parsing Homo_sapiens.GRCh38.107.gtf: 3371249 lines [00:17, 192782.84 lines/s]
Parsing Homo_sapiens.GRCh38.106.gtf: 3279410 lines [00:10, 301738.75 lines/s]
Parsing Homo_sapiens.GRCh38.104.gtf: 3146137 lines [00:08, 366044.95 lines/s]
Parsing Homo_sapiens.GRCh38.97.gtf: 2877402 lines [00:09

Processing complete. Check 'missing_lncRNAs_human_heart.csv' for lncRNAs without transcripts.





In [31]:
import subprocess
import os

def run_gffcompare(annotated_gtf, reference_gtf, output_prefix):
    """
    Run gffcompare to compare transcript annotations against a reference.

    The outcome is reported on standard output: the tool's stdout on success,
    its stderr (or the launch error) on failure. Nothing is returned.

    Parameters:
    annotated_gtf (str): Path to the GTF file with your annotations.
    reference_gtf (str): Path to the reference GTF file.
    output_prefix (str): Prefix for output files generated by gffcompare.
    """
    # Construct the gffcompare command (argument list, so no shell is involved)
    command = [
        "gffcompare",
        "-r", reference_gtf,  # Reference GTF file
        "-o", output_prefix,  # Output prefix
        annotated_gtf         # Annotated GTF file
    ]

    # Execute the command. subprocess.run raises an OSError subclass (for
    # example FileNotFoundError) when the executable cannot be started, e.g.
    # because gffcompare is not installed or not on PATH; report that the same
    # way as a failing run instead of letting the traceback escape.
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        print("Error in gffcompare:")
        print(f"Could not launch gffcompare: {exc}")
        return

    # Check if gffcompare ran successfully
    if result.returncode == 0:
        print("gffcompare completed successfully.")
        print(result.stdout)
    else:
        print("Error in gffcompare:")
        print(result.stderr)

# Example usage: compare "human_heart.gtf" against the reference GTF named
# below (a GENCODE v47 lncRNA annotation, judging by its file name — confirm).
# Both paths are relative, so the files must be in the working directory.
annotated_gtf = "human_heart.gtf"
reference_gtf = "gencode.v47.long_noncoding_RNAs.gtf"
output_prefix = "gffc"  # prefix passed to gffcompare's -o option

run_gffcompare(annotated_gtf, reference_gtf, output_prefix)


gffcompare completed successfully.



In [28]:
import pandas as pd

def strict_matching(tmap_path):
    """Map query genes to a reference gene when all their transcripts agree.

    Reads a tab-separated table with a header row that provides the columns
    qry_gene_id, class_code and ref_gene_id. A query gene is kept only when
    every non-missing class_code of its rows is one of '=', 'j', 'k', 'e'
    and its rows name exactly one distinct non-missing ref_gene_id.

    Parameters:
    tmap_path (str): Path of the table to read.

    Returns:
    dict: qry_gene_id → the single ref_gene_id it matches.
    """
    tmap = pd.read_csv(tmap_path, sep='\t')

    # Class codes that count as a match
    accepted_codes = {'=', 'j', 'k', 'e'}

    matches = {}
    for gene_id, transcripts in tmap.groupby('qry_gene_id'):
        # Reject the gene as soon as one transcript has a code outside the accepted set
        observed_codes = set(transcripts['class_code'].dropna())
        if observed_codes - accepted_codes:
            continue

        # Reject genes whose transcripts point at zero or several reference genes
        reference_genes = transcripts['ref_gene_id'].dropna().unique()
        if len(reference_genes) != 1:
            continue

        matches[gene_id] = reference_genes[0]

    return matches

# Usage: run the strict matching on the tmap file named below
# (presumably the gffcompare output for human_heart.gtf with prefix "gffc" — confirm)
tmap_path = 'gffc.human_heart.gtf.tmap'
strict_matches = strict_matching(tmap_path)

# Save the results as a two-column CSV (query gene, matched reference gene)
results_df = pd.DataFrame(list(strict_matches.items()), columns=['Query Gene', 'Reference Gene'])
results_df.to_csv('human_heart_strict_matches.csv', index=False)
print("Strict matches saved to 'human_heart_strict_matches.csv'.")


Strict matches saved to 'human_heart_strict_matches.csv'.
