### Obtaining essential lncRNA genes in different tissues.

In [3]:
import pandas as pd

# Define tissues and model names
species = 'human'
if species == 'human':
    tissues = ['heart', 'lung', 'stomach']
else:
    tissues = ['heart', 'lung', 'brain']

model_names = ['SVM', 'MLP']

# The lncRNA annotation depends only on the species, so load it once
# instead of re-reading the same file for every tissue.
lnc = pd.read_csv(f"../../data/LPI/{species}/lncRNA.csv")
lnc = lnc[['lncRNA_ID', 'gene_id', 'symbol', 'chr', 'start', 'end', 'strand']]

for tissue in tissues:
    # One annotated DataFrame of predicted-essential lncRNAs per model
    all_predictions_ess = []

    for model in model_names:
        # Read prediction file for the current model (everything as str,
        # so the label comparison below is a string comparison)
        prediction = pd.read_csv(f"../../results/{species}/{model}_predictions_{tissue}.csv", dtype='str')

        # Keep only the IDs predicted essential (Pre_Label == '1')
        prediction_ess = prediction.loc[prediction['Pre_Label'] == '1', ['lncRNA_ID']]

        # Attach the annotation columns and save the per-model result
        prediction_ess = pd.merge(prediction_ess, lnc, on='lncRNA_ID', how="inner")
        prediction_ess.to_csv(f"../../results/{species}/{model}_{tissue}_ess.csv", index=False)

        all_predictions_ess.append(prediction_ess)

    # Intersection: keep rows of the first model whose lncRNA_ID was also
    # predicted essential by every other model.
    # DataFrame.merge returns a new frame and never modifies in place, so the
    # result must be assigned; filtering with isin() also avoids the duplicated
    # *_x / *_y annotation columns a frame-to-frame merge would create.
    intersection_ess = all_predictions_ess[0]
    for other_ess in all_predictions_ess[1:]:
        shared = intersection_ess['lncRNA_ID'].isin(other_ess['lncRNA_ID'])
        intersection_ess = intersection_ess[shared]

    # Save the result to a CSV file
    intersection_ess.to_csv(f"../../results/{species}/{tissue}_essential_genes.csv", index=False)

    print(f"Intersection essential genes for {tissue} saved successfully.")


Intersection essential genes for heart saved successfully.
Intersection essential genes for lung saved successfully.
Intersection essential genes for stomach saved successfully.


### Get union of lncRNA genes in different tissues.

In [5]:
import pandas as pd

# Define tissues
species = 'mouse'

if species == 'human':
    tissues = ['heart', 'lung', 'stomach']
else:
    tissues = ['heart', 'lung', 'brain']

# Read every tissue's essential-gene file, then concatenate once.
# (Growing a DataFrame with pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration.)
tissue_frames = [
    pd.read_csv(f"../../results/{species}/{tissue}_essential_genes.csv")
    for tissue in tissues
]
combined_df = pd.concat(tissue_frames, ignore_index=True)

# Drop duplicate rows based on all columns
combined_df = combined_df.drop_duplicates()

# Save the union result to CSV
combined_df.to_csv(f"../../results/{species}/{species}_essential_genes_union.csv", index=False)


### CSV2bed

In [7]:
import pandas as pd

# Convert a CSV annotation table into a 6-column BED file
def convert_csv_to_bed(csv_file_path, bed_file_path):
    """Write the rows of a CSV file as a tab-separated 6-column BED file.

    Parameters
    ----------
    csv_file_path : path of the input CSV; it must contain the columns
        'chr', 'start', 'end', 'lncRNA_ID' and 'strand'.
    bed_file_path : path of the BED file to write.

    Raises
    ------
    ValueError
        If any of the required columns is absent from the CSV.
    """
    table = pd.read_csv(csv_file_path)

    # Refuse to continue unless every column used below is present
    needed = ('chr', 'start', 'end', 'lncRNA_ID', 'strand')
    if any(col not in table.columns for col in needed):
        raise ValueError("Missing required columns in CSV.")

    # Column order: chrom, start, end, name, score, strand
    # NOTE(review): BED start coordinates are 0-based, while 'start' is copied
    # unchanged here - confirm the CSV coordinates already follow that convention.
    bed = pd.DataFrame({
        'chr': table['chr'],
        'start': table['start'].astype(int),
        'end': table['end'].astype(int),
        'name': table['lncRNA_ID'],
        'score': 0,  # constant placeholder score
        'strand': table['strand'],
    })

    # Tab-separated, without header row or index column
    bed.to_csv(bed_file_path, sep='\t', header=False, index=False)

# Convert the union table of the selected species into a BED file
species = 'human'

# Input: union CSV under the results tree; output: BED file in the working directory
csv_file_path = f'../../results/{species}/{species}_essential_genes_union.csv'
bed_file_path = f'{species}_essential_genes_union.bed'

convert_csv_to_bed(csv_file_path=csv_file_path, bed_file_path=bed_file_path)


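- Run `get_overlap.sh` to find duplicate (overlapping) essential lncRNA genes before running the next cell, which reads the overlapping gene pairs from `{species}_overlapping_genes.txt`.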

### Find groups of overlapping lncRNA genes

In [None]:
import pandas as pd
from collections import defaultdict

# === Merge overlapping genes based on BEDTools results ===
species = 'human'
# Input file paths
csv_file = f'../../results/{species}/{species}_essential_genes_union.csv'
overlap_file = f'{species}_overlapping_genes.txt'
output_file = f'deduplicated_{species}_essential_genes.csv'

# Load the original annotation table
df = pd.read_csv(csv_file)
df['length'] = df['end'] - df['start']  # Calculate gene length for selecting representatives

# Load the overlapping gene pairs: two whitespace-separated ID columns, no header.
# The separator is a raw string because '\s' is not a valid escape sequence in a
# normal string literal and triggers a warning on recent Python versions.
merge_pairs = pd.read_csv(overlap_file, sep=r'\s+', header=None, names=['A', 'B'])

# === Build union-find structure (disjoint set) to group overlapping genes ===
# Maps each gene to its parent; a gene that is its own parent is the root
# (leader) of its group.
parent = {}

def find(x):
    """Return the root of the group containing x, registering x if it is new.

    Implemented iteratively so that a long parent chain cannot exceed
    Python's recursion limit. Every node visited on the way is re-pointed
    directly at the root (path compression).
    """
    parent.setdefault(x, x)

    # First pass: walk up to the root
    root = x
    while parent[root] != root:
        root = parent[root]

    # Second pass: point every node on the path straight at the root
    while parent[x] != root:
        parent[x], x = root, parent[x]

    return root

def union(x, y):
    """Merge the groups of x and y; the root of x's group becomes the leader."""
    parent[find(y)] = find(x)

# Apply union for all overlapping pairs
for a, b in zip(merge_pairs['A'], merge_pairs['B']):
    union(a, b)

# Group all genes by their leader node in the union-find structure
groups = defaultdict(set)
for gene in set(merge_pairs['A']).union(merge_pairs['B']):
    groups[find(gene)].add(gene)

# === Determine representative gene per group: longest one ===
merge_map = {}  # representative lncRNA_ID → list of merged lncRNA_IDs
for group in groups.values():
    group_df = df[df['lncRNA_ID'].isin(group)]

    # None of the group's IDs is present in the annotation table
    if group_df.empty:
        print(f"Warning: Empty group found for genes: {group}")
        continue

    # Longest gene wins; idxmax returns the first row on ties
    rep_id = group_df.loc[group_df['length'].idxmax(), 'lncRNA_ID']
    # Sorted so the Merged_IDs column is identical from run to run
    # (set iteration order is not stable for strings)
    merge_map[rep_id] = sorted(group - {rep_id})

# === Build final output ===
# Drop every group member that is not its group's representative. The set is
# derived from merge_map rather than from one column of the pair file, so a
# shorter gene is removed regardless of which column it was listed in.
merged_ids = set().union(*merge_map.values())
final_ids = set(df['lncRNA_ID']) - merged_ids

# Filter the dataframe
df_merged = df[df['lncRNA_ID'].isin(final_ids)].copy()

# Add a column showing which IDs were merged into each representative
# (empty string for genes that absorbed nothing)
df_merged['Merged_IDs'] = df_merged['lncRNA_ID'].apply(lambda x: ';'.join(merge_map.get(x, [])))

# Drop the temporary length column
df_merged = df_merged.drop(columns='length')

# Save the result
df_merged.to_csv(output_file, index=False)
print(f"✅ Merge completed using longest gene per group. Output saved to: {output_file}")


### Mapping a tissue's essential lncRNA genes to their deduplicated representatives

In [17]:
import pandas as pd

species = 'human'
tissue = 'stomach'

# === Step 1: Rebuild the representative → merged-IDs mapping from disk ===
# Expected columns in the file: lncRNA_ID, Merged_IDs
merged_df = pd.read_csv(f"deduplicated_{species}_essential_genes.csv")

# merge_map: representative → [merged_IDs]; only rows with a non-blank
# Merged_IDs field contribute an entry
merge_map = {}
for record in merged_df.to_dict('records'):
    merged_field = record.get('Merged_IDs')
    if pd.isna(merged_field) or not merged_field.strip():
        continue
    merge_map[record['lncRNA_ID']] = merged_field.split(';')

# reverse_map: any group member (the representative included) → representative
reverse_map = {}
for rep, others in merge_map.items():
    for member in [rep, *others]:
        reverse_map[member] = rep

# === Step 2: Read the essential-gene list of the chosen tissue ===
tissue_df = pd.read_csv(f"../../results/{species}/{tissue}_essential_genes.csv")
lnc_ids = tissue_df['lncRNA_ID']


def _to_representative(lnc_id):
    """Return the representative of lnc_id, or lnc_id itself if it was never merged."""
    return reverse_map.get(lnc_id, lnc_id)


# === Step 3: Swap every ID for its representative ===
representative_ids = lnc_ids.apply(_to_representative)

# === Step 4: De-duplicate and write one ID per line ===
unique_reps = representative_ids.drop_duplicates()
unique_reps = unique_reps.to_frame(name='lncRNA_ID')
# header=None is falsy, so no header row is written (same effect as header=False)
unique_reps.to_csv(f"deduplicated_{species}_{tissue}_essential_genes.csv", index=False, header=None)
