### Step 1: Get transcripts for lncRNA genes

In [None]:
# Human
import pandas as pd
import os
import re

# Location of the Ensembl transcript tables (one file per release)
ensembl_dir = "../../reference_lncRNA/human/transcript/ensembl/"

# NONCODE (v5 and v6) gene-to-transcript tables: tab-separated, no header row
cols = ['gene_id', 'transcript_id']
noncodev5_trans = pd.read_csv(
    '../../reference_lncRNA/human/transcript/NONCODEv5_human_hg38_lncRNA_trans.txt',
    names=cols, header=None, sep='\t',
)
noncodev6_trans = pd.read_csv(
    '../../reference_lncRNA/human/transcript/NONCODEv6_human_hg38_lncRNA_trans.txt',
    names=cols, header=None, sep='\t',
)

# lncRNA identifiers to resolve, restricted to the three columns used below
lncRNA = pd.read_csv('../../data/LPI/human/lncRNA.csv')[['gene_id','gene_name', 'identifier']]

# Working copy that shrinks as identifiers get matched to transcripts
remained_lncRNA = lncRNA.copy()

# Function to remove matched rows
def update_remained_lncRNA(df, matched_df):
    """Return the rows of *df* whose 'identifier' does not occur in *matched_df*.

    Only the 'gene_id', 'gene_name' and 'identifier' columns are kept.
    """
    already_matched = df['identifier'].isin(matched_df['identifier'])
    return df.loc[~already_matched, ['gene_id', 'gene_name', 'identifier']]

# One merged DataFrame is collected here per reference source
results = []

# NONCODE lookup: v6 is consulted first, so v5 only sees identifiers v6 did not resolve
trans_lnc_noncodev6 = remained_lncRNA.merge(
    noncodev6_trans, how='inner', left_on='identifier', right_on='gene_id'
)
results.append(trans_lnc_noncodev6)
remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_noncodev6)

trans_lnc_noncodev5 = remained_lncRNA.merge(
    noncodev5_trans, how='inner', left_on='identifier', right_on='gene_id'
)
results.append(trans_lnc_noncodev5)
remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_noncodev5)

# Extract version numbers and sort filenames in descending order
def extract_version(filename):
    """Return the number following 'GRCh38.' in a '*_trans.txt' name, or -1 if absent."""
    hit = re.search(r'GRCh38\.(\d+)_trans\.txt', filename)
    if hit is None:
        return -1  # No recognizable version in the name
    return int(hit.group(1))

# Every .txt file in ensembl_dir is treated as a transcript table. Names that
# extract_version cannot parse get -1 and therefore sort after all versioned files.
ensembl_files = [f for f in os.listdir(ensembl_dir) if f.endswith(".txt")]
sorted_ensembl_files = sorted(ensembl_files, key=extract_version, reverse=True)  # Sort by version number (descending)

# Pass 1: match the still-unresolved identifiers against the Ensembl gene_id column.
# Iterate through sorted ensembl transcript files
for txt_file in sorted_ensembl_files:
    file_path = os.path.join(ensembl_dir, txt_file)

    # Read ensembl transcript file
    ensembl_trans = pd.read_csv(
        file_path, sep='\t', header=None,
        names=['gene_id', 'gene_name', 'transcript_id']
    )

    trans_lnc_ensembl = pd.merge(
        remained_lncRNA,
        ensembl_trans[['gene_id', 'transcript_id']],
        left_on='identifier', right_on='gene_id', how='inner'
    )
    results.append(trans_lnc_ensembl)
    # Identifiers resolved by this file are not looked up again in later files
    remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_ensembl)
    
# Pass 2: re-read the same files and match whatever is still unresolved against
# the gene_name column. This runs only after pass 1 has gone through every file.
for txt_file in sorted_ensembl_files:
    file_path = os.path.join(ensembl_dir, txt_file)

    # Read ensembl transcript file
    ensembl_trans = pd.read_csv(
        file_path, sep='\t', header=None,
        names=['gene_id', 'gene_name', 'transcript_id']
    )
    
    trans_lnc_symbol = pd.merge(
        remained_lncRNA,
        ensembl_trans[['gene_name', 'transcript_id']],
        left_on='identifier', right_on='gene_name', how='inner'
    )
    results.append(trans_lnc_symbol)

    # --- Update remained set ---
    remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_symbol)

# Combine all results into a single DataFrame
trans_lnc = pd.concat(results, ignore_index=True)
# Only the identifier -> transcript pairing is kept for the output
trans_lnc = trans_lnc[['identifier', 'transcript_id']]

# Save results to file
# NOTE(review): this file is written WITH a header row, while the mouse cell
# passes header=None for its lnc_trans.csv. Anything that reads
# ./human/lnc_trans.csv must account for the header line - confirm which
# format is intended.
trans_lnc.drop_duplicates().to_csv('./human/lnc_trans.csv', index=False, header=['identifier','transcript_id'])

# Identifiers that no reference source could resolve
remained_lncRNA.to_csv("./human/no_trans.csv", index=False)

print("Processing complete. Results saved.")


Processing complete. Results saved.


In [2]:
# Mouse

import pandas as pd
import os
import re

# Location of the Ensembl transcript tables (one file per release)
ensembl_dir = "../../reference_lncRNA/mouse/transcript/ensembl/"

# NONCODE v5 gene-to-transcript table: tab-separated, no header row
cols = ['gene_id', 'transcript_id']
noncodev5_trans = pd.read_csv(
    '../../reference_lncRNA/mouse/transcript/NONCODEv5_mouse_mm10_lncRNA_trans.txt',
    names=cols, header=None, sep='\t',
)

# lncRNA identifiers to resolve, restricted to the three columns used below
lncRNA = pd.read_csv('../../data/LPI/mouse/lncRNA.csv')[['gene_id','gene_name', 'identifier']]

# Working copy that shrinks as identifiers get matched to transcripts
remained_lncRNA = lncRNA.copy()

# Function to remove matched rows
def update_remained_lncRNA(df, matched_df):
    """Drop from *df* every row whose 'identifier' appears in *matched_df*.

    The result carries only the 'gene_id', 'gene_name' and 'identifier' columns.
    """
    keep_columns = ['gene_id', 'gene_name', 'identifier']
    still_unmatched = ~df['identifier'].isin(matched_df['identifier'])
    return df.loc[still_unmatched, keep_columns]

# One merged DataFrame is collected here per reference source
results = []

# NONCODE v5 lookup: identifiers are compared with the NONCODE gene_id column
trans_lnc_noncodev5 = remained_lncRNA.merge(
    noncodev5_trans, how='inner', left_on='identifier', right_on='gene_id'
)
results.append(trans_lnc_noncodev5)
remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_noncodev5)

# Extract version numbers and sort filenames in descending order
def extract_version(filename):
    """Return the number following 'GRCm38.' in a '*_trans.txt' name, or -1 if absent."""
    hit = re.search(r'GRCm38\.(\d+)_trans\.txt', filename)
    if hit is None:
        return -1  # No recognizable version in the name
    return int(hit.group(1))

# Only files named Mus_musculus.GRCm38.<digits>_trans.txt are considered
ensembl_files = [f for f in os.listdir(ensembl_dir) if re.match(r"Mus_musculus\.GRCm38\.\d+_trans\.txt$", f)]
sorted_ensembl_files = sorted(ensembl_files, key=extract_version, reverse=True)  # Sort by version number (descending)

# Pass 1: match the still-unresolved identifiers against the Ensembl gene_id column.
# Iterate through sorted ensembl transcript files
for txt_file in sorted_ensembl_files:
    file_path = os.path.join(ensembl_dir, txt_file)

    # Read ensembl transcript file
    ensembl_trans = pd.read_csv(
        file_path, sep='\t', header=None,
        names=['gene_id', 'gene_name', 'transcript_id']
    )

    trans_lnc_ensembl = pd.merge(
        remained_lncRNA,
        ensembl_trans[['gene_id', 'transcript_id']],
        left_on='identifier', right_on='gene_id', how='inner'
    )
    results.append(trans_lnc_ensembl)
    # Identifiers resolved by this file are not looked up again in later files
    remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_ensembl)
    
# Pass 2: re-read the same files and match whatever is still unresolved against
# the gene_name column. This runs only after pass 1 has gone through every file.
for txt_file in sorted_ensembl_files:
    file_path = os.path.join(ensembl_dir, txt_file)

    # Read ensembl transcript file
    ensembl_trans = pd.read_csv(
        file_path, sep='\t', header=None,
        names=['gene_id', 'gene_name', 'transcript_id']
    )
    
    trans_lnc_symbol = pd.merge(
        remained_lncRNA,
        ensembl_trans[['gene_name', 'transcript_id']],
        left_on='identifier', right_on='gene_name', how='inner'
    )
    results.append(trans_lnc_symbol)

    # --- Update remained set ---
    remained_lncRNA = update_remained_lncRNA(remained_lncRNA, trans_lnc_symbol)

# Combine all results into a single DataFrame
trans_lnc = pd.concat(results, ignore_index=True)
# Only the identifier -> transcript pairing is kept for the output
trans_lnc = trans_lnc[['identifier', 'transcript_id']]

# Save results to file
# NOTE(review): header=None is presumably meant to suppress the header row
# (the human cell writes one) - confirm; header=False is the documented spelling.
trans_lnc.drop_duplicates().to_csv('./mouse/lnc_trans.csv', index=False, header=None)

# Identifiers that no reference source could resolve
remained_lncRNA.to_csv("./mouse/no_trans.csv", index=False)

print("Processing complete. Results saved.")


Processing complete. Results saved.


### Step 2: Obtain the transcript sequences and remove genes that have any transcript missing or longer than 20,000 nt

In [None]:
# Human
from Bio import SeqIO
import pandas as pd
import os
import re

# Upper bound on transcript length, in sequence characters
MAX_SEQUENCE_LENGTH = 20_000  # A gene is removed if any of its transcripts is longer than this

# Load the mapping file and initialize dictionaries
def load_transcript_ids(mapping_file):
    """Load the identifier-to-transcript mapping produced by Step 1.

    The file is a two-column CSV (identifier, transcript ID). A leading header
    line reading ``identifier,transcript_id`` is tolerated and skipped, so the
    file may be supplied with or without it.

    Args:
        mapping_file: Path of the CSV mapping file.

    Returns:
        A tuple ``(gene_to_transcripts, transcript_to_gene)`` where
        ``gene_to_transcripts`` maps each identifier to the set of its
        transcript IDs and ``transcript_to_gene`` maps each transcript ID to
        the identifier of the last row that listed it.
    """
    columns = ['identifier', 'transcript_id']
    df = pd.read_csv(mapping_file, dtype=str, header=None, names=columns)

    # The human Step 1 cell writes a header line; because the file is read with
    # header=None that line would otherwise become a bogus gene "identifier".
    if not df.empty and [str(value).strip() for value in df.iloc[0]] == columns:
        df = df.iloc[1:]

    gene_to_transcripts = {}  # Store mapping from gene ID to transcript ID set
    transcript_to_gene = {}   # Store mapping from transcript ID to gene ID

    for raw_gene, raw_transcript in zip(df['identifier'], df['transcript_id']):
        gene_id = raw_gene.strip()
        transcript_id = raw_transcript.strip()
        gene_to_transcripts.setdefault(gene_id, set()).add(transcript_id)
        transcript_to_gene[transcript_id] = gene_id

    return gene_to_transcripts, transcript_to_gene

# Fetch sequences for transcript IDs from a list of FASTA files
def fetch_sequences(fasta_files, gene_to_transcripts):
    """Collect transcript sequences and drop genes that are incomplete or oversized.

    The FASTA files are scanned in the given order and each wanted transcript
    ID is taken from the first record that carries it. A gene is discarded
    when any of its transcripts is longer than the module-level
    ``MAX_SEQUENCE_LENGTH`` or was not found in any file.

    Args:
        fasta_files: Paths of the FASTA files, in lookup order.
        gene_to_transcripts: Mapping of gene identifier to a collection of
            transcript IDs.

    Returns:
        A tuple ``(found_transcripts, filtered_gene_to_transcripts)``:
        transcript ID -> sequence for the transcripts that survive the
        filtering, and *gene_to_transcripts* restricted to the retained genes.
    """
    found_transcripts = {}  # Store found transcript sequences
    oversized_genes = set() # Store genes that contain transcripts exceeding MAX_SEQUENCE_LENGTH

    # Reverse index built from the argument itself, so the function does not
    # depend on a module-level transcript_to_gene dict and every gene that
    # lists an oversized transcript is flagged (not just the last one loaded).
    transcript_to_genes = {}
    for gene_id, transcript_ids in gene_to_transcripts.items():
        for tid in transcript_ids:
            transcript_to_genes.setdefault(tid, set()).add(gene_id)

    # Transcript IDs that still have to be located
    needed_ids = set(transcript_to_genes)

    for fasta_path in fasta_files:
        print(f"Checking file: {fasta_path}")  # Debugging output
        if not needed_ids:  # Stop early if all sequences have been found
            break
        for record in SeqIO.parse(fasta_path, "fasta"):
            if record.id not in needed_ids:
                continue
            if len(record.seq) > MAX_SEQUENCE_LENGTH:
                oversized_genes.update(transcript_to_genes[record.id])  # Mark genes for removal
            else:
                found_transcripts[record.id] = record.seq  # Store valid transcript
            needed_ids.remove(record.id)  # Later occurrences of this ID are ignored

    # A gene counts as missing when ANY of its transcripts has no stored sequence
    # (oversized transcripts are never stored, so their genes land here as well).
    missing_genes = {
        gene_id
        for gene_id, transcript_ids in gene_to_transcripts.items()
        if not all(tid in found_transcripts for tid in transcript_ids)
    }

    # Combine genes to remove: those missing and those with oversized transcripts
    genes_to_remove = missing_genes | oversized_genes

    # NOTE(review): a transcript shared with a retained gene is removed here too,
    # so that retained gene can end up with fewer (or no) sequences - confirm
    # this is intended.
    for gene_id in genes_to_remove:
        for tid in gene_to_transcripts[gene_id]:
            found_transcripts.pop(tid, None)

    # Filter out removed genes from the gene-to-transcript dictionary
    filtered_gene_to_transcripts = {
        gene: trans
        for gene, trans in gene_to_transcripts.items()
        if gene not in genes_to_remove
    }

    return found_transcripts, filtered_gene_to_transcripts

# Write the found sequences to a new FASTA file
def write_fasta(sequences, output_file):
    """Write each (transcript ID, sequence) pair of *sequences* to *output_file* in FASTA format.

    Records are written in the iteration order of *sequences*, each with an
    empty description.
    """
    records = (
        SeqIO.SeqRecord(seq, id=trans_id, description="")
        for trans_id, seq in sequences.items()
    )
    with open(output_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")

# Write the filtered gene-to-transcript mappings to a new file
def write_filtered_mapping(filtered_gene_to_transcripts, output_mapping_file, sequences):
    """Write 'gene,transcript' lines for the transcripts that have a sequence.

    One line is emitted per (gene, transcript) pair of
    *filtered_gene_to_transcripts* whose transcript ID is a key of
    *sequences*; pairs without a sequence are skipped. No header is written.
    """
    with open(output_mapping_file, "w") as out:
        out.writelines(
            f"{gene_id},{tid}\n"
            for gene_id, transcript_ids in filtered_gene_to_transcripts.items()
            for tid in transcript_ids
            if tid in sequences
        )

# Inputs for this step: the processed FASTA references ...
base_directory = "../../reference_lncRNA/human/fasta/"
processed_ensembl_dir = "../../reference_lncRNA/human/fasta/processed_ensembl/"
# ... and the identifier-to-transcript mapping written by Step 1
mapping_file = "./human/lnc_trans.csv"

def extract_version(filename):
    """Return the digits following the first 'v<digits>' in *filename* as an int, or -1 if none."""
    hit = re.search(r'v(\d+)', filename)
    if hit is None:
        return -1  # No 'v<digits>' marker in the name
    return int(hit.group(1))

# Processed Ensembl FASTA files, ordered newest release first
processed_ensembl_fa_files = [
    name for name in os.listdir(processed_ensembl_dir) if name.endswith(".fa")
]
sorted_ensembl_files = [
    os.path.join(processed_ensembl_dir, name)
    for name in sorted(processed_ensembl_fa_files, key=extract_version, reverse=True)
]

# Lookup order: NONCODE v6, NONCODE v5, then the Ensembl releases
fasta_files = [
    os.path.join(base_directory, noncode_name)
    for noncode_name in ("NONCODEv6_human_processed.fa", "NONCODEv5_human_processed.fa")
]
fasta_files += sorted_ensembl_files

# Output locations
output_fasta_file = "./human/transcript_sequences.fasta"
output_mapping_file = "./human/filtered_lnc_trans.csv"

# Read the identifier/transcript mapping produced by Step 1
gene_to_transcripts, transcript_to_gene = load_transcript_ids(mapping_file)

# Look up the sequences; genes with missing or oversized transcripts are dropped
sequences, filtered_gene_to_transcripts = fetch_sequences(fasta_files, gene_to_transcripts)

# Persist the retained sequences and the matching gene/transcript pairs
write_fasta(sequences, output_fasta_file)
write_filtered_mapping(filtered_gene_to_transcripts, output_mapping_file, sequences)


In [5]:
# Mouse
from Bio import SeqIO
import pandas as pd
import os
import re

# Upper bound on transcript length, in sequence characters
MAX_SEQUENCE_LENGTH = 20_000  # A gene is removed if any of its transcripts is longer than this

# Load the mapping file and initialize dictionaries
def load_transcript_ids(mapping_file):
    """Load a header-less two-column CSV of (lncRNA ID, transcript ID) pairs.

    Returns a tuple ``(gene_to_transcripts, transcript_to_gene)``: the first
    maps each lncRNA ID to the set of its transcript IDs, the second maps each
    transcript ID to the lncRNA ID of the last row that listed it. Both IDs
    are stripped of surrounding whitespace.
    """
    table = pd.read_csv(mapping_file, dtype=str, header=None, names=['lncRNA_id', 'transcript_id'])

    gene_to_transcripts = {}  # lncRNA ID -> set of transcript IDs
    transcript_to_gene = {}   # transcript ID -> lncRNA ID

    for raw_gene, raw_transcript in zip(table['lncRNA_id'], table['transcript_id']):
        gene_id = raw_gene.strip()
        transcript_id = raw_transcript.strip()
        gene_to_transcripts.setdefault(gene_id, set()).add(transcript_id)
        transcript_to_gene[transcript_id] = gene_id

    return gene_to_transcripts, transcript_to_gene

# Fetch sequences for transcript IDs from a list of FASTA files
def fetch_sequences(fasta_files, gene_to_transcripts):
    """Collect transcript sequences and drop genes that are incomplete or oversized.

    The FASTA files are scanned in the given order and each wanted transcript
    ID is taken from the first record that carries it. A gene is discarded
    when any of its transcripts is longer than the module-level
    ``MAX_SEQUENCE_LENGTH`` or was not found in any file.

    Args:
        fasta_files: Paths of the FASTA files, in lookup order.
        gene_to_transcripts: Mapping of gene identifier to a collection of
            transcript IDs.

    Returns:
        A tuple ``(found_transcripts, filtered_gene_to_transcripts)``:
        transcript ID -> sequence for the transcripts that survive the
        filtering, and *gene_to_transcripts* restricted to the retained genes.
    """
    found_transcripts = {}  # Store found transcript sequences
    oversized_genes = set() # Store genes that contain transcripts exceeding MAX_SEQUENCE_LENGTH

    # Reverse index built from the argument itself, so the function does not
    # depend on a module-level transcript_to_gene dict and every gene that
    # lists an oversized transcript is flagged (not just the last one loaded).
    transcript_to_genes = {}
    for gene_id, transcript_ids in gene_to_transcripts.items():
        for tid in transcript_ids:
            transcript_to_genes.setdefault(tid, set()).add(gene_id)

    # Transcript IDs that still have to be located
    needed_ids = set(transcript_to_genes)

    for fasta_path in fasta_files:
        print(f"Checking file: {fasta_path}")  # Debugging output
        if not needed_ids:  # Stop early if all sequences have been found
            break
        for record in SeqIO.parse(fasta_path, "fasta"):
            if record.id not in needed_ids:
                continue
            if len(record.seq) > MAX_SEQUENCE_LENGTH:
                oversized_genes.update(transcript_to_genes[record.id])  # Mark genes for removal
            else:
                found_transcripts[record.id] = record.seq  # Store valid transcript
            needed_ids.remove(record.id)  # Later occurrences of this ID are ignored

    # A gene counts as missing when ANY of its transcripts has no stored sequence
    # (oversized transcripts are never stored, so their genes land here as well).
    missing_genes = {
        gene_id
        for gene_id, transcript_ids in gene_to_transcripts.items()
        if not all(tid in found_transcripts for tid in transcript_ids)
    }

    # Combine genes to remove: those missing and those with oversized transcripts
    genes_to_remove = missing_genes | oversized_genes

    # NOTE(review): a transcript shared with a retained gene is removed here too,
    # so that retained gene can end up with fewer (or no) sequences - confirm
    # this is intended.
    for gene_id in genes_to_remove:
        for tid in gene_to_transcripts[gene_id]:
            found_transcripts.pop(tid, None)

    # Filter out removed genes from the gene-to-transcript dictionary
    filtered_gene_to_transcripts = {
        gene: trans
        for gene, trans in gene_to_transcripts.items()
        if gene not in genes_to_remove
    }

    return found_transcripts, filtered_gene_to_transcripts

# Write the found sequences to a new FASTA file
def write_fasta(sequences, output_file):
    """Write each (transcript ID, sequence) pair of *sequences* to *output_file* in FASTA format.

    Records are written in the iteration order of *sequences*, each with an
    empty description.
    """
    records = (
        SeqIO.SeqRecord(seq, id=trans_id, description="")
        for trans_id, seq in sequences.items()
    )
    with open(output_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")

# Write the filtered gene-to-transcript mappings to a new file
def write_filtered_mapping(filtered_gene_to_transcripts, output_mapping_file, sequences):
    """Write 'gene,transcript' lines for the transcripts that have a sequence.

    One line is emitted per (gene, transcript) pair of
    *filtered_gene_to_transcripts* whose transcript ID is a key of
    *sequences*; pairs without a sequence are skipped. No header is written.
    """
    with open(output_mapping_file, "w") as out:
        out.writelines(
            f"{gene_id},{tid}\n"
            for gene_id, transcript_ids in filtered_gene_to_transcripts.items()
            for tid in transcript_ids
            if tid in sequences
        )

# Inputs for this step: the processed FASTA references ...
base_directory = "../../reference_lncRNA/mouse/fasta/"
processed_ensembl_dir = "../../reference_lncRNA/mouse/fasta/processed_ensembl/"
# ... and the identifier-to-transcript mapping written by Step 1
mapping_file = "./mouse/lnc_trans.csv"

def extract_version(filename):
    """Return the digits following the first 'v<digits>' in *filename* as an int, or -1 if none."""
    hit = re.search(r'v(\d+)', filename)
    if hit is None:
        return -1  # No 'v<digits>' marker in the name
    return int(hit.group(1))

# Processed Ensembl FASTA files, ordered newest release first
processed_ensembl_fa_files = [
    name for name in os.listdir(processed_ensembl_dir) if name.endswith(".fa")
]
sorted_ensembl_files = [
    os.path.join(processed_ensembl_dir, name)
    for name in sorted(processed_ensembl_fa_files, key=extract_version, reverse=True)
]

# Lookup order: NONCODE v6, NONCODE v5, then the Ensembl releases
fasta_files = [
    os.path.join(base_directory, noncode_name)
    for noncode_name in ("NONCODEv6_mouse_processed.fa", "NONCODEv5_mouse_processed.fa")
]
fasta_files += sorted_ensembl_files

# Output locations
output_fasta_file = "./mouse/transcript_sequences.fasta"
output_mapping_file = "./mouse/filtered_lnc_trans.csv"

# Read the identifier/transcript mapping produced by Step 1
gene_to_transcripts, transcript_to_gene = load_transcript_ids(mapping_file)

# Look up the sequences; genes with missing or oversized transcripts are dropped
sequences, filtered_gene_to_transcripts = fetch_sequences(fasta_files, gene_to_transcripts)

# Persist the retained sequences and the matching gene/transcript pairs
write_fasta(sequences, output_fasta_file)
write_filtered_mapping(filtered_gene_to_transcripts, output_mapping_file, sequences)



Checking file: ../../reference_lncRNA/mouse/fasta/NONCODEv6_mouse_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/NONCODEv5_mouse_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v100.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v99.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v98.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v97.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v96.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v95.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ensembl/Mus_musculus.GRCm38.v94.ncrna_processed.fa
Checking file: ../../reference_lncRNA/mouse/fasta/processed_ense