In [25]:
import os
from collections import defaultdict

import gffutils
import pandas as pd
import pyranges as pr


In [26]:
# -----------------------------------------------------------------------------
# 1. Read the BED12 file into a pandas DataFrame
# -----------------------------------------------------------------------------
bed_file = "/Data1/emmab/software/code/data/L1/combined/L1_non_stringent2_full_length.bed"
cols = [
    "chrom",        # 0
    "chromStart",   # 1
    "chromEnd",     # 2
    "name",         # 3 (read ID)
    "score",        # 4
    "strand",       # 5
    "thickStart",   # 6
    "thickEnd",     # 7
    "itemRgb",      # 8
    "blockCount",   # 9
    "blockSizes",   # 10
    "blockStarts"   # 11
]

# Only the first 10,000 reads are analysed. Passing the limit to read_csv stops
# parsing there instead of loading the whole file and truncating afterwards.
MAX_READS = 10_000

df_bed = pd.read_csv(
    bed_file,
    sep="\t",
    header=None,
    names=cols,
    nrows=MAX_READS,
    # Pin the text columns: without this pandas infers integers for
    # blockSizes/blockStarts when every row has a single block (no comma),
    # and the later `.split(",")` on those fields fails.
    dtype={
        "chrom": str,
        "name": str,
        "strand": str,
        "blockSizes": str,
        "blockStarts": str,
    },
)

# Last expression of the cell, so the preview is actually rendered.
df_bed.head()

In [27]:

# -----------------------------------------------------------------------------
# 2. Expand BED12 reads into individual exons.
#    Each BED12 entry can have multiple exons ("blocks").
# -----------------------------------------------------------------------------
def expand_bed12(df):
    """Return a DataFrame of exons from BED12 entries.

    Each input row is one BED12 record; every block of that record becomes one
    output row whose coordinates are ``chromStart + blockStart`` and that value
    plus the block size (i.e. the same coordinate convention as the input).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``chrom``, ``strand``, ``chromStart``,
        ``blockCount``, ``blockSizes``, ``blockStarts`` and ``name``.
        ``blockSizes`` / ``blockStarts`` are comma-separated lists; a trailing
        comma is accepted, and so is a column that pandas parsed as integers.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, ``strand``, ``read_id``, one row
        per block. The columns are present even when ``df`` has no rows.

    Raises
    ------
    IndexError
        If a record lists fewer sizes/starts than its ``blockCount``.
    ValueError
        If a size/start entry is not an integer.
    """
    exon_columns = ["chrom", "start", "end", "strand", "read_id"]

    def _parse_int_list(field):
        # str() copes with a column pandas inferred as int (all single-block
        # rows); dropping empty pieces copes with the trailing comma that the
        # BED12 format allows ("247,213,").
        return [int(piece) for piece in str(field).split(",") if piece.strip()]

    exon_entries = []
    # Zipping the needed columns avoids building a Series per row (iterrows).
    records = zip(
        df["chrom"],
        df["strand"],
        df["chromStart"],
        df["blockCount"],
        df["blockSizes"],
        df["blockStarts"],
        df["name"],
    )
    for chrom, strand, start, block_count, sizes_field, starts_field, read_id in records:
        block_sizes = _parse_int_list(sizes_field)
        block_starts = _parse_int_list(starts_field)

        for i in range(block_count):
            exon_start = start + block_starts[i]
            exon_entries.append({
                "chrom": chrom,
                "start": exon_start,
                "end": exon_start + block_sizes[i],
                "strand": strand,
                "read_id": read_id
            })

    # Naming the columns explicitly keeps the schema stable for empty input,
    # so downstream groupby("read_id") still finds its column.
    return pd.DataFrame(exon_entries, columns=exon_columns)

# Flatten every BED12 record in df_bed into one row per exon block.
exons_bed = expand_bed12(df_bed)
# Preview the first exon rows (rendered as this cell's output).
exons_bed.head()

Unnamed: 0,chrom,start,end,strand,read_id
0,chrIII,7855133,7855380,+,15de427b-8540-4ce5-b0c7-42b46d5539df
1,chrIII,7855433,7855646,+,15de427b-8540-4ce5-b0c7-42b46d5539df
2,chrI,2876929,2877086,+,631fedea-2125-4c77-9be6-534e82449221
3,chrI,2877136,2877352,+,631fedea-2125-4c77-9be6-534e82449221
4,chrI,2877403,2877715,+,631fedea-2125-4c77-9be6-534e82449221


In [28]:

# -----------------------------------------------------------------------------
# 3. Group exons by read and define each isoform by (chrom, strand, exon intervals).
#    The "canonical" isoform key is built by sorting a read's exons by start and
#    recording their (start, end) intervals.
# -----------------------------------------------------------------------------
def build_isoform_key(group):
    """Summarise one read's exons as a hashable isoform key.

    ``group`` holds all exon rows of a single read. The key is
    ``(chrom, strand, ((start, end), ...))`` where chrom and strand are taken
    from the first row and the intervals are ordered by their start.
    """
    first_exon = group.iloc[0]

    # Pair up the coordinates, then order the pairs by start only.
    intervals = list(zip(group["start"], group["end"]))
    intervals.sort(key=lambda interval: interval[0])

    return (first_exon["chrom"], first_exon["strand"], tuple(intervals))

# Build the read_id -> isoform_key lookup, one key per group of exon rows
# sharing a read_id.
read_to_isoform = {
    read_id: build_isoform_key(grp)
    for read_id, grp in exons_bed.groupby("read_id")
}

# Attach the key to every BED record; a read name missing from the lookup
# raises KeyError.
df_bed["isoform_key"] = df_bed["name"].apply(lambda read_name: read_to_isoform[read_name])

# Tally reads per isoform, then order the table from most to least supported.
isoform_counts = df_bed.groupby("isoform_key").size().reset_index(name="read_count")
isoform_counts = isoform_counts.sort_values("read_count", ascending=False).reset_index(drop=True)
isoform_counts.head()

Unnamed: 0,isoform_key,read_count
0,"(chrI, -, ((2069085, 2069293), (2069342, 20695...",17
1,"(chrM, +, ((11366, 11691),))",14
2,"(chrI, +, ((4790285, 4790341), (4790412, 47906...",13
3,"(chrV, -, ((12727633, 12727859), (12727905, 12...",12
4,"(chrI, -, ((6220089, 6220205), (6220367, 62205...",12


In [None]:
# wget https://ftp.ebi.ac.uk/pub/databases/wormbase/parasite/releases/WBPS19/species/caenorhabditis_elegans/PRJNA13758/caenorhabditis_elegans.PRJNA13758.WBPS19.annotations.gff3.gz
# 
# import gzip
# import shutil
# import os
# 
# # Path to the gzipped file
# gzipped_file = "/Data1/reference/caenorhabditis_elegans.PRJNA13758.WBPS19.annotations.gff3.gz"
# output_file = gzipped_file.rstrip('.gz')  # Remove .gz extension for the output file
# 
# # Gunzip the file
# with gzip.open(gzipped_file, 'rb') as f_in:
#     with open(output_file, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)
# 
# # Delete the original gzipped file
# os.remove(gzipped_file)
# 
# print(f"Decompressed and removed: {gzipped_file}")
# print(f"Output file created: {output_file}")

# pass cell


In [None]:

# -----------------------------------------------------------------------------
# 4. Load reference GFF3 annotation using gffutils
# -----------------------------------------------------------------------------
gff_file = "/Data1/reference/caenorhabditis_elegans.PRJNA13758.WBPS19.annotations.gff3"

# On-disk gffutils database, created in the working directory on first run and
# reused afterwards. It is not rebuilt when gff_file changes: delete the file
# to force a rebuild.
db_file = 'c_elegans.db'

# `os` comes from the notebook's import cell; this cell previously relied on an
# import that only existed in commented-out code and failed on a fresh kernel.
# isfile (rather than exists) so a directory of the same name is not mistaken
# for a database.
if os.path.isfile(db_file):
    print(f"Loading existing gffutils database from '{db_file}'...")
    db = gffutils.FeatureDB(db_file, keep_order=True)
else:
    print(f"Creating gffutils database from '{gff_file}'...")
    db = gffutils.create_db(
        gff_file,
        dbfn=db_file,
        force=True,
        keep_order=True,
        merge_strategy='merge',
        sort_attribute_values=True
    )
    print(f"Database created and saved to '{db_file}'.")

In [ ]:
# Extract all gene features
gene_features = list(db.features_of_type('gene'))

# Build a mapping from gene ID to gene symbol
gene_id_to_symbol = {}
for gene in gene_features:
    gene_id = gene.id
    # Extract gene symbol from attributes; adjust the key if necessary
    # Common attribute keys: 'Name', 'gene', 'gene_symbol'
    gene_symbol = gene.attributes.get('Name', [gene.id])[0]  # Fallback to gene ID if Name not present
    gene_id_to_symbol[gene_id] = gene_symbol

# Extract all transcript features
# NOTE(review): some GFF3 annotations type their transcripts as mRNA / ncRNA /
# pseudogenic_transcript rather than 'transcript'. Confirm this query returns
# the expected number of features for this annotation.
transcript_features = list(db.features_of_type('transcript'))

# Build a mapping from transcript ID to gene ID and gene symbol
transcript_to_gene = {}
for transcript in transcript_features:
    transcript_id = transcript.id
    # Extract Parent gene ID from attributes; adjust the key if necessary
    # Common attribute keys: 'Parent', 'gene_id', 'gene'
    parent_gene_ids = transcript.attributes.get('Parent', [])
    if not parent_gene_ids:
        # Some GFF3 files might use 'gene' or 'gene_id' instead
        parent_gene_ids = transcript.attributes.get('gene', [])
    if not parent_gene_ids:
        # If still not found, use the transcript ID as gene ID (unlikely)
        parent_gene_ids = [transcript_id]
    # Assuming one parent gene per transcript
    parent_gene_id = parent_gene_ids[0]
    gene_symbol = gene_id_to_symbol.get(parent_gene_id, parent_gene_id)
    transcript_to_gene[transcript_id] = {
        "gene_id": parent_gene_id,
        "gene_symbol": gene_symbol
    }


def _to_ucsc_chrom(seqid):
    """Map an annotation seqid onto the 'chr'-prefixed names used by the reads.

    The BED reads are named chrI, chrIII, chrM, ... (see the BED previews).
    NOTE(review): this presumes the GFF3 uses bare names ("I", "MtDNA");
    confirm against the annotation. Names that already start with "chr" are
    returned unchanged, so the mapping is a no-op if the styles already agree.
    """
    if seqid.startswith("chr"):
        return seqid
    if seqid in ("MtDNA", "MT", "M"):
        return "chrM"
    return "chr" + seqid


# Index the annotation by the same key shape as the read isoform keys:
# (chrom, strand, ((exon_start, exon_end), ...)) sorted by exon start.
transcript_exons = defaultdict(list)
for transcript in transcript_features:
    transcript_id = transcript.id
    gene_info = transcript_to_gene.get(transcript_id, {"gene_id": "unknown", "gene_symbol": "unknown"})
    exons = list(db.children(transcript, featuretype='exon', order_by='start'))
    # GFF3 coordinates are 1-based and inclusive, whereas the read keys are
    # built from BED fields (chromStart + blockStarts), i.e. 0-based and
    # half-open. Shifting the start by one puts both in the same system;
    # without it no annotated transcript can ever equal a read-derived key.
    exon_coords = sorted(((e.start - 1, e.end) for e in exons), key=lambda x: x[0])
    key = (_to_ucsc_chrom(transcript.chrom), transcript.strand, tuple(exon_coords))
    # Store transcript ID and gene symbol
    transcript_exons[key].append({
        "transcript_id": transcript_id,
        "gene_symbol": gene_info["gene_symbol"]
    })


In [None]:
def classify_isoform(iso_key):
    """Classify an isoform key as known or novel against ``transcript_exons``.

    Parameters
    ----------
    iso_key : tuple
        ``(chrom, strand, ((exon_start, exon_end), ...))`` as produced for the
        reads; it is looked up verbatim in the module-level ``transcript_exons``.

    Returns
    -------
    dict
        ``classification`` is ``"known"`` when at least one annotated
        transcript has exactly this key, otherwise ``"novel"``.
        ``matching_transcripts`` is the ';'-joined transcript IDs in stored
        order. ``gene_symbols`` is the ';'-joined unique gene symbols in
        sorted order. Both strings are empty for novel isoforms.
    """
    # .get() never inserts into the defaultdict, and an empty list (e.g. left
    # behind by an accidental transcript_exons[key] access) counts as no match.
    matching_info = transcript_exons.get(iso_key)
    if not matching_info:
        return {
            "classification": "novel",
            "matching_transcripts": "",
            "gene_symbols": ""
        }

    transcript_ids = [info["transcript_id"] for info in matching_info]
    # Sorted so the joined string is identical from run to run; iterating a
    # plain set of strings depends on per-process hash randomisation.
    gene_symbols = sorted({info["gene_symbol"] for info in matching_info})
    return {
        "classification": "known",
        "matching_transcripts": ";".join(transcript_ids),
        "gene_symbols": ";".join(gene_symbols)
    }

# Classify every counted isoform, keeping the row order of isoform_counts,
# and collect one flat record per isoform.
classification_results = []
for iso_key, read_count in zip(isoform_counts["isoform_key"], isoform_counts["read_count"]):
    class_info = classify_isoform(iso_key)
    classification_results.append(dict(
        isoform_key=iso_key,
        read_count=read_count,
        classification=class_info["classification"],
        matching_transcripts=class_info["matching_transcripts"],
        gene_symbols=class_info["gene_symbols"],
    ))

# Build the frame once from the collected records.
df_classification = pd.DataFrame(classification_results)


In [ ]:
# Show first 10 isoforms
# (explicit display() because this is not the last expression of the cell)
display(df_classification.head(10))

# Save the table as CSV if desired
# The file name is relative, so it is written to the current working directory.
output_csv = "isoform_classification_results_with_genes.csv"
# index=False leaves the positional row index out of the file.
df_classification.to_csv(output_csv, index=False)
print(f"Classification results with gene symbols saved to '{output_csv}'.")
