In [1]:
import pandas as pd

In [4]:
# Input GTF annotation handed to parse_gtf; the path is relative, so it is
# resolved against the kernel's current working directory.
gtf_file = "Drosophila_melanogaster.BDGP6.46.111.gtf"

def _gene_id_from_attributes(attr):
    """Return the ``gene_id`` value from a GTF attribute column, or None if absent."""
    for entry in attr.split(";"):
        # Split on any run of whitespace so 'gene_id  "X"' parses like 'gene_id "X"'.
        parts = entry.split(None, 1)
        # Compare the key exactly: a substring test would also accept look-alike
        # keys such as 'old_gene_id' or 'ref_gene_id' that precede the real one.
        if len(parts) == 2 and parts[0] == "gene_id":
            return parts[1].strip().strip('"') or None
    return None


def _merged_length(intervals):
    """Return the number of positions covered by the union of inclusive (start, end) intervals."""
    total = 0
    cur_start = cur_end = None
    for start, end in sorted(intervals):
        if cur_end is None or start > cur_end:
            # The interval opens a new run: bank the previous run, if any.
            if cur_end is not None:
                total += cur_end - cur_start + 1
            cur_start, cur_end = start, end
        else:
            # Overlaps the current run: extend it (never shrink it).
            cur_end = max(cur_end, end)
    if cur_end is not None:
        total += cur_end - cur_start + 1
    return total


def parse_gtf(gtf_path):
    """Compute the merged exon length of every gene in a GTF file.

    Only rows whose third column is ``exon`` are used. The exons of each gene
    are merged so that positions shared by overlapping exons are counted once;
    coordinates are treated as inclusive on both ends.

    Parameters
    ----------
    gtf_path : str or path-like
        Path of a tab-separated, uncompressed GTF text file.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``GeneID`` and ``Length``, ordered by the
        first appearance of each gene in the file.

    Raises
    ------
    ValueError
        If an exon row does not have exactly 9 tab-separated columns, or if its
        start/end columns are not integers.
    """
    exons_by_gene = {}
    with open(gtf_path) as f:
        for lineno, line in enumerate(f, start=1):
            record = line.strip()
            if not record or record.startswith("#"):
                continue  # blank line or comment/header
            fields = record.split("\t")
            # Rows too short to carry a feature column (e.g. 'track' headers)
            # cannot be exons, so skip them instead of failing on fields[2].
            if len(fields) < 3 or fields[2] != "exon":
                continue
            if len(fields) != 9:
                raise ValueError(
                    f"{gtf_path}: line {lineno}: expected 9 tab-separated "
                    f"columns in exon record, found {len(fields)}"
                )
            start, end = int(fields[3]), int(fields[4])
            gene_id = _gene_id_from_attributes(fields[8])
            if gene_id:
                exons_by_gene.setdefault(gene_id, []).append((start, end))

    gene_total_lengths = {
        gene_id: _merged_length(intervals)
        for gene_id, intervals in exons_by_gene.items()
    }

    return (
        pd.DataFrame.from_dict(gene_total_lengths, orient="index", columns=["Length"])
        .rename_axis("GeneID")
        .reset_index()
    )

In [5]:
# Build the per-gene merged exon length table, then persist it as CSV
# without writing the DataFrame index as a column.
df_lengths = parse_gtf(gtf_path=gtf_file)
df_lengths.to_csv(path_or_buf="gene_lengths.csv", index=False)