In [83]:
#!/usr/bin/env python3
# coding: utf-8

"""
2020/10/09
author:guisen chen
email:thecgs001@foxmail.com
"""

import re
from collections import defaultdict

def _gff_attribute(attributes, key):
    """Return the value of *key* in a GFF3 attribute column, or None if absent.

    The key must start the column or follow a ';' so that e.g. ``ID=`` is not
    matched inside ``locus_ID=``.  The value ends at the next ';' or at the
    end of the column, so a final attribute without a trailing ';' is found.
    """
    match = re.search(r"(?:^|;)\s*" + re.escape(key) + r"=([^;]*)", attributes)
    return match.group(1).strip() if match else None


def parse_gff(file_gff):
    """Parse gene, mRNA/transcript, CDS and exon records from a GFF3 file.

    Args:
        file_gff: path of the GFF3 file to read.

    Returns:
        A 4-tuple ``(gene_ID_dict, mRNA_ID_dict, ID_info_dict,
        mRNA_length_dict)``:

        * gene_ID_dict: gene ID -> list of mRNA/transcript IDs whose
          ``Parent`` is that gene.
        * mRNA_ID_dict: mRNA ID -> list of CDS IDs (each listed once) and
          exon IDs whose ``Parent`` is that mRNA, in file order.
        * ID_info_dict: feature ID -> first seven GFF columns (seqid to
          strand).  For gene, mRNA and exon this is one list of strings; for
          CDS it is a list of such lists, one per CDS line sharing the ID.
        * mRNA_length_dict: mRNA ID -> summed length (end - start + 1) of all
          CDS lines whose ``Parent`` is that mRNA.

    Raises:
        ValueError: if a gene/mRNA/transcript/CDS/exon record lacks the
            ``ID`` or ``Parent`` attribute this parser needs.
    """
    gene_ID_dict = defaultdict(list)     # gene id -> mRNA ids
    mRNA_ID_dict = defaultdict(list)     # mRNA id -> CDS and exon ids
    ID_info_dict = defaultdict(list)     # feature id -> columns 1-7
    CDS_length_dict = defaultdict(list)  # mRNA id -> individual CDS lengths
    mRNA_length_dict = defaultdict(int)  # mRNA id -> total CDS length

    def required(attributes, key, line_number):
        # Fail with context instead of an AttributeError on None.
        value = _gff_attribute(attributes, key)
        if value is None:
            raise ValueError(
                "line %d of %s: missing %s attribute in %r"
                % (line_number, file_gff, key, attributes)
            )
        return value

    # 'with' guarantees the handle is closed even if parsing raises.
    with open(file_gff, 'r') as handle:
        for line_number, line in enumerate(handle, 1):
            if line.startswith('#') or not line.strip():
                continue
            line_list = line.strip().split("\t")
            # Anything without the nine GFF columns (e.g. sequence lines of
            # a trailing ##FASTA section) is not a feature record.
            if len(line_list) < 9:
                continue

            feature_type = line_list[2]
            attributes = line_list[8]

            if feature_type == "gene":
                gene_ID = required(attributes, "ID", line_number)
                ID_info_dict[gene_ID] = line_list[0:7]

            elif feature_type in ("mRNA", "transcript"):
                mRNA_ID = required(attributes, "ID", line_number)
                gene_ID = required(attributes, "Parent", line_number)
                gene_ID_dict[gene_ID].append(mRNA_ID)
                ID_info_dict[mRNA_ID] = line_list[0:7]

            elif feature_type == "CDS":
                CDS_ID = required(attributes, "ID", line_number)
                mRNA_ID = required(attributes, "Parent", line_number)
                # A multi-segment CDS repeats its ID on every line; list the
                # ID once per mRNA but keep the coordinates of every segment.
                if CDS_ID not in mRNA_ID_dict[mRNA_ID]:
                    mRNA_ID_dict[mRNA_ID].append(CDS_ID)
                ID_info_dict[CDS_ID].append(line_list[0:7])
                CDS_length = int(line_list[4]) - int(line_list[3]) + 1
                CDS_length_dict[mRNA_ID].append(CDS_length)

            elif feature_type == "exon":
                exon_ID = required(attributes, "ID", line_number)
                mRNA_ID = required(attributes, "Parent", line_number)
                mRNA_ID_dict[mRNA_ID].append(exon_ID)
                ID_info_dict[exon_ID] = line_list[0:7]

    for mRNA_ID, CDS_length_list in CDS_length_dict.items():
        mRNA_length_dict[mRNA_ID] = sum(CDS_length_list)

    return gene_ID_dict, mRNA_ID_dict, ID_info_dict, mRNA_length_dict

if __name__ == '__main__':
    # Script entry point: parse the example genome annotation and print the
    # four dictionaries returned by parse_gff as a single tuple.
    parsed_tables = parse_gff('/home/guisen/GCF_005281545.1_ASM528154v1_genomic.gff')
    print(parsed_tables)

(defaultdict(<class 'list'>, {'gene-slc25a48': ['rna-XM_033617728.1'], 'gene-glra1': ['rna-XM_033620666.1', 'rna-XM_033620672.1', 'rna-XM_033620678.1', 'rna-XM_033620686.1'], 'gene-g3bp1': ['rna-XM_033629617.1'], 'gene-sparc': ['rna-XM_033625137.1'], 'gene-atox1': ['rna-XM_033620350.1'], 'gene-slc36a1': ['rna-XM_033621851.1', 'rna-XM_033621868.1', 'rna-XM_033621859.1'], 'gene-mtnr1al': ['rna-XM_033617664.1'], 'gene-fat2': ['rna-XM_033619531.1', 'rna-XM_033619522.1'], 'gene-lman2': ['rna-XM_033632889.1', 'rna-XM_033632881.1'], 'gene-LOC117258837': ['rna-XM_033630027.1', 'rna-XM_033630017.1'], 'gene-selenot2': ['rna-XM_033630040.1'], 'gene-LOC117258709': ['rna-XM_033629813.1'], 'gene-rnf4': ['rna-XM_033632698.1'], 'gene-sqstm1': ['rna-XM_033629700.1', 'rna-XM_033629709.1', 'rna-XM_033629718.1'], 'gene-mrnip': ['rna-XM_033629740.1', 'rna-XM_033629731.1'], 'gene-LOC117258627': ['rna-XM_033629678.1', 'rna-XM_033629687.1']}), defaultdict(<class 'list'>, {'rna-XM_033617728.1': ['exon-XM_03361