## About

This notebook shows how to extract the `INSDC_` mentions (synonyms) from the Sequence Ontology in order to create the items of the feature and qualifier naming config.

In [75]:
import json
import urllib.request

# Offline alternative: read a local copy of the ontology instead of downloading it.
# with open("so.json") as f:
#     content = json.load(f)

# Default: download so.json from the Sequence Ontology repository on GitHub
# and parse it straight from the HTTP response object.
url = "https://raw.githubusercontent.com/The-Sequence-Ontology/SO-Ontologies/master/Ontology_Files/so.json"
with urllib.request.urlopen(url) as response:
    content = json.load(response)

In [21]:
# Node list of the first graph in the loaded document.
xs = content["graphs"][0]["nodes"]

In [22]:
# Inspect which keys the first node carries.
xs[0].keys()

dict_keys(['id', 'meta', 'type', 'lbl'])

In [37]:
def _insdc_synonyms(node):
    """Return the synonym values of ``node`` that start with ``INSDC_``.

    Nodes without a ``meta`` entry, or whose ``meta`` has no ``synonyms``,
    yield an empty list.
    """
    synonyms = node.get("meta", dict()).get("synonyms", ())
    return [syn["val"] for syn in synonyms if syn["val"].startswith("INSDC_")]


# (label, [INSDC_* synonym values]) for every node that has at least one
# such synonym; nodes with none are dropped.
filtered = [
    (node["lbl"], insdc_vals)
    for node in xs
    if (insdc_vals := _insdc_synonyms(node))
]

In [54]:
def _is_feature_only(vals):
    """True when ``vals`` has an ``INSDC_feature:`` entry and no
    ``INSDC_qualifier:`` or ``INSDC_note:`` entry."""
    has_feature = any(v.startswith("INSDC_feature:") for v in vals)
    has_qualifier_or_note = any(
        v.startswith(("INSDC_qualifier:", "INSDC_note:")) for v in vals
    )
    return has_feature and not has_qualifier_or_note


# Split `filtered` in two: entries described by a feature key alone, and
# everything else (kept in the original order of `filtered`).
feature_only = [(name, vals) for (name, vals) in filtered if _is_feature_only(vals)]
rest = [entry for entry in filtered if not _is_feature_only(entry[1])]

In [58]:
# Emit one "[label]" section per feature-only entry, listing its feature keys
# in sorted order and ending each section with a blank line.
feature_only.sort()
for so_label, insdc_vals in feature_only:
    print(f"[{so_label}]")
    feature_keys = [
        val.removeprefix("INSDC_feature:")
        for val in sorted(insdc_vals)
        if val.startswith("INSDC_feature:")
    ]
    for feature_key in feature_keys:
        print(f'feature_key = "{feature_key}"')
    print()

[CDS]
feature_key = "CDS"

[C_gene_segment]
feature_key = "C_region"

[D_gene_segment]
feature_key = "D_segment"

[D_loop]
feature_key = "D-loop"

[J_gene_segment]
feature_key = "J_segment"

[N_region]
feature_key = "N_region"

[STS]
feature_key = "STS"

[S_region]
feature_key = "S_region"

[V_gene_segment]
feature_key = "V_segment"

[V_region]
feature_key = "V_region"

[binding_site]
feature_key = "misc_binding"

[centromere]
feature_key = "centromere"

[exon]
feature_key = "exon"

[five_prime_UTR]
feature_key = "5'UTR"

[gap]
feature_key = "assembly_gap"
feature_key = "gap"

[gene]
feature_key = "gene"

[iDNA]
feature_key = "iDNA"

[intron]
feature_key = "intron"

[mRNA]
feature_key = "mRNA"

[mature_protein_region]
feature_key = "mat_peptide"

[mature_protein_region_of_CDS]
feature_key = "mat_peptide"

[mobile_genetic_element]
feature_key = "mobile_element"

[modified_DNA_base]
feature_key = "modified_base"

[operon]
feature_key = "operon"

[oriT]
feature_key = "oriT"

[origin_of_re

In [72]:
# Qualifier names that are printed as a qualifier_key with an empty
# qualifier_value instead of as a qualifier_value.
QUAL_KEYS_MONO = [
    "ribosomal_slippage",
    "trans_splicing",
]

# Feature keys that are immediately followed by a fixed qualifier_key line.
QUALIFIER_KEY_BY_FEATURE = {
    "regulatory": "regulatory_class",
    "ncRNA": "ncRNA_class",
    "repeat_region": "rpt_type",
    "misc_recomb": "recombination_class",
}

# Emit one "[label]" section per remaining entry. Values are visited in
# sorted order, so the line order inside a section follows the sort order of
# the raw INSDC_* strings.
rest.sort()
for so_label, insdc_vals in rest:
    print(f"[{so_label}]")
    for val in sorted(insdc_vals):
        if val.startswith("INSDC_feature:"):
            feature = val.removeprefix("INSDC_feature:")
            print(f'feature_key = "{feature}"')
            qualifier_key = QUALIFIER_KEY_BY_FEATURE.get(feature)
            # Fall back to the pseudogene qualifier only when no fixed
            # qualifier key is registered for this feature key.
            if qualifier_key is None and (
                "pseudogen" in feature or "pseudogen" in so_label
            ):
                qualifier_key = "pseudogene"
            if qualifier_key is not None:
                print(f'qualifier_key = "{qualifier_key}"')
            continue

        if val.startswith("INSDC_qualifier:"):
            qualifier = val.removeprefix("INSDC_qualifier:")
            if qualifier in QUAL_KEYS_MONO:
                print(f'qualifier_key = "{qualifier}"')
                print('qualifier_value = ""')
            else:
                print(f'qualifier_value = "{qualifier}"')
            continue

        if val.startswith("INSDC_note:"):
            note = val.removeprefix("INSDC_note:")
            print('qualifier_key = "note"')
            print(f'qualifier_value = "{note}"')
            continue

        if val.startswith("INSDC_misc_feature"):
            print('feature_key = "misc_feature"')
            continue

        # Anything else is reported verbatim so it can be handled by hand.
        print(f"unknown ::: {val}")
    print()

[CAAT_signal]
feature_key = "regulatory"
qualifier_key = "regulatory_class"
qualifier_value = "CAAT_signal"

[CAGE_cluster]
feature_key = "misc_feature"
qualifier_key = "note"
qualifier_value = "CAGE_cluster"

[DNaseI_hypersensitive_site]
feature_key = "regulatory"
qualifier_key = "regulatory_class"
qualifier_value = "DNase_I_hypersensitive_site"

[GC_rich_promoter_region]
feature_key = "regulatory"
qualifier_key = "regulatory_class"
qualifier_value = "GC_rich_promoter_region"

[RNase_MRP_RNA]
feature_key = "ncRNA"
qualifier_key = "ncRNA_class"
qualifier_value = "RNase_MRP_RNA"

[RNase_P_RNA]
feature_key = "ncRNA"
qualifier_key = "ncRNA_class"
qualifier_value = "RNase_P_RNA"

[SRP_RNA]
feature_key = "ncRNA"
qualifier_key = "ncRNA_class"
qualifier_value = "SRP_RNA"

[TATA_box]
feature_key = "regulatory"
qualifier_key = "regulatory_class"
qualifier_value = "TATA_box"

[TSS]
feature_key = "misc_feature"
qualifier_key = "note"
qualifier_value = "transcription_start_site"

[X_element_combin