# **Dataset Creation**

#### **Description**

#### **Imports**

In [1]:
import pandas as pd
# import fireducks.pandas as pd

In [2]:
import obonet
import networkx as nx

## **Primary data**

In [3]:
dataset_nn = pd.read_csv("../../data/2025-08_cdrna_sequences.tsv.gz", sep="\t")

## **Data cleaning**

In [4]:
dataset_nn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20595 entries, 0 to 20594
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   cluster_id        20595 non-null  int64 
 1   version           20595 non-null  object
 2   seqres            20595 non-null  object
 3   seqres_can        20595 non-null  object
 4   upi               20595 non-null  object
 5   md5               20595 non-null  object
 6   rna_type_so       20595 non-null  object
 7   rna_type_insdc    20595 non-null  object
 8   ontology_term_id  20595 non-null  object
 9   entry_entity_id   20595 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.6+ MB


In [5]:
dataset_nn.head()

Unnamed: 0,cluster_id,version,seqres,seqres_can,upi,md5,rna_type_so,rna_type_insdc,ontology_term_id,entry_entity_id
0,0,v1-2,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,URS000080E226,8c29ac94fb1a187b14036d4f9cbc9d83,cytosolic_SSU_rRNA,rRNA,SO:0000650,1fjg_1
1,0,v1-3,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,URS000080E226,8c29ac94fb1a187b14036d4f9cbc9d83,cytosolic_SSU_rRNA,rRNA,SO:0000650,1hnw_1
2,0,v1-4,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,URS000080E226,8c29ac94fb1a187b14036d4f9cbc9d83,cytosolic_SSU_rRNA,rRNA,SO:0000650,1hnx_1
3,0,v1-3,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,URS000080E226,8c29ac94fb1a187b14036d4f9cbc9d83,cytosolic_SSU_rRNA,rRNA,SO:0000650,1hnz_1
4,0,v1-3,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,URS000080E226,8c29ac94fb1a187b14036d4f9cbc9d83,cytosolic_SSU_rRNA,rRNA,SO:0000650,1hr0_1


In [6]:
dataset_nn['seqres_length'] = dataset_nn.seqres_can.str.len()

In [7]:
set().union(*dataset_nn.seqres_can.map(set))

{'A', 'C', 'F', 'G', 'I', 'M', 'N', 'T', 'U', 'W', 'X'}

### **Retaining tRNA**
tRNAs usually carry modified residues and aminoacylated ends, so they are kept even when their canonical sequence contains non-standard symbols.

In [37]:
mask_w_trna = dataset_nn.rna_type_so.str.contains("tRNA")

In [38]:
set().union(*dataset_nn.loc[(mask_w_trna)].seqres_can.map(set))

{'A', 'C', 'F', 'G', 'I', 'M', 'N', 'U', 'W', 'X'}

### **Removing sequences with 'T'**

In [26]:
mask_wo_t = ~dataset_nn.seqres_can.str.contains("T")

### **Removing sequences with non-standard nucleotides**

In [11]:
def contains_nonstandard(nucleotides: set) -> bool:
    """
    Check whether a collection of characters contains non-standard nucleotides.

    Parameters:
                * nucleotides: the nucleotide symbols to inspect. A set is the
                  usual input, but any iterable of single characters (for
                  example the sequence string itself) is accepted.

    Return: True if at least one symbol is not one of 'A', 'C', 'G' or 'U',
            False otherwise (including for an empty input).
    """
    standards = {'A', 'C', 'G', 'U'}
    # set() lets the subset test work for strings and lists as well as sets.
    return not set(nucleotides) <= standards

In [12]:
# Canonical RNA alphabet: a sequence counts as standard when it uses only these symbols
standards = {'A', 'C', 'G', 'U'}

In [13]:
# pd.Series has no 'reduce' method, so the following does not work:
# dataset_nn.seqres_can.reduce(lambda x, y: set(x).union(set(y)))
# Instead, turn every sequence into its set of symbols and union all of them.
nucleotides = set().union(*dataset_nn.seqres_can.map(set))

In [14]:
nucleotides.difference(standards)

{'F', 'I', 'M', 'N', 'T', 'W', 'X'}

In [15]:
for nt in nucleotides:
    print(nt, dataset_nn.loc[dataset_nn.seqres_can.str.contains(nt)].shape)

C (19807, 11)
U (20210, 11)
A (20086, 11)
W (5, 11)
X (495, 11)
G (19989, 11)
N (174, 11)
M (4, 11)
F (6, 11)
T (6, 11)
I (68, 11)


In [40]:
# True for rows whose canonical sequence is built only from standard nucleotides
mask_wo_nonstd = ~(
    dataset_nn.seqres_can
    .map(lambda seq: contains_nonstandard(set(seq)))
)

In [41]:
mask_wo_nonstd.value_counts()

seqres_can
True     19916
False      679
Name: count, dtype: int64

In [42]:
dataset_nn.loc[mask_wo_nonstd].rna_type_insdc.value_counts()

rna_type_insdc
rRNA                   9783
misc_RNA               4298
tRNA                   2958
ncRNA                  1848
ribozyme                243
snRNA                   231
piRNA                   220
sRNA                     96
miRNA                    74
other                    51
hammerhead_ribozyme      37
SRP_RNA                  27
snoRNA                   11
telomerase_RNA           10
RNase_P_RNA               8
tmRNA                     6
pre_miRNA                 5
RNase_MRP_RNA             4
precursor_RNA             4
Y_RNA                     1
antisense_RNA             1
Name: count, dtype: int64

### **Removing mono-poly-nucleotide sequences**
Sequences made of a single repeated nucleotide are removed: most of them are synthetic constructs for modeling, mRNA fragments, regions or RNA motifs.

In [36]:
# True for rows whose canonical sequence is not made of exactly one distinct symbol
mask_wo_mono_poly = dataset_nn.seqres_can.map(lambda seq: len(set(seq)) != 1)

### **Removing sequences with low confidence (more than 10% ambiguous residues, 'N' or 'X')**

In [20]:
def lt_perc_of_nt(series: pd.Series, nts: set, perc: float) -> bool:
    """
    Tell whether the symbols in `nts` account for strictly less than a
    fraction `perc` of the entries of `series`.

    Parameters:
                * series: one entry per residue of a sequence.
                * nts: symbols to count.
                * perc: fraction of the series length used as threshold (e.g. 0.1).

    Return: True if the number of entries found in `nts` is below
            perc * len(series), False otherwise.
    """
    n_hits = series.isin(nts).sum()
    limit = perc * len(series)
    return n_hits < limit

def gt_perc_of_nt(series: pd.Series, nts: set, perc: float) -> bool:
    """
    Tell whether the symbols in `nts` account for strictly more than a
    fraction `perc` of the entries of `series`.

    Parameters:
                * series: one entry per residue of a sequence.
                * nts: symbols to count.
                * perc: fraction of the series length used as threshold (e.g. 0.1).

    Return: True if the number of entries found in `nts` is above
            perc * len(series), False otherwise.
    """
    n_hits = series.isin(nts).sum()
    limit = perc * len(series)
    return n_hits > limit

In [34]:
# Keep rows where at most 10% of the residues are ambiguous ('X' or 'N').
# The count is done with the vectorised .str accessor: same comparison as
# gt_perc_of_nt(..., {"X", "N"}, 0.1) on each sequence, without building one
# pd.Series per row.
mask_wo_low_conf = ~(
    dataset_nn.seqres_can.str.count("[XN]") > 0.1 * dataset_nn.seqres_can.str.len()
)

In [35]:
dataset_nn.loc[~mask_wo_low_conf]

Unnamed: 0,cluster_id,version,seqres,seqres_can,upi,md5,rna_type_so,rna_type_insdc,ontology_term_id,entry_entity_id,seqres_length
9326,31,v1-4,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,5krg_1,15
9327,31,v1-2,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,5l00_1,15
9328,31,v1-6,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,5v0h_1,15
9329,31,v2-0,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,6c8m_1,15
9330,31,v2-0,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,6c8n_1,15
9331,31,v1-3,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCGG,NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,6c8o_1,15
9332,31,v1-2,(LCC)(LCC)(LCC)(LCG)ACUUAAGUCG(GDP),NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,6u6j_1,15
9333,31,v1-2,(LCC)(LCC)(LCC)(LCG)ACUUAAGUC(G46)(G46),NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,7u87_1,15
9334,31,v1-2,(LKC)(LCC)(LCC)(LCG)ACUUAAGUC(G46)(G46),NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,7u88_1,15
9335,31,v1-2,(LKC)(LCC)(LCC)(LCG)ACUUAAGUCG(G46),NNNGACUUAAGUCGG,URS0000A77850,767f33a0637c02b597d1d5d3d515e203,transcript,misc_RNA,SO:0000673,7u89_1,15


### **Cleaning up the dataset**
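All the masks defined above are combined into a single filter; tRNA entries are exempt only from the non-standard-nucleotide filter.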

In [57]:
mask_unified = (
    (mask_wo_nonstd | mask_w_trna)  # standard alphabet only, unless the entry is a tRNA
    & mask_wo_mono_poly             # more than one distinct symbol
    & mask_wo_t                     # no 'T' in the sequence
    & mask_wo_low_conf              # not flagged for too many 'X'/'N'
)

In [58]:
# Rows that pass every filter ...
dataset_nn_clean = dataset_nn.loc[mask_unified].reset_index(drop=True)
# ... and rows rejected by at least one of them (kept for inspection)
removed_nonstd = dataset_nn.loc[~mask_unified].reset_index(drop=True)

In [59]:
removed_nonstd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   cluster_id        1024 non-null   int64 
 1   version           1024 non-null   object
 2   seqres            1024 non-null   object
 3   seqres_can        1024 non-null   object
 4   upi               1024 non-null   object
 5   md5               1024 non-null   object
 6   rna_type_so       1024 non-null   object
 7   rna_type_insdc    1024 non-null   object
 8   ontology_term_id  1024 non-null   object
 9   entry_entity_id   1024 non-null   object
 10  seqres_length     1024 non-null   int64 
dtypes: int64(2), object(9)
memory usage: 88.1+ KB


In [60]:
dataset_nn_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19571 entries, 0 to 19570
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   cluster_id        19571 non-null  int64 
 1   version           19571 non-null  object
 2   seqres            19571 non-null  object
 3   seqres_can        19571 non-null  object
 4   upi               19571 non-null  object
 5   md5               19571 non-null  object
 6   rna_type_so       19571 non-null  object
 7   rna_type_insdc    19571 non-null  object
 8   ontology_term_id  19571 non-null  object
 9   entry_entity_id   19571 non-null  object
 10  seqres_length     19571 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 1.6+ MB


In [65]:
dataset_nn_clean.seqres_can.apply(lambda x: len(set(x)) == 1).value_counts()

seqres_can
False    19571
Name: count, dtype: int64

In [66]:
dataset_nn_clean.seqres_can.str.contains("T").value_counts()

seqres_can
False    19571
Name: count, dtype: int64

### **Getting minimal columns representation**

In [67]:
# Columns kept for the minimal representation of each entry
columns = [
    'cluster_id',
    'seqres_length',
    'rna_type_insdc',
    'rna_type_so',
    'ontology_term_id',
    'md5',
]

In [68]:
dataset_nn_clean = dataset_nn_clean[columns].drop_duplicates().reset_index(drop=True)

In [69]:
dataset_nn_clean.shape

(4810, 6)

In [70]:
dataset_nn_clean.head()

Unnamed: 0,cluster_id,seqres_length,rna_type_insdc,rna_type_so,ontology_term_id,md5
0,0,1522,rRNA,cytosolic_SSU_rRNA,SO:0000650,8c29ac94fb1a187b14036d4f9cbc9d83
1,0,1514,rRNA,cytosolic_SSU_rRNA,SO:0000650,783a3d08855f96a473eb4dbb04681c5f
2,0,1521,rRNA,cytosolic_SSU_rRNA,SO:0000650,49b76b092e1d4fcad47237dd3e9762d5
3,0,1520,rRNA,cytosolic_SSU_rRNA,SO:0000650,b528a320ae1004320d4f8e86364374e2
4,0,1509,rRNA,cytosolic_SSU_rRNA,SO:0000650,4b927c4c94a25e5d6cb5943a400c5f05


### **Sequence Ontology terms selection**

In [71]:
dataset_nn_clean.rna_type_insdc.value_counts()

rna_type_insdc
misc_RNA               2400
rRNA                   1059
tRNA                    559
ncRNA                   430
piRNA                    71
snRNA                    62
ribozyme                 60
sRNA                     57
miRNA                    34
SRP_RNA                  22
other                    16
hammerhead_ribozyme      11
snoRNA                    5
pre_miRNA                 5
RNase_P_RNA               5
precursor_RNA             4
RNase_MRP_RNA             3
tmRNA                     3
telomerase_RNA            2
Y_RNA                     1
antisense_RNA             1
Name: count, dtype: int64

In [72]:
unspecific_types = ['misc_RNA', 'other', 'ncRNA']

In [73]:
seq_w_unspecific_types = set(dataset_nn_clean.loc[dataset_nn_clean.rna_type_insdc.isin(unspecific_types)].md5.unique())

In [74]:
seq_w_1_unspecific_type = set(dataset_nn_clean.loc[dataset_nn_clean.md5.isin(seq_w_unspecific_types)].groupby('md5').filter(lambda x: x.shape[0] == 1).md5.unique())

In [75]:
seq_w_specific_types = set(dataset_nn_clean.loc[~dataset_nn_clean.rna_type_insdc.isin(unspecific_types)].md5.unique())

In [76]:
seq_w_conflict = seq_w_unspecific_types.intersection(seq_w_specific_types)

In [77]:
seq_w_gt1_only_unspecific_types = set(dataset_nn_clean.loc[dataset_nn_clean.md5.isin(seq_w_unspecific_types.difference(seq_w_specific_types))].groupby('md5').filter(lambda x: x.shape[0] > 1).md5.unique())

In [78]:
seq_w_1_specific_type = set(dataset_nn_clean.loc[dataset_nn_clean.md5.isin(seq_w_specific_types)].groupby('md5').filter(lambda x: x.shape[0] == 1).md5.unique())

In [79]:
print(f"Number of sequences with conflicts: {len(seq_w_conflict)}")
# Can't decide which rna type is
print(f"Number of sequences with more than one unspecific type: {len(seq_w_gt1_only_unspecific_types)}")
print(f"Number of sequences with one unspecific type: {len(seq_w_1_unspecific_type)}")
print(f"Number of sequences with one specific type: {len(seq_w_1_specific_type)}")

Number of sequences with conflicts: 73
Number of sequences with more than one unspecific type: 115
Number of sequences with one unspecific type: 1918
Number of sequences with one specific type: 991


In [80]:
# md5 digests annotated with two or more distinct INSDC types once the
# unspecific ones (see `unspecific_types`) are discarded
seq_w_gt1_specific_seen = {
    md5
    for md5, rows in dataset_nn_clean.groupby('md5')
    if len(set(rows.rna_type_insdc.unique()).difference(unspecific_types)) > 1
}

In [81]:
# Can't decide which rna type is
print(f"Number of sequences with more than one specific type: {len(seq_w_gt1_specific_seen)}")
print(f"Number of sequences who can be specifically annotated: {len(seq_w_conflict - seq_w_gt1_specific_seen)}")

Number of sequences with more than one specific type: 13
Number of sequences who can be specifically annotated: 60


In [82]:
for clstr, g in dataset_nn_clean.loc[dataset_nn_clean.md5.isin(seq_w_gt1_specific_seen)].groupby('cluster_id'):
    print(f"cluster: {clstr}\t size: {g.shape[0]}\t types: {sorted(g.rna_type_so.unique())}")

cluster: 86	 size: 16	 types: ['miRNA', 'ncRNA', 'piRNA', 'transcript']
cluster: 93	 size: 16	 types: ['ncRNA', 'piRNA', 'rRNA', 'transcript']
cluster: 272	 size: 16	 types: ['miRNA', 'ncRNA', 'piRNA', 'transcript']
cluster: 701	 size: 16	 types: ['miRNA', 'ncRNA', 'piRNA', 'transcript']
cluster: 757	 size: 16	 types: ['ncRNA', 'piRNA', 'rRNA', 'transcript']
cluster: 989	 size: 16	 types: ['miRNA', 'ncRNA', 'piRNA', 'transcript']
cluster: 997	 size: 9	 types: ['ncRNA', 'piRNA', 'rRNA']
cluster: 1146	 size: 9	 types: ['ncRNA', 'piRNA', 'snRNA']
cluster: 1242	 size: 8	 types: ['ncRNA', 'snRNA']
cluster: 1496	 size: 9	 types: ['ncRNA', 'piRNA', 'rRNA']
cluster: 2942	 size: 20	 types: ['miRNA', 'ncRNA', 'pre_miRNA', 'primary_transcript']
cluster: 3468	 size: 16	 types: ['ncRNA', 'piRNA', 'snRNA', 'transcript']
cluster: 3582	 size: 16	 types: ['miRNA', 'ncRNA', 'piRNA', 'transcript']


In [84]:
dataset_nn_clean.loc[dataset_nn_clean.cluster_id == 3582]

Unnamed: 0,cluster_id,seqres_length,rna_type_insdc,rna_type_so,ontology_term_id,md5
4269,3582,20,miRNA,miRNA,SO:0000276,026b32aa4e223efee37b91ce28038a7b
4270,3582,20,miRNA,miRNA,SO:0000673,026b32aa4e223efee37b91ce28038a7b
4271,3582,20,miRNA,miRNA,SO:0001035,026b32aa4e223efee37b91ce28038a7b
4272,3582,20,miRNA,miRNA,SO:0000655,026b32aa4e223efee37b91ce28038a7b
4273,3582,20,misc_RNA,transcript,SO:0000276,026b32aa4e223efee37b91ce28038a7b
4274,3582,20,misc_RNA,transcript,SO:0000673,026b32aa4e223efee37b91ce28038a7b
4275,3582,20,misc_RNA,transcript,SO:0001035,026b32aa4e223efee37b91ce28038a7b
4276,3582,20,misc_RNA,transcript,SO:0000655,026b32aa4e223efee37b91ce28038a7b
4277,3582,20,piRNA,piRNA,SO:0000276,026b32aa4e223efee37b91ce28038a7b
4278,3582,20,piRNA,piRNA,SO:0000673,026b32aa4e223efee37b91ce28038a7b


#### **Working with OBO file**

In [85]:
so = obonet.read_obo(path_or_file="../../data/so.obo", ignore_obsolete=True, encoding="utf-8")

In [86]:
class NodeBranch():
    def __init__( self
                , node     : str
                , name     : str
                , distance : int
                , path     : list | None ):
        self.node     = node
        self.name     = name
        self.distance = distance
        self.path     = path

    def __repr__(self):
        return f"NodeBranch(\n\tnode={self.node},\n\tname={self.name},\n\tdistance={self.distance},\n\tpath={self.path})"


def get_node_branches(graph: nx.MultiDiGraph, source_node: str, target_nodes: dict) -> list[NodeBranch]:
    """
    Describe the shortest connection between a source term and each target term.

    Edge direction is ignored: paths are searched on an undirected copy of
    `graph`, so a path may traverse edges in either direction.

    Parameters:
                * graph: the ontology graph.
                * source_node: identifier of the term every path starts from.
                * target_nodes: mapping {label: node identifier} of the terms to reach.

    Return: one NodeBranch per target found in `graph`, in the iteration order
            of `target_nodes`. A target that cannot be reached from
            `source_node` gets distance=None and path=None. A target missing
            from `graph` is reported with a printed warning and skipped.

    Raises: networkx.NodeNotFound (from networkx) if `source_node` is not in the graph.
    """
    G_u = graph.to_undirected()  # ignore edge direction so that terms are connected both ways

    paths = []
    for label, node in target_nodes.items():
        if node not in graph:
            # Nothing can be looked up for an unknown node: warn and move on
            # instead of failing on graph.nodes[node] below.
            print(f"[Warning] - Node '{node}' ('{label}') not in the given graph.")
            continue
        try:
            path = nx.shortest_path(G_u, source=source_node, target=node)
        except nx.NetworkXNoPath:
            dist = None
            path = None
        else:
            # Unweighted search: the distance is the number of edges in the path
            dist = len(path) - 1
            path = [(n, graph.nodes[n].get("name", "")) for n in path]
        paths.append(NodeBranch(node, graph.nodes[node].get("name", ""), dist, path))
    return paths


def closest_rna_class(graph: nx.MultiDiGraph, source_node: str, target_nodes: dict) -> NodeBranch | None:
    """
    Pick the target term with the shortest path to `source_node`.

    Parameters:
                * graph: the ontology graph.
                * source_node: identifier of the term paths start from.
                * target_nodes: mapping {label: node identifier} of candidate terms.

    Return: the NodeBranch with the smallest distance (the first one in
            `target_nodes` order on ties), or None when no candidate has a
            known distance.
    """
    # Branches with distance None (no path found) cannot be ranked:
    # comparing None with an int would raise TypeError inside min().
    reachable = [ branch
                  for branch in get_node_branches(graph, source_node, target_nodes)
                  if branch.distance is not None ]

    return min(reachable, key=lambda branch: branch.distance, default=None)


def farthest_rna_class(graph: nx.MultiDiGraph, source_node: str, target_nodes: dict) -> NodeBranch | None:
    """
    Pick the target term with the longest shortest-path to `source_node`.

    Parameters:
                * graph: the ontology graph.
                * source_node: identifier of the term paths start from.
                * target_nodes: mapping {label: node identifier} of candidate terms.

    Return: the NodeBranch with the largest distance (the first one in
            `target_nodes` order on ties), or None when no candidate has a
            known distance.
    """
    # Branches with distance None (no path found) cannot be ranked:
    # comparing None with an int would raise TypeError inside max().
    reachable = [ branch
                  for branch in get_node_branches(graph, source_node, target_nodes)
                  if branch.distance is not None ]

    return max(reachable, key=lambda branch: branch.distance, default=None)

In [87]:
# "ncRNA"               : "SO:0000655"
# "transcript"          : "SO:0000673"
# "piRNA"               : "SO:0001035"
# "mature_transcript"   : "SO:0000233"
# "rRNA"                : "SO:0000252"
# "miRNA_primary_transcript_region" : "SO:0001243"
# "primary_transcript"        : "SO:0000185"
# "primary_transcript_region" : "SO:0000835"
# "pre_miRNA"                 : "SO:0001244"
# "miRNA"                     : "SO:0000276"

# Candidate terms compared in this cell. A previous assignment of `targets`
# (the four identifiers listed just above) was overwritten before being used,
# so only the effective definition is kept.
targets = {   "rRNA": "SO:0000252"
            , "piRNA": "SO:0001035" }

# transcript                    : "SO:0000673"
#       |
#       L primary_transcript    : "SO:0000185"
#               |
#               L mature_transcript: "SO:0000233"
#               |           L ncRNA: "SO:0000655"
#               |           |   L lncRNA | sncRNA | rRNA | sncRNA
#               |           |     circular_ncRNA | enzymatic_RNA | etc
#               |           |
#               |           L mRNA
#               |
#               L primary_transcript_region : "SO:0000835"
#                           L pre_miRNA     : "SO:0001244"
#                                   L miRNA : "SO:0000276"

print(closest_rna_class(graph=so, source_node="SO:0000673", target_nodes=targets))
print()
print(farthest_rna_class(graph=so, source_node="SO:0000673", target_nodes=targets))

NodeBranch(
	node=SO:0000252,
	name=rRNA,
	distance=3,
	path=[('SO:0000673', 'transcript'), ('SO:0000233', 'mature_transcript'), ('SO:0000655', 'ncRNA'), ('SO:0000252', 'rRNA')])

NodeBranch(
	node=SO:0001035,
	name=piRNA,
	distance=5,
	path=[('SO:0000673', 'transcript'), ('SO:0000233', 'mature_transcript'), ('SO:0000655', 'ncRNA'), ('SO:0002247', 'sncRNA'), ('SO:0000370', 'small_regulatory_ncRNA'), ('SO:0001035', 'piRNA')])


In [88]:
# 'mature_transcript'               :  "SO:0000233" # mature_transcript
# 'primary_transcript_region'       :  "SO:0000835" # primary_transcript_region
# 'miRNA_primary_transcript_region' :  "SO:0001243" # miRNA_primary_transcript_region

term_id = "SO:0000673" 
candidates = {    'primary_transcript' : "SO:0000185"   # primary_transcript
                , 'pre_miRNA'          : "SO:0001244"   # pre_miRNA
                , 'miRNA'              : "SO:0000276"   # miRNA
}

print(farthest_rna_class(so, source_node=term_id, target_nodes=candidates))

NodeBranch(
	node=SO:0000276,
	name=miRNA,
	distance=5,
	path=[('SO:0000673', 'transcript'), ('SO:0000233', 'mature_transcript'), ('SO:0000655', 'ncRNA'), ('SO:0002247', 'sncRNA'), ('SO:0000370', 'small_regulatory_ncRNA'), ('SO:0000276', 'miRNA')])


In [89]:
# Show all fields of a term in obo file:
so.nodes['SO:0000276']

{'name': 'miRNA',
 'alt_id': ['SO:0000649'],
 'def': '"Small, ~22-nt, RNA molecule that is the endogenous transcript of a miRNA gene (or the product of other non coding RNA genes). Micro RNAs are produced from precursor molecules (SO:0001244) that can form local hairpin structures, which ordinarily are processed (usually via the Dicer pathway) such that a single miRNA molecule accumulates from one arm of a hairpin precursor molecule. Micro RNAs may trigger the cleavage of their target molecules or act as translational repressors." [PMID:11081512, PMID:12592000]',
 'subset': ['SOFA'],
 'synonym': ['"INSDC_feature:ncRNA" BROAD []',
  '"INSDC_qualifier:miRNA" EXACT []',
  '"micro RNA" EXACT []',
  '"microRNA" EXACT []',
  '"small temporal RNA" EXACT []',
  '"stRNA" EXACT []'],
 'xref': ['http://en.wikipedia.org/wiki/MiRNA "wiki"',
  'http://en.wikipedia.org/wiki/StRNA "wiki"'],
 'is_a': ['SO:0000370'],
 'relationship': ['derives_from SO:0001244']}

In [None]:
# Terms whose 'is_a' field lists SO:0000655 among their parents
isa_children = [
    node_id
    for node_id, attrs in so.nodes(data=True)
    if 'SO:0000655' in attrs.get('is_a', ())
]

In [442]:
from pprint import pprint

In [96]:
# Split the terms in `isa_children` by whether any node has an edge pointing to
# them (obonet draws edges from child to parent, so predecessors are children).
w_children, wo_children = set(), set()
for term in isa_children:
    entry = (term, so.nodes[term].get('name',''))
    has_predecessor = any(True for _ in so.predecessors(term))
    (w_children if has_predecessor else wo_children).add(entry)

print('wo_children')
pprint(wo_children)
print()
print('w_children')
pprint(w_children)

wo_children
{('SO:0000013', 'scRNA'),
 ('SO:0000390', 'telomerase_RNA'),
 ('SO:0000404', 'vault_RNA'),
 ('SO:0000405', 'Y_RNA'),
 ('SO:0000590', 'SRP_RNA'),
 ('SO:0000989', 'class_II_RNA'),
 ('SO:0000990', 'class_I_RNA'),
 ('SO:0001800', 'tasiRNA'),
 ('SO:0001870', 'enhancerRNA'),
 ('SO:0002022', 'priRNA'),
 ('SO:0002031', 'shRNA'),
 ('SO:0002120', 'three_prime_overlapping_ncrna'),
 ('SO:0002291', 'circular_ncRNA'),
 ('SO:0002340', 'RNA_7SK'),
 ('SO:0002352', 'sisRNA'),
 ('SO:0002354', 'sbRNA'),
 ('SO:0002356', 'hpRNA'),
 ('SO:0005843', 'rRNA_cleavage_RNA')}

w_children
{('SO:0000252', 'rRNA'),
 ('SO:0000372', 'enzymatic_RNA'),
 ('SO:0000602', 'guide_RNA'),
 ('SO:0000644', 'antisense_RNA'),
 ('SO:0001877', 'lncRNA'),
 ('SO:0001927', 'telomeric_transcript'),
 ('SO:0002247', 'sncRNA')}


In [95]:
# One record per predecessor of SO:0000655: every field stored on the node plus
# its identifier and its out-/in-degree in the graph.
table = [
    {
        **so.nodes[child],
        "id": child,
        "parents_count": so.out_degree(child),
        "children_count": so.in_degree(child),
    }
    for child in so.predecessors("SO:0000655")
]
# Show the terms with the most incoming edges first
(
    pd.DataFrame(table)
    .sort_values(by="children_count", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,name,def,subset,synonym,is_a,relationship,id,parents_count,children_count,comment,xref,created_by,creation_date,intersection_of
0,lncRNA,"""A non-coding RNA generally longer than 200 nu...",,"[""INSDC_feature:ncRNA"" BROAD [], ""INSDC_qualif...",[SO:0000655],,SO:0001877,1,5,Updated the definition of lncRNA (SO:0001877) ...,[http://www.gencodegenes.org/gencode_biotypes....,kareneilbeck,2012-02-14T05:18:01Z,
1,telomeric_transcript,"""A non-coding transcript derived from the tran...",,"[""telomeric transcript"" EXACT []]",[SO:0000655],,SO:0001927,1,4,,,kareneilbeck,2012-10-31T01:42:15Z,
2,sncRNA,"""A non-coding RNA less than 200 nucleotides in...",,"[""Small noncoding RNA"" EXACT []]",[SO:0000655],,SO:0002247,1,4,Added as per request from GitHub Issue #485 (h...,,david,2020-05-13T11:07:30Z,
3,guide_RNA,"""A short 3'-uridylated RNA that can form a dup...",[SOFA],"[""gRNA"" EXACT [], ""guide RNA"" EXACT [], ""INSDC...",[SO:0000655],,SO:0000602,1,3,,"[http://en.wikipedia.org/wiki/Guide_RNA ""wiki""]",,,
4,rRNA,"""rRNA is an RNA component of a ribosome that c...",[SOFA],"[""INSDC_feature:rRNA"" EXACT [], ""INSDC_qualifi...",[SO:0000655],[derives_from SO:0000209],SO:0000252,2,3,Definition updated 10 June 2021 as part of res...,"[http://en.wikipedia.org/wiki/RRNA ""wiki""]",,,
5,enzymatic_RNA,"""An RNA sequence that has catalytic activity w...",[SOFA],"[""enzymatic RNA"" EXACT []]",[SO:0000655],[has_quality SO:0001185],SO:0000372,2,3,This was moved to be a child of transcript (SO...,,,,
6,antisense_RNA,"""Antisense RNA is RNA that is transcribed from...",[SOFA],"[""antisense RNA"" EXACT [], ""INSDC_feature:ncRN...",[SO:0000655],[derives_from SO:0000645],SO:0000644,2,2,,"[http://en.wikipedia.org/wiki/Antisense_RNA ""w...",,,
7,vault_RNA,"""A family of RNAs are found as part of the eni...",[SOFA],"[""INSDC_feature:ncRNA"" BROAD [], ""INSDC_qualif...",[SO:0000655],,SO:0000404,1,0,,"[http://en.wikipedia.org/wiki/Vault_RNA ""wiki""]",,,
8,scRNA,"""A small non coding RNA sequence, present in t...",[SOFA],"[""INSDC_feature:ncRNA"" BROAD [], ""INSDC_qualif...",[SO:0000655],[derives_from SO:0000012],SO:0000013,2,0,,,,,
9,Y_RNA,"""Y RNAs are components of the Ro ribonucleopro...",[SOFA],"[""INSDC_feature:ncRNA"" BROAD [], ""INSDC_qualif...",[SO:0000655],,SO:0000405,1,0,,"[http://en.wikipedia.org/wiki/Y_RNA ""wiki""]",,,
