# **Dataset Creation**

#### **Description**

#### **Imports**

In [1]:
import pandas as pd

## **Primary data**

In [2]:
# Load the raw sequence table from the gzip-compressed, tab-separated file.
dataset_nn = pd.read_csv("../../data/2025-09_cdrna_sequences.tsv.gz", sep="\t")

## **Data cleaning**

In [3]:
# Overview of the columns, their dtypes and their non-null counts.
dataset_nn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cluster_id      4177 non-null   int64 
 1   seqres_can      4177 non-null   object
 2   rna_type_cdrna  4177 non-null   object
dtypes: int64(1), object(2)
memory usage: 98.0+ KB


In [4]:
# Preview the first rows of the raw table.
dataset_nn.head()

Unnamed: 0,cluster_id,seqres_can,rna_type_cdrna
0,0,UUUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGU...,rRNA
1,0,UGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUGC...,rRNA
2,0,UUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUG...,rRNA
3,0,UUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUG...,rRNA
4,0,UUGUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUG...,rRNA


In [5]:
# Store the length (number of characters) of every sequence in a new column.
dataset_nn['seqres_length'] = dataset_nn['seqres_can'].map(len)

In [6]:
# Alphabet of the whole dataset: every distinct character found in any sequence.
set("".join(dataset_nn['seqres_can']))

{'A', 'C', 'F', 'G', 'I', 'M', 'N', 'T', 'U', 'W', 'X'}

### **Removing length outliers**
It is known that eukaryotic **28S rRNA** has a length of around 5000 nt, so a maximum length of 6000 nt is set.

In [7]:
# Number of sequences per length, longest lengths first.
dataset_nn['seqres_can'].str.len().value_counts().sort_index(ascending=False)

seqres_can
19000      1
16770      2
8129       2
5227       2
5070       9
        ... 
14        84
13        89
12       148
11       238
10       442
Name: count, Length: 569, dtype: int64

In [8]:
# True for the sequences that are at most 6000 nt long.
mask_max_length = dataset_nn['seqres_length'].le(6000)

### **Retaining tRNA**
tRNAs usually have modified residues and aminoacylated ends, so they are retained even when their sequences contain non-standard nucleotide codes.

In [9]:
# True for the rows whose RNA type contains "tRNA".
mask_w_trna = dataset_nn['rna_type_cdrna'].str.contains("tRNA")

In [10]:
# Alphabet restricted to the tRNA rows.
set("".join(dataset_nn.loc[mask_w_trna, 'seqres_can']))

{'A', 'C', 'F', 'G', 'I', 'M', 'N', 'U', 'W', 'X'}

### **Removing sequences with 'T'**

In [11]:
# True for the sequences that do not contain the character 'T'.
mask_wo_t = ~dataset_nn['seqres_can'].str.contains("T")

### **Removing sequences with non-standard nucleotides**

In [12]:
def contains_nonstandard(nucleotides: set) -> bool:
    """
    Check whether a collection of characters contains non-standard nucleotides.

    Parameters:
                * nucleotides: a set (or any other iterable, e.g. a sequence
                  string) of one-character nucleotide codes.

    Return: True if at least one character is not one of 'A', 'C', 'G', 'U',
            False otherwise (including for an empty input).
    """
    standards = {'A', 'C', 'G', 'U'}
    # Building a set first lets callers pass a raw string as well as a set.
    return not set(nucleotides).issubset(standards)

In [13]:
# The four standard RNA nucleotides
standards = {'A', 'C', 'G', 'U'}

In [14]:
# pd.Series has no 'reduce' method, so every character is collected by joining
# all the sequences and taking the set of the resulting string.
nucleotides = set("".join(dataset_nn['seqres_can']))

In [15]:
# Characters present in the dataset that are not standard nucleotides.
nucleotides - standards

{'F', 'I', 'M', 'N', 'T', 'W', 'X'}

In [16]:
# Print, for every character of the alphabet, the shape of the sub-table of
# rows whose sequence contains it. Iterating in sorted order keeps the report
# stable between runs (set iteration order is not), and regex=False matches
# the character literally instead of interpreting it as a regular expression.
for nt in sorted(nucleotides):
    print(nt, dataset_nn.loc[dataset_nn.seqres_can.str.contains(nt, regex=False)].shape)

M (1, 4)
F (3, 4)
T (6, 4)
W (3, 4)
C (3845, 4)
A (3934, 4)
I (22, 4)
G (3891, 4)
X (257, 4)
N (109, 4)
U (3902, 4)


In [17]:
# True for the sequences made only of standard nucleotides.
mask_wo_nonstd = ~dataset_nn['seqres_can'].map(
    lambda seq: contains_nonstandard(nucleotides=set(seq))
)

In [18]:
# Sequences with only standard nucleotides (True) versus at least one non-standard one (False).
mask_wo_nonstd.value_counts()

seqres_can
True     3817
False     360
Name: count, dtype: int64

In [19]:
# RNA-type distribution of the sequences made only of standard nucleotides.
dataset_nn.loc[mask_wo_nonstd].rna_type_cdrna.value_counts()

rna_type_cdrna
transcript        2086
rRNA               693
tRNA               325
mRNA               287
riboswitch         128
enzymatic           97
snRNA               44
ncRNA               43
intron              24
SRP_RNA             21
other               18
mixed               15
piRNA               11
miRNA               10
snoRNA               8
tmRNA                3
telomerase_RNA       2
Y_RNA                1
antisense_RNA        1
Name: count, dtype: int64

### **Removing homopolymeric sequences (a single repeated nucleotide)**
Most of them are synthetic constructs for modeling, mRNA fragments, regions or RNA motifs.

In [20]:
# True for the sequences that are not a repetition of one single character.
mask_wo_mono_poly = dataset_nn['seqres_can'].map(lambda seq: len(set(seq)) != 1)

### **Removing sequences with low confidence (more than 10% ambiguous residues, 'N' or 'X')**

In [21]:
def lt_perc_of_nt(series: pd.Series, nts: set, perc: float) -> bool:
    """
    Tell whether the given values make up less than a fraction of a series.

    Parameters:
                * series: a pd.Series with one element per residue.
                * nts: the values to count.
                * perc: the threshold, as a fraction of the series length
                  (e.g. 0.1 for 10%).

    Return: True if the number of elements of `series` found in `nts` is
            strictly lower than `perc` times the series length, False otherwise.
    """
    # bool() turns the numpy.bool_ produced by the comparison into the plain
    # Python bool promised by the annotation.
    return bool(series.isin(nts).sum() < perc * series.shape[0])

def gt_perc_of_nt(series: pd.Series, nts: set, perc: float) -> bool:
    """
    Tell whether the given values make up more than a fraction of a series.

    Parameters:
                * series: a pd.Series with one element per residue.
                * nts: the values to count.
                * perc: the threshold, as a fraction of the series length
                  (e.g. 0.1 for 10%).

    Return: True if the number of elements of `series` found in `nts` is
            strictly greater than `perc` times the series length, False otherwise.
    """
    # bool() turns the numpy.bool_ produced by the comparison into the plain
    # Python bool promised by the annotation.
    return bool(series.isin(nts).sum() > perc * series.shape[0])

In [22]:
# True for the sequences in which at most 10% of the residues are ambiguous
# ('X' or 'N'). The vectorised string methods count the ambiguous characters
# directly, which avoids building one pd.Series per sequence.
mask_wo_low_conf = ~(
    dataset_nn.seqres_can.str.count("[XN]") > 0.1 * dataset_nn.seqres_can.str.len()
)

In [23]:
# Show the sequences flagged as low confidence.
dataset_nn.loc[~mask_wo_low_conf]

Unnamed: 0,cluster_id,seqres_can,rna_type_cdrna,seqres_length
97,31,NNNGACUUAAGUCGG,transcript,15
98,31,UNNGACUUAAGUCGG,transcript,15
276,155,NNAGACUUAAGUCU,transcript,14
1333,1153,NNNGACUUAAGUC,transcript,13
1582,1406,GUAAGAGCCUAGCAUGUAGAANNNNNNNNNNNNNNNNNNNNNNNNN...,intron,113
1957,1799,AUCUAUAAUAGCCUCXUCX,mRNA,19
1987,1829,GGAGGUNNNNNNAUG,mRNA,15
2458,2311,GGAXXXGAGUCC,transcript,12
2462,2315,GGAXXXGAGUCC,transcript,12
2479,2332,XXXXXXUXXUUX,transcript,12


### **Cleaning up the dataset**
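The masks defined above are combined: a sequence is kept if it contains only standard nucleotides or is a tRNA, and it also passes the maximum-length, homopolymer, 'T' and low-confidence filters.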

In [24]:
# A row is kept when it has only standard nucleotides or is a tRNA, and it
# also passes every one of the remaining filters.
mask_unified = (
    (mask_wo_nonstd | mask_w_trna)
    & mask_max_length
    & mask_wo_mono_poly
    & mask_wo_t
    & mask_wo_low_conf
)

In [25]:
# Rows accepted by the unified mask, re-indexed from 0.
dataset_nn_clean = dataset_nn.loc[mask_unified].reset_index(drop=True)
# Rows rejected by the unified mask, re-indexed from 0.
removed_nonstd = dataset_nn.loc[~mask_unified].reset_index(drop=True)

In [26]:
# Summary of the rows rejected by the unified mask.
removed_nonstd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cluster_id      493 non-null    int64 
 1   seqres_can      493 non-null    object
 2   rna_type_cdrna  493 non-null    object
 3   seqres_length   493 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 15.5+ KB


In [27]:
# Summary of the rows kept by the unified mask.
dataset_nn_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3684 entries, 0 to 3683
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cluster_id      3684 non-null   int64 
 1   seqres_can      3684 non-null   object
 2   rna_type_cdrna  3684 non-null   object
 3   seqres_length   3684 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 115.3+ KB


In [28]:
# Sanity check: count the cleaned sequences made of one single distinct character.
dataset_nn_clean['seqres_can'].map(lambda seq: len(set(seq))).eq(1).value_counts()

seqres_can
False    3684
Name: count, dtype: int64

In [30]:
# Sanity check: count the cleaned sequences longer than 6000 nt.
dataset_nn_clean['seqres_length'].gt(6000).value_counts()

seqres_length
False    3684
Name: count, dtype: int64

In [31]:
# Sanity check: count the cleaned sequences that still contain 'T'.
dataset_nn_clean.seqres_can.str.contains("T").value_counts()

seqres_can
False    3684
Name: count, dtype: int64

### **Cherry-picking RNA types**

In [58]:
# RNA-type distribution after cleaning (missing types, if any, are counted too).
dataset_nn_clean.rna_type_cdrna.value_counts(dropna=False)

rna_type_cdrna
transcript        1648
rRNA               681
tRNA               366
mRNA               266
riboswitch         128
enzymatic           82
snRNA               44
ncRNA               43
intron              24
SRP_RNA             21
other               18
mixed               12
piRNA               10
miRNA               10
snoRNA               8
tmRNA                3
telomerase_RNA       2
Y_RNA                1
antisense_RNA        1
Name: count, dtype: int64

In [59]:
# Keep only the RNA types that have more than 5 sequences
dataset_nn_clean = dataset_nn_clean.loc[
    lambda frame: frame['rna_type_cdrna'].map(frame['rna_type_cdrna'].value_counts()).gt(5)
]

In [60]:
# RNA-type distribution after dropping the rare types.
dataset_nn_clean.rna_type_cdrna.value_counts(dropna=False)

rna_type_cdrna
transcript    1648
rRNA           681
tRNA           366
mRNA           266
riboswitch     128
enzymatic       82
snRNA           44
ncRNA           43
intron          24
SRP_RNA         21
other           18
mixed           12
piRNA           10
miRNA           10
snoRNA           8
Name: count, dtype: int64

### **Getting minimal columns representation**

In [32]:
# Columns kept in the exported dataset: the sequence and its RNA type.
columns_to_select = ['seqres_can', 'rna_type_cdrna']

In [61]:
# Keep the selected columns only, drop the repeated rows and re-index from 0.
dataset_nn_clean = (
    dataset_nn_clean.loc[:, columns_to_select]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [62]:
# Summary of the final, deduplicated table.
dataset_nn_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3361 entries, 0 to 3360
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   seqres_can      3361 non-null   object
 1   rna_type_cdrna  3361 non-null   object
dtypes: object(2)
memory usage: 52.6+ KB


In [63]:
# Write the cleaned table as a gzip-compressed, tab-separated file without the index.
dataset_nn_clean.to_csv(
    path_or_buf="../../data/2025-09_cdrna_sequences_clean.tsv.gz",
    sep="\t",
    index=False,
    compression="gzip",
)