In [1]:
import sys
# NOTE(review): hardcoded absolute site-packages path; presumably added so that
# `import RNA` in the next cell resolves on this machine -- confirm, and prefer
# installing the package into the kernel's environment instead.
sys.path.append("/usr/local/lib/python3.10/site-packages/")

In [2]:
import RNA

In [3]:
# Record the version of the RNA package this notebook was run with.
print(RNA.__version__)

2.5.1


In [4]:
# The RNA sequence alignment (three aligned rows)
sequences = [
    "CUGCCUCACAACGUUUGUGCCUCAGUUACCCGUAGAUGUAGUGAGGGU",
    "CUGCCUCACAACAUUUGUGCCUCAGUUACUCAUAGAUGUAGUGAGGGU",
    "---CUCGACACCACU---GCCUCGGUUACCCAUCGGUGCAGUGCGGGU",
]

# Consensus sequence of the alignment
cons = RNA.consensus(sequences)

# Predict the Minimum Free Energy (MFE) and the corresponding secondary
# structure for the alignment as a whole
ss, mfe = RNA.alifold(sequences)

# Print the consensus on one line, then the structure followed by its energy
print("{}\n{} [ {:6.2f} ]".format(cons, ss, mfe))

CUGCCUCACAACAUUUGUGCCUCAGUUACCCAUAGAUGUAGUGAGGGU
...((((((.(((((((((...........))))))))).)))))).. [ -10.86 ]


In [5]:
import numpy as np
import pandas as pd

In [6]:
# Record the numpy and pandas versions this notebook was run with.
print(np.__version__)
print(pd.__version__)

1.24.3
2.0.2


In [7]:
def get_sequence_from_pattern(pattern: str, rng=None) -> str:
    """
    Generate one random RNA sequence that follows a degenerate pattern.

    Every character of ``pattern`` is replaced by one base picked at random
    (``choice`` with no weights) from that character's pool:
    R=(A, G), Y= (U, C), and N= (A, C, G, U).

    Example pattern:
    PT1: RYRY-4*N-3*(RY)-3*N-3*(RY)-4*N-3*(RY)-3*N-RYRY
    which is passed in fully expanded, without dashes or multipliers
    (i.e. "RYRYNNNNRYRYRY...").

    Parameters
    ----------
    pattern : str
        String consisting only of the characters "R", "Y" and "N".
        An empty pattern yields an empty sequence.
    rng : object with a ``choice`` method, optional
        Source of randomness, e.g. ``np.random.default_rng(seed)`` or a
        ``np.random.RandomState``. When omitted, NumPy's global ``np.random``
        state is used, exactly as before this parameter existed.

    Returns
    -------
    str
        Random sequence with the same length as ``pattern``.

    Raises
    ------
    KeyError
        If ``pattern`` contains a character other than "R", "Y" or "N".
    """
    pools = {"R": ["A", "G"], "Y": ["U", "C"], "N": ["A", "C", "G", "U"]}

    # Validate up front so the caller gets a readable message (and no random
    # numbers are consumed) instead of a bare KeyError from the lookup below.
    unknown = sorted(set(pattern) - set(pools))
    if unknown:
        raise KeyError(
            "Unsupported pattern character(s) {}; allowed characters are {}".format(
                unknown, sorted(pools)
            )
        )

    chooser = np.random if rng is None else rng

    # One draw per pattern character, in order; str() turns the NumPy string
    # scalar returned by choice() into a plain Python str before joining.
    return "".join(str(chooser.choice(pools[character])) for character in pattern)
    
    

In [30]:
# Degenerate 40-character templates, written segment by segment so the
# R/Y blocks and the N spacers are visible. Adjacent string literals are
# concatenated by the parser, so each value is a single plain string.
pattern1 = (
    "RYRY" "NNNN" "RYRYRY" "NNN" "RYRYRY" "NNNN" "RYRYRY" "NNN" "RYRY"
)
pattern2 = (
    "RRYY" "NNN" "RRYY" "NNNN" "RRYY" "NN" "RRYY" "NNNN" "RRYY" "NNN" "RRYY"
)
pattern3 = (
    "RRYY" "NNN" "RRRYYY" "NNNN" "RRRYYY" "NNNN" "RRRYYY" "NNN" "RRYY"
)
pattern4 = (
    "RRYY" "NNN" "RYRYRY" "NNNN" "RYRYRY" "NNNN" "RYRYRY" "NNN" "RYRY"
)

In [31]:
# Sanity check: draw and show a single random sequence for pattern1.
print(get_sequence_from_pattern(pattern1))

GUGUGCCUAUAUAUGGAGUGUGCAGAGGUGCGUUUGAUGU


In [32]:
# Draw 100 random sequences per template, in the order pattern1 .. pattern4
# (all pattern1 draws first, then pattern2, and so on).
# NOTE(review): no random seed is set in the cells visible here, so each run
# produces a different pool -- confirm whether that is intended.
sequences = [
    get_sequence_from_pattern(template)
    for template in (pattern1, pattern2, pattern3, pattern4)
    for _ in range(100)
]

In [33]:
# Peek at the first ten generated sequences.
print(sequences[:10])

['ACGCCUACACGCGCACUAUACGCCGCCACACGCAUUAUAC', 'GUACAAGCAUGCGCUCUACGUACUCGAACGCAUACAACGC', 'GCGCGCUGACGCGUUAGGUGCAUCUGGAUGCACGACACAU', 'AUGCCUCGGCGCGCUAUGCAUAUUGUGGUAUAUAGGGCAC', 'GUAUCACCAUGUGCAGUGUGCGCCCGCGCGUAUAAGGUAU', 'AUAUCUUUGUGUGUAGUACACGCAAUGGCAUACGAAGUGU', 'ACGCCAGGGCGCGUCUUGCACAUUUGAGUGUGCUUUACGU', 'ACGUGUUCGUGCGUGAGGUAUACGUUUAUGUGUCCUGCAC', 'GCAUCGGGACGUACUCGGCGUAUGUCCACGCGCAGGACGU', 'ACACGUGGACAUGUUGGAUACACCCUGGUAUACUAAAUGU']


In [34]:
# Start with empty containers for the per-sequence folding results:
# predicted structures and their free energies.
secondary_structure, minimum_free_energies = [], []

In [35]:
# Fold every sequence once; each RNA.fold result unpacks into
# (secondary structure, minimum free energy), as in the original loop.
fold_results = [RNA.fold(seq) for seq in sequences]

# Rebuild both lists from scratch rather than appending to lists created in
# another cell: re-running this cell then cannot duplicate entries, so the
# lists always have exactly one item per sequence.
secondary_structure = [structure for structure, _ in fold_results]
minimum_free_energies = [energy for _, energy in fold_results]

In [36]:
# Peek at the first ten predicted structures and their free energies.
print(secondary_structure[:10])
print(minimum_free_energies[:10])

['.(((......)))........((........)).......', '........(((((.((.........)).))))).......', '(((((....)))))...((((((....)))))).......', '..((....))((.(((((.((......)).))))).))..', '.....(((.(((((.(((((....)))))))))).)))..', '....((((((((((.((.......)).))))))))))...', '.(((....)))(((...((((((....))))))...))).', '........((((...(((.((((......)))))))))))', '((..((((((((((.....)))))))).)).)).......', '........((((.((((((((......)))).))))))))']
[-3.0999999046325684, -5.5, -12.899999618530273, -9.699999809265137, -12.0, -9.600000381469727, -11.699999809265137, -9.0, -14.600000381469727, -5.599999904632568]


In [37]:
# Column name -> list of per-sequence values
data = {
    "Secondary Structure": secondary_structure,
    "Free Energy": minimum_free_energies,
}

# One row per sequence, with the sequence string itself as the row label
all_sequences = pd.DataFrame(data=data, index=sequences)

In [38]:
# (rows, columns) of the full table: one row per generated sequence.
print(all_sequences.shape)

(400, 2)


In [None]:
"""
The cutoff energy for sequence selection was chosen as ΔG_mean − SD = −9.5 kcal/mol, 
based on values for the random sequences, which is higher than the mean energy 
value for the patterned sequences.
"""

In [39]:
# Keep only rows whose free energy is strictly below the -9.5 cutoff.
is_below_cutoff = all_sequences["Free Energy"] < -9.5
filtered_sequences = all_sequences.loc[is_below_cutoff]

In [40]:
# (rows, columns) of the table after the free-energy filter.
print(filtered_sequences.shape)

(147, 2)


In [41]:
# Show the first five rows of the energy-filtered table.
display(filtered_sequences.head(5))

Unnamed: 0,Secondary Structure,Free Energy
GCGCGCUGACGCGUUAGGUGCAUCUGGAUGCACGACACAU,(((((....)))))...((((((....)))))).......,-12.9
AUGCCUCGGCGCGCUAUGCAUAUUGUGGUAUAUAGGGCAC,..((....))((.(((((.((......)).))))).))..,-9.7
GUAUCACCAUGUGCAGUGUGCGCCCGCGCGUAUAAGGUAU,.....(((.(((((.(((((....)))))))))).)))..,-12.0
AUAUCUUUGUGUGUAGUACACGCAAUGGCAUACGAAGUGU,....((((((((((.((.......)).))))))))))...,-9.6
ACGCCAGGGCGCGUCUUGCACAUUUGAGUGUGCUUUACGU,.(((....)))(((...((((((....))))))...))).,-11.7


In [None]:
"""
Therefore, we set a constraint that the secondary structure of our sequences with 40 
bases should have at least 11 unpaired bases. This number seemed optimal, since a 
higher number of unpaired bases will significantly reduce the presence of sequences 
with high secondary structure free energy while a lower number will reduce the 
potential sites for ligand binding.

Based on these findings, we developed a set of criteria that limited the presence of 
sequences with abundant simple structural motifs and maximized the presence of stable 
low-energy structures. For the pool of 40-mer sequences these criteria are as follows [4]: 
(1) in the lowest energy conformation bases 1 and 2 form pairs with bases 40 and 39; 
(2) the free energy of secondary structure formation is less than −9.5 kcal/mol; 
(3) there are at least 11 bases that do NOT form Watson–Crick pairs. Only the sequences 
    that passed the selection criteria were forwarded to the next step in the selection process.
"""