# Install / Setup


In [3]:
# load the required python components
import sys
import os
# load the Python Image Library (PIL)
from PIL import Image
import RNA
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# define some python helper functions to make working more convenient
#
# concat multiple images of different size
def get_concat_h_blank(im1, im2, color=(255, 255, 255)):
    """Place two images side by side on a freshly created RGB canvas.

    The canvas is as wide as both images together and as tall as the taller
    one. Both images are pasted flush with the top edge; any canvas area not
    covered by an image keeps the fill ``color``.

    Args:
        im1: Left image; must expose ``width`` and ``height``.
        im2: Right image; must expose ``width`` and ``height``.
        color: Fill colour passed to ``Image.new`` (default white).

    Returns:
        The new combined image.
    """
    canvas_size = (im1.width + im2.width, max(im1.height, im2.height))
    canvas = Image.new('RGB', canvas_size, color)
    # Each entry pairs an image with the x offset of its left edge.
    for x_offset, tile in ((0, im1), (im1.width, im2)):
        canvas.paste(tile, (x_offset, 0))
    return canvas

def get_concat_h_multi_blank(im_list):
    """Concatenate several images horizontally, left to right.

    The images are combined pairwise with ``get_concat_h_blank``. The input
    sequence is copied first, so the caller's list is left untouched (the
    previous implementation popped its first element).

    Args:
        im_list: Iterable of images, in left-to-right order.

    Returns:
        The combined image. When only one image is given, that image is
        returned unchanged.

    Raises:
        IndexError: If ``im_list`` is empty.
    """
    images = list(im_list)  # work on a copy; never consume the caller's list
    if not images:
        raise IndexError("get_concat_h_multi_blank() needs at least one image")
    combined = images[0]
    for im in images[1:]:
        combined = get_concat_h_blank(combined, im)
    return combined
# use as shown
# img3 = get_concat_h_multi_blank([img1, img2])
# img3

# Sanity check: the RNA module imported above is usable; report its version
print('Installed Vienna RNA Package v' + RNA.__version__)

Installed Vienna RNA Package v2.7.0


# Select RNA Families

# Obtain Data

In [4]:
#adapted from example project

# Download seed alignment in fasta
!wsl wget https://rfam.org/family/RF01988/alignment/fasta -O RF01988_seed.fa
# Download seed alignment without gap (ungapped)
!wsl wget https://rfam.org/family/RF01988/alignment/fastau -O RF01988_seed_ungapped.fa
# Download covariance model
!wsl wget https://rfam.org/family/RF01988/cm -O RF01988_rfam.cm
# Download and extract all sequences
!wsl wget https://ftp.ebi.ac.uk/pub/databases/Rfam/15.1/fasta_files/RF01988.fa.gz
!wsl gunzip RF01988.fa.gz

--2026-01-29 11:13:04--  https://rfam.org/family/RF01988/alignment/fasta
Resolving rfam.org (rfam.org)... 193.62.193.83
Connecting to rfam.org (rfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 330 [text/plain]
Saving to: ‘RF01988_seed.fa’

     0K                                                       100% 67.0M=0s

2026-01-29 11:13:04 (67.0 MB/s) - ‘RF01988_seed.fa’ saved [330/330]

--2026-01-29 11:13:05--  https://rfam.org/family/RF01988/alignment/fastau
Resolving rfam.org (rfam.org)... 193.62.193.83
Connecting to rfam.org (rfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 330 [text/plain]
Saving to: ‘RF01988_seed_ungapped.fa’

     0K                                                       100% 63.2M=0s

2026-01-29 11:13:05 (63.2 MB/s) - ‘RF01988_seed_ungapped.fa’ saved [330/330]

--2026-01-29 11:13:05--  https://rfam.org/family/RF01988/cm
Resolving rfam.org (rfam.org)... 19

In [5]:
# Check that the Vienna RNA command-line tools work

# Consensus structure prediction using RNAalifold on the seed alignment
# Enabling option --sci additionally returns the sci value
# (printed as "[sci = ...]" at the end of the structure line)
!wsl RNAalifold --sci --mis --cfactor=0.6 --nfactor=0.5 --aln RF01988_seed.fa

GYGUCUGACACGGCCCWUCGGUUGCAGGUCUGCACCAWUCGGUCGGUAAYGGCGC
(((((..((.(((((.((.(((.(((....)))))).)).)))))))...))))) (-17.26 = -17.30 +   0.04) [sci = 0.8309]


4 sequences; length of alignment 55.


In [6]:
# Check that Infernal works

# Compute Infernal bit scores using cmsearch
# The tool scores each sequence in RF01988.fa using covariance model RF01988_rfam.cm
# Option --tblout stores the output table including bit score in RF01988_rfam.score
!wsl cmsearch --tblout RF01988_rfam.score RF01988_rfam.cm RF01988.fa

# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.5 (Sep 2023)
# Copyright (C) 2023 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file:                         RF01988_rfam.cm
# target sequence database:              RF01988.fa
# tabular output of hits:                RF01988_rfam.score
# number of worker threads:              4
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       SECIS_2  [CLEN=55]
Accession:   RF01988
Description: Selenocysteine insertion sequence 2
Hit scores:
 rank     E-value  score  bias  sequence                           start    end   mdl trunc   gc  description
 ----   --------- ------ -----  --------------------------------- ------ ------   --- ----- ----  -----------
  (1) !   1.9e-22   87.6   0.0  AE006468.2/4527101-4527047             1     55 +  cm    no 0.65  Salmonella ent

In [7]:
# Check that LocARNA works

# Here we run mlocarna for multiple sequence alignment on the ungapped seed sequences and
# store the resulting alignment in Stockholm format, including the consensus structure
!wsl mlocarna --stockholm --consensus-structure alifold RF01988_seed_ungapped.fa

mLocARNA --- multiple Local (and global) Alignment of RNA --- LocARNA 2.0.1


U00096.2/4296977-4296923   GUGUCUGACACGGCCCAUCGGUUGCAGGUCUGCACCAAUCGGUCGGUAAUGGCGC
CU928158.2/4397661-4397715 GCGUCUGACACGGCCCAUCGGUUGCAGGUCUGCACCAGUCGGUCGGUAACGGCGC
ABXW01000042.1/36779-36725 GUGUCUGACACGGCCCUUCGGUUGCAGGUCUGCAGCGUUCGGUCGGUAAUGGCGC
BAAW01013798.1/777-831     GCGUCUGACACGGGCCAUCGGUUGCCGGUCUGCGCCCAUCGGUCGGUGACGGCGC

alifold                    (((((..((.(((((.((.(((.(((....)))))).)).)))))))...))))) (-27.19 = -17.30 +  -9.89)
Results written to target directory /mnt/c/Users/seppo/Bioinformatik_Praktikum_WS25/RF01988_seed_ungapped.out.


# Prepare Sequence Sets

# Generate Alignments

# Predict Consensus Secondary Structures

# Evaluate Alignments Quantitatively

# Manual Error Detection (Professor’s Key Point)

# Correct Alignment Errors

# Structure-Guided Re-alignment

# Build Covariance Models (CMs)

# Evaluate Models Using Bit Scores

# Interpret and Compare Results