**2025W 301599-1 Praktikum Bioinformatik**  
Julianne Weiß  
Sarah Strolz  
Sebastian Rossböck
# Comparison of Sequence-Based and Structure-Based RNA Alignments
Sequence-based and structure-aware alignment strategies are compared, and the effect of manual inspection and structure-guided correction on alignment
quality is evaluated.

In [17]:
import sys
import os
#Helper functions for reading FASTA / STOCKHOLM and CLUSTAL
#and for calculating Sum of Pairs Score
def read_fasta_alignment(filename):
    sequences = {}
    current_id = None
    
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                current_id = line[1:].split()[0]
                sequences[current_id] = ""
            elif current_id is not None:
                sequences[current_id] += line
    
    return sequences


def read_clustal_alignment(filename):
    sequences = {}
    
    with open(filename, "r") as f:
        for line in f:
            line = line.rstrip()
            # Skip header, empty lines, and conservation lines
            if not line or line.startswith("CLUSTAL") or line.startswith(" ") or "*" in line or ":" in line:
                continue
            
            parts = line.split()
            if len(parts) >= 2:
                seq_id = parts[0]
                seq_chunk = parts[1]
                
                if seq_id not in sequences:
                    sequences[seq_id] = ""
                sequences[seq_id] += seq_chunk
    
    return sequences


def read_stockholm_alignment(filename):
    sequences = {}
    
    with open(filename, "r") as f:
        for line in f:
            line = line.rstrip()
            # Skip header lines and annotation lines
            if line.startswith("#") or line.startswith("//") or not line:
                continue
            
            parts = line.split()
            if len(parts) >= 2:
                seq_id = parts[0]
                seq = parts[1]
                
                if seq_id not in sequences:
                    sequences[seq_id] = ""
                sequences[seq_id] += seq
    
    return sequences


def calculate_sps(reference_aln, test_aln):
    """
    Calculate Sum-of-Pairs Score (SPS)
    
    SPS = (number of correctly aligned residue pairs) / (total pairs in reference)
    """
    # Find common sequence IDs
    common_ids = set(reference_aln.keys()) & set(test_aln.keys())
    
    if len(common_ids) < 2:
        print(f"Warning: Only {len(common_ids)} common sequences found")
        return 0.0
    
    common_ids = sorted(common_ids)
    
    # Check alignment lengths
    ref_len = len(next(iter(reference_aln.values())))
    test_len = len(next(iter(test_aln.values())))
    
    print(f"Reference alignment length: {ref_len}")
    print(f"Test alignment length: {test_len}")
    print(f"Common sequences: {len(common_ids)}")
    
    # Count correctly aligned pairs
    total_pairs = 0
    correct_pairs = 0
    
    # For each column position in reference
    for ref_col in range(ref_len):
        # For each pair of sequences
        for i in range(len(common_ids)):
            for j in range(i + 1, len(common_ids)):
                seq1_id = common_ids[i]
                seq2_id = common_ids[j]
                
                # Get residues at this column in reference
                ref_res1 = reference_aln[seq1_id][ref_col]
                ref_res2 = reference_aln[seq2_id][ref_col]
                
                # Skip if either is a gap in reference
                if ref_res1 == '-' or ref_res2 == '-':
                    continue
                
                total_pairs += 1
                
                # Find these residues in test alignment
                # We need to find the corresponding non-gap positions
                test_pos1 = find_residue_position(test_aln[seq1_id], reference_aln[seq1_id], ref_col)
                test_pos2 = find_residue_position(test_aln[seq2_id], reference_aln[seq2_id], ref_col)
                
                # Check if they're aligned in test (same column position)
                if test_pos1 is not None and test_pos2 is not None and test_pos1 == test_pos2:
                    correct_pairs += 1
    
    if total_pairs == 0:
        return 0.0
    
    sps = correct_pairs / total_pairs
    print(f"Correct pairs: {correct_pairs}")
    print(f"Total pairs: {total_pairs}")
    
    return sps


def find_residue_position(test_seq, ref_seq, ref_col):
    """
    Find where the residue at ref_col in ref_seq appears in test_seq
    Returns the column position in test_seq, or None if not found
    """
    # Count non-gap characters up to ref_col in reference
    ref_residue_count = 0
    for i in range(ref_col + 1):
        if ref_seq[i] != '-':
            ref_residue_count += 1
    
    # Find the same residue number in test sequence
    test_residue_count = 0
    for test_col in range(len(test_seq)):
        if test_seq[test_col] != '-':
            test_residue_count += 1
            if test_residue_count == ref_residue_count:
                return test_col
    
    return None

def get_sci(filename):
    """Extract Structure Conservation Index from RNAalifold output"""
    with open(filename, "r") as f:
        for line in f:
            if "[sci =" in line:
                # Format: (...) [sci = 0.8020]
                sci_str = line.split("[sci =")[1].split("]")[0].strip()
                return float(sci_str)
    return None

# Extract bit scores from cmsearch output
def extract_bit_scores(filename):
    """Extract bit scores from cmsearch output"""
    bit_scores = []
    
    with open(filename, "r") as f:
        for line in f:
            line_stripped = line.strip()
            
            # Look for lines that start with (number) followed by ! or ?
            if line_stripped.startswith("("):
                parts = line_stripped.split()
                # Format: (rank) ! E-value score ...
                if len(parts) >= 4 and (parts[1] == "!" or parts[1] == "?"):
                    try:
                        # Fourth element is the bit score
                        bit_score = float(parts[3])
                        bit_scores.append(bit_score)
                    except:
                        pass
    
    return bit_scores

In [18]:
#Download seed alignment and all sequences for family
print("-------------------------------------")
print("------------Download Data------------")
print("-------------------------------------")
# Download seed alignment in fasta
!wget https://rfam.org/family/RF01852/alignment/fasta -O RF01852_seed.fa
# Download seed alignment without gap (ungapped) (these will be needed by clustalw and locarna)
!wget https://rfam.org/family/RF01852/alignment/fastau -O RF01852_seed_ungapped.fa
# Download and extract all sequences
!wget https://ftp.ebi.ac.uk/pub/databases/Rfam/15.1/fasta_files/RF01852.fa.gz
!wsl gunzip RF01852.fa.gz

print("cell done")

-------------------------------------
------------Download Data------------
-------------------------------------


--2026-02-09 18:25:17--  https://rfam.org/family/RF01852/alignment/fasta
Resolving rfam.org (rfam.org)... 193.62.193.83
Connecting to rfam.org (rfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15834 (15K) [text/plain]
Saving to: 'RF01852_seed.fa'

     0K .......... .....                                      100% 2,54M=0,006s

2026-02-09 18:25:18 (2,54 MB/s) - 'RF01852_seed.fa' saved [15834/15834]

--2026-02-09 18:25:18--  https://rfam.org/family/RF01852/alignment/fastau
Resolving rfam.org (rfam.org)... 193.62.193.83
Connecting to rfam.org (rfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12757 (12K) [text/plain]
Saving to: 'RF01852_seed_ungapped.fa'

     0K .......... ..                                         100%  892K=0,01s

2026-02-09 18:25:18 (892 KB/s) - 'RF01852_seed_ungapped.fa' saved [12757/12757]

--2026-02-09 18:25:19--  https://ftp.ebi.ac.uk/pub/databases/Rfam/15.1/fasta_file

cell done


gzip: RF01852.fa already exists;	not overwritten


In [19]:
#create the full alignment
# Download the Rfam covariance model
!wsl wget https://rfam.org/family/RF01852/cm -O RF01852_rfam.cm
# Align all sequences to the covariance model to create the full alignment
!wsl cmalign RF01852_rfam.cm RF01852.fa > RF01852_full_alignment.sto

print("cell done")

--2026-02-09 18:25:31--  https://rfam.org/family/RF01852/cm
Resolving rfam.org (rfam.org)... 193.62.193.83
Connecting to rfam.org (rfam.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63575 (62K) [text/plain]
Saving to: â€˜RF01852_rfam.cmâ€™

     0K .......... .......... .......... .......... .......... 80% 1.03M 0s
    50K .......... ..                                         100%  986K=0.06s

2026-02-09 18:25:31 (1.01 MB/s) - â€˜RF01852_rfam.cmâ€™ saved [63575/63575]



cell done


In [20]:
#Create Sequence-based alignments are produced with Clustal W
#Followed by consensus structure prediction using RNAalifold
print("---------------------------------------------------")
print("----------Create Sequence Based Alignment----------")
print("---------------------------------------------------")
#Read the Sequences from the ungapped seed file and save it to a file, to prepare it as input for clustalw
sequences = {}
current_id = None

#read sequenceId / sequence from file
with open("RF01852_seed_ungapped.fa","r") as f:
    for line in f:
        #print(line)
        if line.startswith(">"):
            current_id = line[1:].split()[0]
            sequences[current_id] = ""
        elif current_id is not None:
            sequences[current_id] += line

#print it to make sure we have data (we expect 4-20 sequences)
for seq_id, seq in sequences.items():
    print(f"{seq_id}: {len(seq)} nt")

#write sequences to file
with open("RF01852_for_clustalw.fa", "w") as f:
    for seq_id, seq in sequences.items():
        f.write(f">{seq_id}\n{seq}\n")
        
#Run clustalw
!wsl clustalw -infile=RF01852_for_clustalw.fa -outfile=RF01852_clustalw.aln -output=CLUSTAL

#Use the clustalw alignment as input for RNAalifold to predict consensus structure
# --sci flag for structure conservation index
!wsl RNAalifold --mis --cfactor=0.6 --nfactor=0.5 --sci RF01852_clustalw.aln > RF01852_clustalw_alifold.out

#show RNAalifold output
print("-------------------------------------")
print("----------RNAalifold output----------")
print("-------------------------------------")
with open("RF01852_clustalw_alifold.out","r") as f:
    for line in f:
        print(line)

print("cell done")

---------------------------------------------------
----------Create Sequence Based Alignment----------
---------------------------------------------------
CP000112.1/1531804-1531715: 92 nt
X75790.1/378-471: 96 nt
AM180252.1/236707-236618: 92 nt
CP000527.1/2088048-2088137: 92 nt
AE017285.1/1369596-1369507: 92 nt
CP000473.1/4117336-4117425: 92 nt
CP000859.1/2690342-2690435: 96 nt
CP000252.1/931834-931927: 96 nt
CP000478.1/1085869-1085963: 97 nt
CP001390.1/1963878-1963785: 96 nt
CP000142.2/981495-981588: 96 nt
AE017180.1/909828-909735: 96 nt
CP000698.1/4043694-4043787: 96 nt
AE017143.1/1662251-1662341: 93 nt
AE004439.1/1994469-1994558: 92 nt
AE016827.1/2264473-2264383: 93 nt
CP000851.1/552724-552634: 93 nt
AE014299.1/117643-117553: 93 nt
CP000653.1/86005-85915: 93 nt
X17419.1/71-161: 93 nt
X13994.1/39-129: 93 nt
Y00299.1/1-91: 93 nt
X15993.1/1-91: 93 nt
CP000783.1/4104750-4104660: 93 nt
AM286415.1/4504272-4504182: 93 nt
AE009952.1/4494849-4494759: 93 nt
BX571859.1/132063-132153: 93 nt
S6

109 sequences; length of alignment 106.


In [21]:
#what are we doing here?
print("----------------------------------------------------")
print("----------Create Structure Based Alignment----------")
print("----------------------------------------------------")

#we use locarna to create the structure based alignment
# --consensus-structture alifold means, that it used RNAalifold internally for the consensus structure. This way we can compare it better with the clustalw/RNAalifold approach
# --stockholm means that the output is in the stockholm format, which we will need for infernal
!wsl mlocarna --stockholm --consensus-structure alifold RF01852_seed_ungapped.fa

print("cell done")

----------------------------------------------------
----------Create Structure Based Alignment----------
----------------------------------------------------
mLocARNA --- multiple Local (and global) Alignment of RNA --- LocARNA 2.0.1


ABGA01167771.1/2921-2836   GCCCGGAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGC----------GACAGAGUGGUUCAAUUCCACCUUUGAAG----G
AL031595.4/5352-5437       GCUCGGAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGU----------GACAGAGUGGUUCAAUUCCACCUUUGUAG----G
AY609101.1/307-396         GCCCGRAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGC----------GACAGAGUGGUUCAAUUCCACCUUUCGGGCGGCA
AY609098.1/307-396         GCCCGGAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGC----------GACAGAGUGGUUCAAUUCCACCUUUCGGGCGGCR
AY609120.1/306-395         GCCCGGAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGC----------GACAGAGUGGUUCAAUUCCACCUUUCGGGCGGCG
DQ239472.1/307-396         GCCCGGAUGAUCCUCAGUGGUCUGGGG-UGCAGGCUUCAAACCUGUAGCUGUCU-AGC----------GACAGAGUGGUUCAAUUC

In [22]:
#Now we can compare the two
#We load the reference alignment and create the SPS of the clustalw alignment and the locarna alignment with it.
#to see how well the alignments compare to the reference alignment from rfam

# Load reference alignment (Rfam seed)
print("Loading reference alignment...")
reference = read_fasta_alignment("RF01852_seed.fa")

# Load Clustal W alignment
print("\n--- Clustal W Alignment ---")
clustalw = read_clustal_alignment("RF01852_clustalw.aln")
sps_clustalw = calculate_sps(reference, clustalw)
print(f"SPS for Clustal W: {sps_clustalw:.4f}")

# Load LocARNA alignment
print("\n--- LocARNA Alignment ---")
locarna = read_stockholm_alignment("RF01852_seed_ungapped.out/results/result.stk")
sps_locarna = calculate_sps(reference, locarna)
print(f"SPS for LocARNA: {sps_locarna:.4f}")

# Load SCI values
sci_clustalw = get_sci("RF01852_clustalw_alifold.out")

#We need to run RNAalifold on the locarna alignment to get the SCI (can we do this better?)
!wsl RNAalifold --mis --cfactor=0.6 --nfactor=0.5 --sci RF01852_seed_ungapped.out/results/result.stk > RF01852_locarna_alifold.out

sci_locarna = get_sci("RF01852_locarna_alifold.out")

# Summary
print("RESULTS")
print(f"Clustal W: SPS = {sps_clustalw:.4f}, SCI = {sci_clustalw:.4f}")
print(f"LocARNA:   SPS = {sps_locarna:.4f}, SCI = {sci_locarna:.4f}")

print("cell done")

Loading reference alignment...

--- Clustal W Alignment ---
Reference alignment length: 119
Test alignment length: 106
Common sequences: 109
Correct pairs: 334899
Total pairs: 508597
SPS for Clustal W: 0.6585

--- LocARNA Alignment ---
Reference alignment length: 119
Test alignment length: 102
Common sequences: 109
Correct pairs: 397583
Total pairs: 508597
SPS for LocARNA: 0.7817
RESULTS
Clustal W: SPS = 0.6585, SCI = 0.2095
LocARNA:   SPS = 0.7817, SCI = 0.9062
cell done


109 sequences; length of alignment 102.


In [24]:
#Building the covariance models
print("----------------------------------------------------")
print("---------------Build Covariance Models--------------")
print("----------------------------------------------------")
#Infernal requires Stockholm format with a consensus structure annotation
#locarna already provides this format, but for clustalw, we need to create it.

# Clustal W alignment with consensus structure
def create_stockholm_with_structure(alignment_dict, structure, output_file):
    with open(output_file, "w") as f:
        f.write("# STOCKHOLM 1.0\n")
        f.write(f"#=GF SQ {len(alignment_dict)}\n")
        
        # Write sequences
        for seq_id, seq in alignment_dict.items():
            f.write(f"{seq_id:<30} {seq}\n")
        
        # Write consensus structure
        f.write(f"#=GC SS_cons{' '*19} {structure}\n")
        f.write("//\n")

# Create Stockholm file for Clustal W alignment
#the consensus structure was directly copied from the RNAalifold output above
consensus_structure = "..((((((.....(((.....)))...........(((.......))).............................(((((.......)))))))))))......"
create_stockholm_with_structure(clustalw, consensus_structure, "RF01852_clustalw.stk")

# LocARNA already has Stockholm format with structure, so just copy it
!wsl cp RF01852_seed_ungapped.out/results/result.stk RF01852_locarna.stk

print("cell done")

----------------------------------------------------
---------------Build Covariance Models--------------
----------------------------------------------------
cell done


In [25]:
# Build CM from Clustal W alignment
!wsl cmbuild RF01852_clustalw.cm RF01852_clustalw.stk

# Build CM from LocARNA alignment
!wsl cmbuild RF01852_locarna.cm RF01852_locarna.stk

print("cell done")

# cmbuild :: covariance model construction from multiple sequence alignments
# INFERNAL 1.1.5 (Sep 2023)
# Copyright (C) 2023 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# CM file:                                            RF01852_clustalw.cm
# alignment file:                                     RF01852_clustalw.stk
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#                                                                      rel entropy
#                                                                      -----------
# idx    name                     nseq eff_nseq   alen  clen  bps bifs    CM   HMM description
# ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- -----------
       1 RF01852_clustalw          109     5.96    106    94   17    2 0.608 0.477 
#
# CPU time: 0.19u 0.00s 00:00:00.19 Elapsed: 0

In [33]:
#Prepare test sequences:
# Load the full alignment, ie all known sequences belonging to this family
full_aln = read_stockholm_alignment("RF01852_full_alignment.sto")

print(f"Full alignment: {len(full_aln)} sequences")
print(f"Seed alignment: {len(reference)} sequences")

#we only want the sequences that were not used in the seed. -> they should be independent
seed_ids = set(reference.keys())
full_ids = set(full_aln.keys())
independent_ids = full_ids - seed_ids

print(f"independent_ids: {len(independent_ids)}")

# Write independent sequences to fasta file
with open("RF01852_independent.fa", "w") as f:
    for seq_id in independent_ids:
        seq = full_aln[seq_id].replace("-", "")  # Remove gaps
        seq = seq.replace(".", "")  # Remove gaps
        f.write(f">{seq_id}\n{seq}\n")

print("Created RF01852_independent.fa")

print("cell done")

Full alignment: 4325 sequences
Seed alignment: 109 sequences
independent_ids: 4216
Created RF01852_independent.fa
cell done


In [27]:
#run cmcalibrate, a cm file must be calibrated before it can be used with cmsearch
!wsl cmcalibrate RF01852_clustalw.cm
!wsl cmcalibrate RF01852_locarna.cm

print("cell done")

# cmcalibrate :: fit exponential tails for CM E-values
# INFERNAL 1.1.5 (Sep 2023)
# Copyright (C) 2023 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# CM file:                                     RF01852_clustalw.cm
# number of worker threads:                    4
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Calibrating CM(s):
#
#                        predicted                                                   actual   
#                       running time              percent complete                running time
# model name            (hr:min:sec)  [........25........50........75..........]  (hr:min:sec)
# --------------------  ------------  ------------------------------------------  ------------
#
# Calibration summary statistics:
#
#                           exponential tail fit mu        exponential tail fit lambda         total

In [34]:
#run infernals cmsearch (it searches sequences and tries to find matches to the covariance model)
!wsl cmsearch RF01852_clustalw.cm RF01852_independent.fa > RF01852_clustalw_search.out
!wsl cmsearch RF01852_locarna.cm RF01852_independent.fa > RF01852_locarna_search.out

print("cell done")

cell done


In [35]:
# Extract scores from both searches
clustalw_scores = extract_bit_scores("RF01852_clustalw_search.out")
locarna_scores = extract_bit_scores("RF01852_locarna_search.out")

print("RESULTS\n")

print(f"Clustal W CM: {len(clustalw_scores)} hits")
if clustalw_scores:
    print(f"  Mean bit score: {sum(clustalw_scores)/len(clustalw_scores):.2f}")
    print(f"  Max bit score: {max(clustalw_scores):.2f}")
    print(f"  Min bit score: {min(clustalw_scores):.2f}")
    significant_hits = [s for s in clustalw_scores if s > 50]
    print(f"  Significant hits (>50 bits): {len(significant_hits)}")

print(f"\nLocARNA CM: {len(locarna_scores)} hits")
if locarna_scores:
    print(f"  Mean bit score: {sum(locarna_scores)/len(locarna_scores):.2f}")
    print(f"  Max bit score: {max(locarna_scores):.2f}")
    print(f"  Min bit score: {min(locarna_scores):.2f}")
    significant_hits = [s for s in locarna_scores if s > 50]
    print(f"  Significant hits (>50 bits): {len(significant_hits)}")

print("cell done")

RESULTS

Clustal W CM: 8454 hits
  Mean bit score: 58.45
  Max bit score: 86.50
  Min bit score: 10.20
  Significant hits (>50 bits): 6238

LocARNA CM: 8442 hits
  Mean bit score: 70.79
  Max bit score: 93.30
  Min bit score: 13.80
  Significant hits (>50 bits): 7736
cell done
