# Control the number of files with computed consel files
I made two folders: consider_pseudo and ignore_pseudo to separate the cases. In each RNA folder of each group, there should be 4 files.

In [71]:
from os.path import join 
import os

DIR_WORKING = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/'
DIR_OUTPUTS = join(DIR_WORKING, 'outputs')
DIR_DNA = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/outputs/DNAtrees'

MODEL = 'S7E'
DIR_CONSEL = join(DIR_OUTPUTS, 'AU_Test', MODEL)


LOG_FILE='/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/logs/2025-03-21_10-08-12.log'
SUBFOLDERS=['consider_pseudo', 'ignore_pseudo']
suffixes = {'consider_pseudo': 'wpseu_consel', 'ignore_pseudo': 'ipseu_consel'}
issue_str_rna = ['RF00207', 'RF00390', 'RF01380', 'RF01338', 'RF01047', 'RF03760', 'RF03969', 'RF00976', 'RF03623']

# control if the right consel file was produced
def check_consel_output(dir_consel, groups):
    #dir_consel_gr = join(dir_consel, group)
    rnas = os.listdir(DIR_DNA)
    prob_rnas = dict()
    for group in groups:
        issue_rnas = list()
        dir_consel_gr = join(dir_consel, group)
        #os.makedirs(dir_consel_gr, exist_ok=True)
        if os.path.isdir(dir_consel_gr):
            for rna in os.listdir(dir_consel_gr):
                consel_rna_path = join(dir_consel_gr, rna)
                consel_files = 0
                for f in os.listdir(consel_rna_path):
                    if suffixes[group] in f:
                        consel_files += 1
                if consel_files != 4:
                    issue_rnas.append(rna)
            prob_rnas[group] = sorted(issue_rnas)
    return prob_rnas

check_consel_output(DIR_CONSEL, SUBFOLDERS)

{'ignore_pseudo': []}

# Single script -- test single model

In [20]:
import os 
from Bio import Phylo 
from os.path import join 

def check_branch_length(diroutput, rna):
    expected_files = [f"{i:02d}" for i in range(1, 11)]
    dirRNA = os.path.join(diroutput, rna)

    delete_files = list()
    for file_name in os.listdir(dirRNA):
        for seed in expected_files:
            if file_name.startswith('RAxML_bestTree') and file_name.endswith(seed):
                tree_path = os.path.join(dirRNA, file_name)
                tree = Phylo.read(tree_path, 'newick')
                for clade in tree.find_clades():
                    if clade.branch_length and clade.branch_length > 1:
                        delete_files.append(file_name)
    
    if len(delete_files) > 0:
        return rna

def extractAnalysedRNAs(diroutput, log_file):
    # This function produces two sets of accepted RNAs -- RNAs containing pseudoknots and RNAs containing no pseudoknots    
    # extract accepted RNAs which do not have any branch length > 1
    rnas = os.listdir(DIR_DNA)
    
    unaccepted_rnas=list()
    accepted_rnas=list()
    
    for rna in rnas:
        # consider only the outputs from using DNA
        #dir_working = join(diroutput, SUBFOLDERS[1])
        if check_branch_length(DIR_DNA, rna) == rna:
            unaccepted_rnas.append(rna)
        else:
            accepted_rnas.append(rna)

    # extract RNAs containing pseudoknots
    with open(log_file, 'r') as f:
        lines=f.readlines()

    nopseudo_rnas = [line.split()[5] for line in lines if 'not have pseudoknots' in line ]
    accepted_nopseudo=set(accepted_rnas) & set(nopseudo_rnas)
    
    pseudo_rnas = set(rnas)-set(nopseudo_rnas)
    accepted_pseudo = set(accepted_rnas) & set(pseudo_rnas)
    
    issue_str_rnas = ['RF00207', 'RF00390', 'RF01380', 'RF01338', 'RF01047', 'RF03760', 'RF03969', 'RF00976', 'RF03623']
    #issue_consel_rnas = ['RF02401', 'RF00177', 'RF02842', 'RF02540', 'RF02345']
    working_rnas = set(accepted_rnas) - set(issue_str_rnas) #- set(issue_consel_rnas)
    working_pseudo = set(accepted_pseudo) - set(issue_str_rnas) #- set(issue_consel_rnas)
    
    return accepted_rnas, accepted_pseudo, accepted_nopseudo, working_rnas, working_pseudo

accepted_rnas, accepted_pseudo, accepted_nopseudo, rnas, pseudo = extractAnalysedRNAs(DIR_CONSEL, LOG_FILE)
print(len(accepted_rnas), len(accepted_pseudo), len(accepted_nopseudo), len(rnas), len(pseudo))

1796 72 1724 1787 71


In [21]:
import pandas as pd
import subprocess

# Function to parse the consel output file using catpv
def parse_consel_output(file_path, rna_id):
    # Run the catpv command and capture its output
    result = subprocess.run(
        ["/Users/u7875558/tools/consel/bin/catpv", file_path],
        stdout=subprocess.PIPE,
        text=True
    )
    
    # Print the raw output from catpv for debugging
    #print("Raw catpv output:")
    #print(result.stdout)
    
    # Extract the relevant lines from the output
    lines = result.stdout.splitlines()
    
    # Initialize lists to store the extracted data
    ranks=[]
    items = []
    p_values = []
    
    for line in lines:
        line = line.strip()
        # Only process lines that contain data and start with "# " followed by numbers
        if line.startswith("#") and line[1:].strip()[0].isdigit():
            #print(f"Processing line: {line}")  # Debugging print
            columns = line[1:].split()  # Remove leading "# " and split columns
            #print(f"Columns extracted: {columns}")  # Debugging print
            ranks.append(columns[0])

            if columns[1] == '1':
                items.append('DNA')
            else:
                items.append('RNA')
            #items.append(columns[1])  # Item column
            p_values.append(float(columns[3]))  # AU column
    
    # Create a DataFrame with the extracted data
    df = pd.DataFrame({
        'RNA': [rna_id] * len(items),
        'rank': ranks, 
        'item': items,
        'p-value': p_values
    })
    return df

# Initialize an empty DataFrame to store all results
all_rnas_ipseu_df = pd.DataFrame(columns=['RNA', 'rank', 'item', 'p-value'])

# Process each RNA file
for rna_id in rnas:
    file_path = os.path.join(DIR_CONSEL,  "ignore_pseudo", rna_id, f"{rna_id}_ipseu_consel.pv")
    
    # Check if the file exists before processing
    if os.path.exists(file_path):
        df = parse_consel_output(file_path, rna_id)
        all_rnas_ipseu_df = pd.concat([all_rnas_ipseu_df, df], ignore_index=True)
    else:
        print(f"File {file_path} not found, skipping.")

# Display the combined DataFrame for all RNAs
all_rnas_ipseu_df

  all_rnas_ipseu_df = pd.concat([all_rnas_ipseu_df, df], ignore_index=True)


Unnamed: 0,RNA,rank,item,p-value
0,RF03313,1,RNA,0.993
1,RF03313,2,DNA,0.007
2,RF00693,1,RNA,0.763
3,RF00693,2,DNA,0.237
4,RF02756,1,RNA,0.992
...,...,...,...,...
3569,RF00258,2,DNA,0.305
3570,RF00170,1,RNA,0.576
3571,RF00170,2,DNA,0.424
3572,RF00014,1,RNA,0.915


In [22]:
len(all_rnas_ipseu_df[all_rnas_ipseu_df['p-value'] < 0.05]['RNA'].tolist())

183

In [23]:
all_rnas_ipseu_df[all_rnas_ipseu_df['p-value'] < 0.05]

Unnamed: 0,RNA,rank,item,p-value
1,RF03313,2,DNA,0.0070
5,RF02756,2,DNA,0.0080
57,RF04084,2,DNA,0.0180
59,RF02610,2,DNA,0.0300
61,RF00762,2,DNA,0.0110
...,...,...,...,...
3427,RF02796,2,DNA,0.0190
3455,RF04169,2,DNA,0.0140
3477,RF02694,2,DNA,0.0001
3493,RF01481,2,DNA,0.0180


# TEST - Concrete script to create a table of p values from AU TEST

In [45]:
#!/usr/bin/env python3
"""
RF_consel_pvalues_table.py

Extract AU p-values for the DNA tree (ignore pseudoknots) across all S6*/S16* models,
apply FDR correction per model, and output a wide table (RNAs × models).
"""

import os
from os.path import join
import subprocess
import numpy as np
import pandas as pd
from Bio import Phylo
from sklearn.preprocessing import normalize
from statsmodels.stats.multitest import multipletests

# ─── USER PARAMETERS ───────────────────────────────────────────────────────────

# Base working directories
DIR_WORKING    = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED'
DIR_OUTPUTS    = join(DIR_WORKING, 'outputs')
# Directory with DNA trees to list RNAs
DIR_DNA        = join(DIR_OUTPUTS, 'DNAtrees')
# Base CONS EL outputs folder (each model under here)
DIR_CONSEL_BASE = join(DIR_OUTPUTS, 'AU_Test')

# Subfolders under each model in CONS EL outputs
SUBFOLDERS     = ['consider_pseudo', 'ignore_pseudo']
# Suffix mapping for CONSEL files
SUFFIXES       = {
    'consider_pseudo': 'wpseu_consel',
    'ignore_pseudo':   'ipseu_consel'
}
# RNAs to always exclude
ISSUE_RNAS     = [
    'RF00207','RF00390','RF01380','RF01338','RF01047',
    'RF03760','RF03969','RF00976','RF03623'
]
# Replicate seeds for tree filenames
EXPECTED_SEEDS = {f"{i:02d}" for i in range(1, 11)}

# ─── UTILITY FUNCTIONS ────────────────────────────────────────────────────────

def check_branch_length(diroutput, rna):
    """
    Return True if ANY RAxML_bestTree replicate under diroutput/rna has branch_length > 1.
    """
    folder = join(diroutput, rna)
    for fn in os.listdir(folder):
        if fn.startswith('RAxML_bestTree') and fn[-2:] in EXPECTED_SEEDS:
            tree = Phylo.read(join(folder, fn), 'newick')
            if any(cl.branch_length and cl.branch_length > 1 for cl in tree.find_clades()):
                return True
    return False


def get_accepted_rnas():
    """
    List RNAs under DIR_DNA with no overlong branches and not in ISSUE_RNAS.
    """
    accepted = []
    for rna in os.listdir(DIR_DNA):
        path = join(DIR_DNA, rna)
        if not os.path.isdir(path) or rna in ISSUE_RNAS:
            continue
        if not check_branch_length(DIR_DNA, rna):
            accepted.append(rna)
    return sorted(accepted)


def parse_consel_output(pv_path):
    """
    Run catpv on a .pv file and return the AU p-value for the DNA tree item.
    """
    proc = subprocess.run(
        ['/Users/u7875558/tools/consel/bin/catpv', pv_path],
        stdout=subprocess.PIPE,
        text=True
    )
    for line in proc.stdout.splitlines():
        line = line.strip()
        if line.startswith('#') and line[1:].strip()[0].isdigit():
            cols = line[1:].split()
            # item '1' → DNA
            if cols[1] == '1':
                return float(cols[3])
    return np.nan


def check_consel_output(dir_consel, groups):
    """
    Verify each RNA folder under dir_consel/group has exactly 4 .pv files.
    Return dict(group→list of RNA with issues).
    """
    issues = {}
    for group in groups:
        grp_dir = join(dir_consel, group)
        bad = []
        if os.path.isdir(grp_dir):
            for rna in os.listdir(grp_dir):
                rdir = join(grp_dir, rna)
                cnt = sum(1 for f in os.listdir(rdir) if SUFFIXES[group] in f)
                if cnt != 4:
                    bad.append(rna)
        issues[group] = sorted(bad)
    return issues

# ─── MAIN ──────────────────────────────────────────────────────────────────────

# 1) Discover all models under CONS EL base
models = sorted(
    m for m in os.listdir(DIR_CONSEL_BASE)
    if os.path.isdir(join(DIR_CONSEL_BASE, m)) and (m.startswith('S6'))
)

print(f"[INFO] Found models: {models}")

# 2) Optional: check CONS EL file completeness
for model in models:
    mdl_dir = join(DIR_CONSEL_BASE, model)
    issues = check_consel_output(mdl_dir, SUBFOLDERS)
    print(f"[INFO] missing .pv files in model {model}: {issues}")

# 3) Gather raw AU p-values for DNA
records = []
accepted = get_accepted_rnas()
for model in models:
    consel_model = join(DIR_CONSEL_BASE, model, 'ignore_pseudo')
    for rna in accepted:
        pv_file = join(consel_model, rna, f"{rna}_{SUFFIXES['ignore_pseudo']}.pv")
        if not os.path.exists(pv_file):
            continue
        p_raw = parse_consel_output(pv_file)
        records.append({'Model': model, 'RNA': rna, 'p_raw': p_raw})

# 4) Build DataFrame and apply FDR correction by model
df = pd.DataFrame(records)
adjusted = []
for model, grp in df.groupby('Model'):
    rej, p_adj, *_ = multipletests(grp['p_raw'], method='fdr_bh')
    tmp = grp.copy()
    tmp['p_adj'] = p_adj
    adjusted.append(tmp)

df_final = pd.concat(adjusted, ignore_index=True)

# 5) Pivot to wide table (RNAs × models)
df_table = (
    df_final
    .pivot(index='RNA', columns='Model', values='p_adj')
    .sort_index()
)

# 6) Output
print(df_table.head())
out_csv = join(DIR_CONSEL_BASE, f'DNA_AU_pvalues_ignore_pseudo_{MODEL}_models.csv')
df_table.to_csv(out_csv)
print(f"[INFO] Saved table to: {out_csv}")
df_final

[INFO] Found models: ['S6A', 'S6B', 'S6C', 'S6D', 'S6E']
[INFO] missing .pv files in model S6A: {'consider_pseudo': [], 'ignore_pseudo': []}
[INFO] missing .pv files in model S6B: {'consider_pseudo': [], 'ignore_pseudo': []}
[INFO] missing .pv files in model S6C: {'consider_pseudo': [], 'ignore_pseudo': []}
[INFO] missing .pv files in model S6D: {'consider_pseudo': [], 'ignore_pseudo': []}
[INFO] missing .pv files in model S6E: {'consider_pseudo': [], 'ignore_pseudo': []}
Model         S6A       S6B       S6C
RNA                                  
RF00011  0.500867  0.504742  0.504475
RF00014  0.389105  0.504742  0.504475
RF00016  0.500867  0.504742  0.504475
RF00018  0.500867  0.504742  0.504475
RF00019  0.500867  0.504742  0.504475
[INFO] Saved table to: /Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/outputs/AU_Test/DNA_AU_pvalues_ignore_pseudo_S7D_models.csv


Unnamed: 0,Model,RNA,p_raw,p_adj
0,S6A,RF00011,0.288,0.500867
1,S6A,RF00014,0.027,0.389105
2,S6A,RF00016,0.330,0.500867
3,S6A,RF00018,0.268,0.500867
4,S6A,RF00019,0.318,0.500867
...,...,...,...,...
5356,S6C,RF04297,0.291,0.504475
5357,S6C,RF04300,0.405,0.504475
5358,S6C,RF04302,0.332,0.504475
5359,S6C,RF04303,0.332,0.504475


# Concrete script to produce a dataframe containing all p values from AU test of all models.

In [1]:
#!/usr/bin/env python3
"""
RF_consel_pvalues_table.py

Extract AU p-values for the DNA tree (ignore pseudoknots) across all S6*/S16* models,
and assemble a wide table (RNAs × models) of those raw p-values.
"""

import os
from os.path import join
import subprocess
import numpy as np
import pandas as pd
from Bio import Phylo

# ─── USER PARAMETERS ───────────────────────────────────────────────────────────

DIR_WORKING     = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED'
DIR_OUTPUTS     = join(DIR_WORKING, 'outputs')
DIR_DNA         = join(DIR_OUTPUTS, 'DNAtrees')
# CONS EL outputs: outputs/AU_Test/<model>/ignore_pseudo/<RNA>/<RNA>_ipseu_consel.pv
DIR_CONSEL_BASE = join(DIR_OUTPUTS, 'AU_Test')

SUBFOLDER    = 'raxml'
EXPECTED_SEEDS = {f"{i:02d}" for i in range(1, 11)}
SUFFIX_IGNORE = 'ipseu_consel'  # suffix before .pv for ignore_pseudo

# RNAs to exclude
ISSUE_RNAS = [
    'RF00207','RF00390','RF01380','RF01338','RF01047',
    'RF03760','RF03969','RF00976','RF03623'
]

# ─── UTILITY FUNCTIONS ────────────────────────────────────────────────────────

def check_branch_length(dna_dir, rna):
    """
    Return True if any replicate tree under dna_dir/rna has branch_length > 1.
    """
    folder = join(dna_dir, rna)
    for fn in os.listdir(folder):
        if fn.startswith('RAxML_bestTree') and fn[-2:] in EXPECTED_SEEDS:
            tree = Phylo.read(join(folder, fn), 'newick')
            if any(cl.branch_length and cl.branch_length > 1 for cl in tree.find_clades()):
                return True
    return False


def get_accepted_rnas(dna_dir):
    """
    List RNAs in dna_dir with no overlong branches and not in ISSUE_RNAS.
    """
    accepted = []
    for rna in os.listdir(dna_dir):
        path = join(dna_dir, rna)
        if not os.path.isdir(path) or rna in ISSUE_RNAS:
            continue
        if not check_branch_length(dna_dir, rna):
            accepted.append(rna)
    return sorted(accepted)


def parse_consel_output(pv_path):
    """
    Run catpv on a .pv file and return the AU p-value for the DNA tree.
    """
    proc = subprocess.run(
        ['/Users/u7875558/tools/consel/bin/catpv', pv_path],
        stdout=subprocess.PIPE, text=True
    )
    for line in proc.stdout.splitlines():
        line = line.strip()
        if line.startswith('#') and line[1:].strip()[0].isdigit():
            cols = line[1:].split()
            if cols[1] == '1':  # DNA is item '1'
                return float(cols[3])
    return np.nan

# ─── MAIN PROCESS ─────────────────────────────────────────────────────────────

# Discover S6*/S16*/S7* models
models = sorted(
    m for m in os.listdir(DIR_CONSEL_BASE)
    if os.path.isdir(join(DIR_CONSEL_BASE, m)) and ((m.startswith('S')) )
)

accepted_rnas = get_accepted_rnas(DIR_DNA)
records = []

for model in models:
    consel_ignore = join(DIR_CONSEL_BASE, model, 'ignore_pseudo')
    for rna in accepted_rnas:
        pv_file = join(consel_ignore, rna, f"{rna}_{SUFFIX_IGNORE}.pv")
        if not os.path.exists(pv_file):
            continue
        p_raw = parse_consel_output(pv_file)
        records.append({'Model': model, 'RNA': rna, 'p_raw': p_raw})

# Build DataFrame and pivot to wide format
df = pd.DataFrame(records)
df_table = df.pivot(index='RNA', columns='Model', values='p_raw').sort_index()

# Output
def main():
    print(df_table.head())
    out_csv = join(DIR_CONSEL_BASE, 'AU_pvalues_ignore_pseudo_all_models.csv')
    df_table.to_csv(out_csv)
    print(f"Saved raw DNA-tree AU p-values to: {out_csv}")

if __name__ == '__main__':
    main()

Model      S16   S16A   S16B    S6A    S6B    S6C    S6D    S6E    S7A    S7B  \
RNA                                                                             
RF00011  0.222  0.582  0.098  0.288  0.253  0.293  0.246  0.212  0.297  0.259   
RF00014  0.004  0.055  0.032  0.027  0.274  0.295  0.407  0.505  0.056  0.085   
RF00016  0.576  0.184  0.162  0.330  0.277  0.352  0.359  0.257  0.487  0.522   
RF00018  0.242  0.264  0.229  0.268  0.327  0.423  0.393  0.362  0.182  0.211   
RF00019  0.407  0.338  0.227  0.318  0.356  0.320  0.316  0.293  0.351  0.324   

Model      S7C    S7D    S7E  
RNA                           
RF00011  0.107  0.422  0.128  
RF00014  0.157  0.015  0.103  
RF00016  0.154  0.247  0.132  
RF00018  0.153  0.248  0.190  
RF00019  0.327  0.316  0.321  
Saved raw DNA-tree AU p-values to: /Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/outputs/AU_Test/AU_pvalues_ignore_pseudo_all_models.csv
