# Check the number of consel output files computed for each RNA -- there should be 7 files in total

In [45]:
from os.path import join 
import os

# Root of the analysis and the locations derived from it.
DIR_WORKING = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/'
DIR_OUTPUTS = join(DIR_WORKING, 'outputs')
MODEL = 'S16B'
DIR_CONSEL = join(DIR_OUTPUTS, 'AU_Test', MODEL)

# Log file scanned further down for the 'not have pseudoknots' messages.
LOG_FILE = '/Users/u7875558/Documents/PhD/RNAPhylo/allModels_SEED/logs/2025-03-21_10-08-12.log'

# Group sub-folders of DIR_CONSEL and the substring that marks each group's
# consel files.
SUBFOLDERS = ['consider_pseudo', 'ignore_pseudo']
suffixes = {
    'consider_pseudo': 'wpseu_consel',
    'ignore_pseudo': 'ipseu_consel',
}

# RNA family IDs flagged as problematic (the name suggests structure-related
# issues -- TODO confirm).
issue_str_rna = [
    'RF00207',
    'RF00390',
    'RF01380',
    'RF01338',
    'RF01047',
    'RF03760',
    'RF03969',
    'RF00976',
    'RF03623',
]

# Check whether the expected consel files were produced
def check_consel_output(dir_consel, groups, suffix_map=None, expected_count=4):
    """Report the RNA folders whose consel output looks incomplete.

    For every group folder under ``dir_consel``, each RNA sub-directory is
    scanned and the files whose name contains the group's suffix are counted.
    An RNA is reported when that count differs from ``expected_count``.

    Args:
        dir_consel: Directory holding one sub-folder per group.
        groups: Names of the group sub-folders to inspect.
        suffix_map: Mapping from group name to the substring that identifies
            a consel file of that group. Defaults to the module-level
            ``suffixes`` dictionary.
        expected_count: Number of matching files every RNA folder must
            contain. Defaults to 4, the value that used to be hard-coded.

    Returns:
        A dict mapping each group to the sorted list of RNA folder names
        whose number of matching files is not ``expected_count``.
    """
    if suffix_map is None:
        suffix_map = suffixes

    prob_rnas = {}
    for group in groups:
        marker = suffix_map[group]
        dir_consel_gr = join(dir_consel, group)
        issue_rnas = []
        for rna in os.listdir(dir_consel_gr):
            consel_rna_path = join(dir_consel_gr, rna)
            # Stray files (e.g. .DS_Store) are not RNA folders; listing them
            # would raise NotADirectoryError.
            if not os.path.isdir(consel_rna_path):
                continue
            consel_files = sum(marker in f for f in os.listdir(consel_rna_path))
            if consel_files != expected_count:
                issue_rnas.append(rna)
        prob_rnas[group] = sorted(issue_rnas)
    return prob_rnas

# Run the check on every group listed in SUBFOLDERS under DIR_CONSEL; as the
# last expression of the cell, the returned dict is displayed.
check_consel_output(DIR_CONSEL, SUBFOLDERS)

{'consider_pseudo': [],
 'ignore_pseudo': ['RF00177', 'RF02345', 'RF02401', 'RF02540', 'RF02842']}

In [24]:
def check_branch_length(diroutput, rna):
    """Flag an RNA when any of its best trees has a branch longer than 1.

    Files in ``diroutput/rna`` whose name starts with ``RAxML_bestTree`` and
    ends with one of the two-digit suffixes ``01`` .. ``10`` are read as
    Newick trees with ``Phylo.read`` (presumably Biopython's ``Bio.Phylo``,
    imported elsewhere -- TODO confirm).

    Args:
        diroutput: Directory containing one sub-directory per RNA.
        rna: Name of the RNA sub-directory to inspect.

    Returns:
        ``rna`` if at least one inspected tree contains a clade whose branch
        length is greater than 1, otherwise ``None``.
    """
    seed_suffixes = tuple(f"{i:02d}" for i in range(1, 11))
    rna_dir = os.path.join(diroutput, rna)

    flagged = []
    for entry in os.listdir(rna_dir):
        is_best_tree = entry.startswith('RAxML_bestTree') and entry.endswith(seed_suffixes)
        if not is_best_tree:
            continue
        parsed = Phylo.read(os.path.join(rna_dir, entry), 'newick')
        # Missing (None) or zero branch lengths are falsy and never flagged.
        if any(node.branch_length and node.branch_length > 1
               for node in parsed.find_clades()):
            flagged.append(entry)

    return rna if flagged else None

def extractAnalysedRNAs(diroutput, log_file):
    """Split the analysed RNAs into accepted / pseudoknot / working sets.

    RNAs are the entries of the last group folder in ``SUBFOLDERS`` under
    ``diroutput``. An RNA is accepted when ``check_branch_length`` does not
    flag it, i.e. none of its inspected trees has a branch length > 1.
    RNAs without pseudoknots are taken from ``log_file``: for every line
    containing ``'not have pseudoknots'`` the sixth whitespace-separated
    token is used as the RNA name.

    Args:
        diroutput: Directory holding the group sub-folders.
        log_file: Path of the log file reporting RNAs without pseudoknots.

    Returns:
        A 5-tuple ``(accepted_rnas, accepted_pseudo, accepted_nopseudo,
        working_rnas, working_pseudo)``: ``accepted_rnas`` is a list in
        directory-listing order, the other four are sets. The two
        ``working_*`` sets additionally exclude the hard-coded lists of RNAs
        with known structure or consel issues.
    """
    # Only the last group folder is inspected; build its path once instead
    # of once per RNA.
    dir_working = join(diroutput, SUBFOLDERS[-1])
    rnas = os.listdir(dir_working)

    # check_branch_length returns the RNA name when the RNA must be rejected.
    accepted_rnas = [
        rna for rna in rnas
        if check_branch_length(dir_working, rna) != rna
    ]

    # Stream the log instead of loading every line into memory.
    with open(log_file, 'r') as f:
        nopseudo_rnas = [
            line.split()[5] for line in f if 'not have pseudoknots' in line
        ]

    accepted_set = set(accepted_rnas)
    accepted_nopseudo = accepted_set & set(nopseudo_rnas)

    # Every listed RNA not reported as pseudoknot-free counts as pseudoknotted.
    pseudo_rnas = set(rnas) - set(nopseudo_rnas)
    accepted_pseudo = accepted_set & pseudo_rnas

    issue_str_rnas = ['RF00207', 'RF00390', 'RF01380', 'RF01338', 'RF01047', 'RF03760', 'RF03969', 'RF00976', 'RF03623']
    issue_consel_rnas = ['RF02401', 'RF00177', 'RF02842', 'RF02540', 'RF02345']
    excluded = set(issue_str_rnas) | set(issue_consel_rnas)
    working_rnas = accepted_set - excluded
    working_pseudo = accepted_pseudo - excluded

    return accepted_rnas, accepted_pseudo, accepted_nopseudo, working_rnas, working_pseudo

# Build the RNA groups once, then report the size of each of the five groups.
analysed_groups = extractAnalysedRNAs(DIR_CONSEL, LOG_FILE)
accepted_rnas, accepted_pseudo, accepted_nopseudo, rnas, pseudo = analysed_groups
print(*(len(group) for group in analysed_groups))

1788 72 1716 1782 71


In [27]:
import pandas as pd
import subprocess

# Function to parse the consel output file using catpv
def parse_consel_output(file_path, rna_id,
                        catpv_path="/Users/u7875558/Documents/PhD/tools/consel/bin/catpv"):
    """Run ``catpv`` on a consel file and collect its result rows.

    The executable is called as ``catpv_path file_path`` and its standard
    output is scanned for data rows: lines that start with ``#`` followed by
    a token beginning with a digit. From each row the first column is kept
    as the rank, the second column is mapped to ``'DNA'`` when it equals
    ``'1'`` and to ``'RNA'`` otherwise, and the fourth column (AU) is stored
    as the p-value.

    Args:
        file_path: Path of the consel file handed to ``catpv``.
        rna_id: Identifier written into the ``RNA`` column of every row.
        catpv_path: Executable to run. Defaults to the previously hard-coded
            location, so existing two-argument calls behave as before.

    Returns:
        A ``pandas.DataFrame`` with the columns ``RNA``, ``rank``, ``item``
        and ``p-value``; it has no rows when no data row was found.
    """
    # The exit status is not checked: a failing catpv yields an empty frame.
    result = subprocess.run(
        [catpv_path, file_path],
        stdout=subprocess.PIPE,
        text=True
    )

    ranks = []
    items = []
    p_values = []

    for line in result.stdout.splitlines():
        line = line.strip()
        if not line.startswith("#"):
            continue
        columns = line[1:].split()
        # Skip bare "#" separators and header lines; indexing them blindly
        # would raise IndexError. A data row needs rank, item, obs and AU.
        if len(columns) < 4 or not columns[0][0].isdigit():
            continue
        ranks.append(columns[0])
        items.append('DNA' if columns[1] == '1' else 'RNA')
        p_values.append(float(columns[3]))  # AU column

    return pd.DataFrame({
        'RNA': [rna_id] * len(items),
        'rank': ranks,
        'item': items,
        'p-value': p_values
    })

# Collect one DataFrame per RNA and concatenate them once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration and, because the first operand starts out empty, makes
# pandas emit a FutureWarning.
result_columns = ['RNA', 'rank', 'item', 'p-value']
per_rna_frames = []

# Process each RNA file
for rna_id in rnas:
    file_path = os.path.join(DIR_CONSEL, "ignore_pseudo", rna_id, f"{rna_id}_ipseu_consel.pv")

    # Check if the file exists before processing
    if os.path.exists(file_path):
        df = parse_consel_output(file_path, rna_id)
        # Empty frames add no rows and would trigger the same warning.
        if not df.empty:
            per_rna_frames.append(df)
    else:
        print(f"File {file_path} not found, skipping.")

if per_rna_frames:
    all_rnas_ipseu_df = pd.concat(per_rna_frames, ignore_index=True)
else:
    # Nothing was parsed: keep the expected columns so later cells still work.
    all_rnas_ipseu_df = pd.DataFrame(columns=result_columns)

# Display the combined DataFrame for all RNAs
all_rnas_ipseu_df

  all_rnas_ipseu_df = pd.concat([all_rnas_ipseu_df, df], ignore_index=True)


          RNA rank item  p-value
0     RF00467    1  RNA    0.507
1     RF00467    2  DNA    0.493
2     RF00711    1  RNA    0.857
3     RF00711    2  DNA    0.143
4     RF00019    1  RNA    0.614
...       ...  ...  ...      ...
3559  RF01014    2  DNA    0.014
3560  RF01084    1  RNA    0.812
3561  RF01084    2  DNA    0.188
3562  RF00231    1  RNA    0.593
3563  RF00231    2  DNA    0.407

[3564 rows x 4 columns]


In [31]:
# Number of rows whose p-value is below 0.05.
int((all_rnas_ipseu_df['p-value'] < 0.05).sum())

189

# 