# This script is used to compute the BIC values of each tree inferred in each model.

In [3]:
# This script is used to compute the BIC values of each tree inferred in each model.

import os
from os.path import join
import math
import csv
import pandas as pd

DIR_WORKING = '/Users/u7875558/Documents/RNAPhylo/allModels_SEED'
DIR_OUTPUTS = join(DIR_WORKING, 'outputs')
DIR_INPUTS = join(DIR_WORKING, 'inputs')
DIR_FASTA = join(DIR_INPUTS, 'fasta_files')

MODELS= [m for m in os.listdir(DIR_OUTPUTS) if m.startswith('S')]

# Number of free parameters:
K_MAP = {
    "DNAtrees": 5 + 3 +1,
    "S6A": 14 + 5 + 1,
    "S6B":  2 + 5 + 1,
    "S6C":  2 + 2 + 1,
    "S6D":  1 + 2 + 1,
    "S6E":  1 + 5 + 1,
    "S7A": 20 + 6 + 1,
    "S7B": 20 + 3 + 1,
    "S7C":  9 + 6 + 1,
    "S7D":  3 + 6 + 1,
    "S7E":  1 + 6 + 1,
    "S7F":  3 + 3 + 1,
    "S16":119 +15 + 1,
    "S16A": 4 +15 + 1,
    "S16B": 0 +15 + 1
}
SIG_RNAS = ['RF00740', 'RF00872', 'RF01038', 'RF02613', 'RF04290']

# regex to pull out
# - final log-likelihood 
# LH_RE = re.compile(r"Final.*Score.*:\s*(-?[0-9]+\.[0-9]+)")

# ────────────────────────────────────────────────────────────────────
def parse_fasta(fi_fasta):
    """
    Return (taxa_count, site_count) for a FASTA alignment
    """
    taxa = 0
    seq_line = ""
    first = False

    with open(fi_fasta) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                taxa += 1
                if not first:
                    first = True
                    seq_line = ""
                elif first:
                    # we hit header for 2nd recrod -- done collecting the first one
                    break
            else:
                if first:
                    seq_line += line
    
    # 'seq_line' now hold the entire first sequence
    return taxa, len(seq_line)

rows = []
for model in K_MAP.keys():
    if model.startswith("DNA"):
        model_dir = join(DIR_OUTPUTS, model)
    else:
        model_dir = os.path.join(DIR_OUTPUTS, model, 'raxmlP_iPseu')
    
    if not os.path.isdir(model_dir):
        continue
    
    for rna in SIG_RNAS:
        rna_dir = join(model_dir, rna)
        if not os.path.isdir(rna_dir):
            continue

        # get taxa and sites from FASTA
        fasta_file = join(DIR_FASTA, f"{rna}.nodup.fa")
        if not os.path.isfile(fasta_file):
            print(f"Missing fasta file for {rna}")
            continue
        n_seq, n_sites = parse_fasta(fasta_file)

        # parse each replicate
        for fn in os.listdir(rna_dir):
            if not fn.startswith("RAxML_info."):
                continue
            seed = fn.split(".")[-1]
            logL = None

            with open(join(rna_dir, fn)) as inf:
                for line in inf.readlines():
                    if line.startswith("Final GAMMA"):
                        logL = float(line.split(" ")[-1])
                    if logL:
                        break

            #if logL is None:
            #    print(f"No log-likelihood is found in {fn}, {rna_dir}")
            #continue

            k_mod = K_MAP.get(model)
            k_br = 2 * n_seq - 3
            k_tot = k_mod + k_br
            bic = -2 * logL + k_tot * math.log(n_sites)

            rows.append({
                "RNA":      rna,
                "model":    model,
                "seed":      seed,
                "logL":     logL,
                "n_seq":   n_seq,
                "n_sites":  n_sites,
                "k_mod":    k_mod,
                "k_br":     k_br,
                "k_tot":    k_tot,
                "BIC":      round(bic, 5)
            })

# build data frame
df = pd.DataFrame(rows)

df

TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'