In [None]:
import os
import sys 
from Bio import SeqIO, Phylo
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Work from the project data directory so the relative "figures/..." paths
# used by the cells below resolve against it.
os.chdir("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151")

# Create the output directories if they are missing.
# exist_ok=True tolerates directories left by a previous run, while real
# failures (permissions, a regular file in the way) surface instead of being
# hidden by a bare `except`.
dirname_list = ["fasta", "figures", "newick"]
for dirname in dirname_list:
    os.makedirs(dirname, exist_ok=True)

In [None]:
# Write a shuffled copy of every aligned sequence: gap columns ('-') stay where
# they are, and the non-gap characters of the same record are randomly permuted
# among the non-gap positions. Each shuffled record is named "<name>_random".
#
# Both files are opened in a `with` block so they are closed (and the output
# flushed to disk) even if parsing fails part-way through.
with open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/fasta/LTPs132_SSU_aligned.fasta.trim.dereplicated.fractalin.fa", 'r') as inhandle, \
     open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/fasta/LTPs132_SSU_aligned.fasta.trim.dereplicated.fractalin.shuffled.fa", 'w') as outhandle:

    for record in SeqIO.parse(inhandle, 'fasta'):
        sequence        = str(record.seq)
        name            = record.name
        sequence_no_gap = sequence.replace('-', '')

        # A full-length sample without replacement is a random permutation.
        random_char_list_no_gap = random.sample(sequence_no_gap, len(sequence_no_gap))

        # Rebuild the row column by column; collect the pieces in a list and
        # join once instead of growing a string.
        shuffled_chars = []
        for char in sequence:
            if char == '-':
                shuffled_chars.append(char)
            else:
                shuffled_chars.append(random_char_list_no_gap.pop())
        sequence_random = "".join(shuffled_chars)

        outhandle.write(">" + name + "_random" + "\n")
        outhandle.write(sequence_random)
        outhandle.write("\n")

In [None]:
# Reset to matplotlib's default style FIRST. style.use('default') overwrites
# rcParams with the library defaults, so calling it after the assignments
# below (as this cell used to) discarded the font/size/line-width settings.
plt.style.use('default')

# Figure-wide typography and line widths.
mpl.rcParams['font.family']       = 'Helvetica'
mpl.rcParams['font.sans-serif']   = ["Helvetica","Arial","DejaVu Sans","Lucida Grande","Verdana"]
mpl.rcParams['figure.figsize']    = [4,3]
mpl.rcParams['font.size']         = 9
mpl.rcParams["axes.labelcolor"]   = "#000000"
mpl.rcParams["axes.linewidth"]    = 1.0
mpl.rcParams["xtick.major.width"] = 1.0
mpl.rcParams["ytick.major.width"] = 1.0

# Qualitative palettes expanded to explicit RGBA lists (10 and 12 colours).
cmap1 = plt.cm.tab10
cmap2 = plt.cm.Set3
colors1 = [cmap1(i) for i in range(0,10)]
colors2 = [cmap2(i) for i in range(0,12)]

In [None]:
# Load the header-less result table and append three derived columns:
#   Coverage  = Ntips / 13896
#   Accuracy  = 1 - NRFD
#   add_ratio = (Nseq - 13897) / 13897
# NOTE(review): 13896 and 13897 are presumably the tip / sequence counts of the
# noise-free dataset -- confirm and consider naming them as constants.
df_result = pd.read_csv(
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/result/NK_0151_result.csv",
    names=[
        'TASK_ID', 'Method', 'Nseq', 'Subsample', 'Threshold',
        'Memory', 'Mem_unit', 'Time', 'Time_unit', 'Ntips', 'NRFD',
    ],
).assign(
    Coverage=lambda frame: frame['Ntips'] / 13896,
    Accuracy=lambda frame: 1 - frame['NRFD'],
    add_ratio=lambda frame: (frame['Nseq'] - 13897) / 13897,
)
df_result

In [None]:
# Per-method mean of every numeric column for the runs with Threshold == 100000.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# columns (Mem_unit, Time_unit) raises TypeError instead of dropping them.
df_result[df_result['Threshold']==100000].groupby('Method').mean(numeric_only=True)

In [None]:
# One small scatter plot per method: accuracy (%) against the additional noise
# ratio (%), comparing Threshold == 4000 runs (large blue markers) with
# Threshold == 100000 runs (small yellow markers). Saved as one PDF per method.
for method in ['rapidnjNJ', 'raxmlMP', 'fasttreeML']:

    is_method = df_result['Method'] == method
    df_result_original = df_result[is_method & (df_result['Threshold'] == 100000)]
    df_result_fractal  = df_result[is_method & (df_result['Threshold'] == 4000)]

    fig = plt.figure(figsize=(2, 2))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Draw the Threshold == 4000 points first so the smaller markers sit on top.
    for subset, colour, size in (
        (df_result_fractal,  "#88C9D4", 40),
        (df_result_original, "#F8D686", 10),
    ):
        ax.scatter(x=subset['add_ratio'] * 100, y=subset['Accuracy'] * 100, color=colour, s=size)

    ax.set(
        xlim=(-1, 25),
        ylim=(0, 60),
        xlabel="Additional noise ratio (%)",
        ylabel="Accuracy (%)",
        title=method,
    )

    fig.savefig("figures/NK_0151_noise_effect_"+method+".pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Write an iTOL DATASET_SYMBOL annotation that marks every tip whose name ends
# in "_random" in the given tree.
# NOTE(review): this file is named "TASK_33_..." while the other cells read
# "TASK_ID<n>_FRACTALout.nwk" -- confirm the filename is intended.
tree = Phylo.read("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/result/TASK_33_FRACTALout.nwk", 'newick')
with open("figures/for_itol.txt" ,'w') as handle:
    handle.write("DATASET_SYMBOL\nSEPARATOR SPACE\nDATASET_LABEL ko:K00001\nCOLOR #ffff00\nDATA\n")
    for tip in tree.get_terminals():
        name = tip.name
        if (name.split("_")[-1]=='random'):
            # Colour channels are 0-255; the previous value 256 was out of range.
            handle.write(name + " 2 2 " + "rgba(255,200,0,0.8) 1 1 " + name + "\n")

In [None]:
# For each task's tree, find the node whose split (clade below the node vs.
# the rest of the tree) best separates 16S tips from noise tips, and record
# that best ratio as [taskid, ratio] in classification_rate_list.
classification_rate_list = []
for taskid in range(1, 37):
    tree = Phylo.read("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/result/TASK_ID"+str(taskid)+"_FRACTALout.nwk", 'newick')

    # Give every internal node a unique name so it can be used as a dict key.
    for i, node in enumerate(tree.get_nonterminals()):
        node.name = "internal"+str(i)

    # node name -> np.array([Ntips_16S, Ntips_noise]) for the clade below it
    node2Ntips = {}

    # Tips: a name whose last "_"-separated field is 'random' counts as noise.
    for tip in tree.get_terminals():
        is_noise = (tip.name).split("_")[-1]=='random'
        node2Ntips[tip.name] = np.array([0, 1]) if is_noise else np.array([1, 0])

    # Internal nodes: post-order guarantees every child is counted before its
    # parent. Summing over ALL children (not just clades[0] and clades[1])
    # keeps the counts right for multifurcating nodes and avoids an IndexError
    # on single-child nodes.
    for nonterminal in tree.get_nonterminals(order='postorder'):
        node2Ntips[nonterminal.name] = np.sum(
            [node2Ntips[child.name] for child in nonterminal.clades], axis=0
        )

    Ntips_root  = node2Ntips[tree.clade.name]
    Ntips_total = sum(Ntips_root)

    # Score each split: fraction of tips on the "right" side when the clade is
    # called 16S and the remainder noise. max(r, 1 - r) also covers the
    # opposite labelling.
    node_classifyratio_list = []
    for key, Ntip_array in node2Ntips.items():
        Ntip_array_the_other = Ntips_root - Ntip_array
        node_classifyratio   = (Ntip_array[0] + Ntip_array_the_other[1]) / Ntips_total
        node_classifyratio_list.append({"name":key, "ClassificationRatio":max(node_classifyratio, 1 - node_classifyratio)})

    # Only the best score is kept, so max() replaces sort-then-take-last.
    best_ratio = max(entry['ClassificationRatio'] for entry in node_classifyratio_list)
    classification_rate_list.append([taskid, best_ratio])

In [None]:
# Tabulate the [task id, best classification ratio] pairs, one row per task.
df_classification_rate = pd.DataFrame(
    data=classification_rate_list,
    columns=['TASK_ID', 'ClassificationRatio'],
)

In [None]:
# Attach each task's classification ratio to its result row; the default inner
# join keeps only TASK_IDs present in both tables.
df_merged = df_result.merge(df_classification_rate, on='TASK_ID')
df_merged

In [None]:
# One small scatter plot per method: classification rate (%) against the
# additional noise ratio (%), comparing Threshold == 4000 runs (large blue
# markers) with Threshold == 100000 runs (small yellow markers).
for method in ['rapidnjNJ', 'raxmlMP', 'fasttreeML']:

    # Both masks must come from df_merged itself. The first one used to be
    # built from df_result, whose index is not guaranteed to line up with the
    # merged frame (wrong rows selected, or an alignment error).
    df_result_original = df_merged[(df_merged['Method']==method) & (df_merged['Threshold']==100000)]
    df_result_fractal  = df_merged[(df_merged['Method']==method) & (df_merged['Threshold']==4000)]


    fig=plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.scatter(x = df_result_fractal['add_ratio']*100,  y = df_result_fractal ['ClassificationRatio']*100, color = "#88C9D4", s = 40)
    ax.scatter(x = df_result_original['add_ratio']*100, y = df_result_original['ClassificationRatio']*100, color = "#F8D686", s = 10)

    ax.set_xlim(0,25)
    ax.set_ylim(0,105)

    ax.set_xlabel("Additional noise ratio (%)")
    ax.set_ylabel("Classification rate (%)")
    ax.set_title(method)

    # Save and close this specific figure so figures do not accumulate.
    fig.savefig("figures/NK_0151_classification_ratio_"+method+".pdf", bbox_inches = 'tight')
    plt.close(fig)

In [None]:
# tree branch coloring
# For every task: rewrite the tree with named internal nodes, underscore-free
# tip names and unit branch lengths, then emit an iTOL TREE_COLORS file that
# highlights each clade whose split has a noise ratio of exactly 1.
#
# This cell no longer resets classification_rate_list: it never filled it, so
# the reset only wiped the results computed by the classification-rate cell.

# Loop-invariant colormap, created once. `cm` is kept at notebook level
# because the following cell reads it.
cm_name='inferno'# B->G->R
cm = plt.get_cmap(cm_name)

for taskid in range(1, 37):
    tree = Phylo.read("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/result/TASK_ID"+str(taskid)+"_FRACTALout.nwk", 'newick')

    # Name internal nodes; clear confidence values and set branch lengths to 1.
    for i, node in enumerate(tree.get_nonterminals()):
        node.name = "node"+str(i)+"internal"
        node.confidence = None
        node.branch_length = 1
    # Strip underscores from tip names and normalise tips the same way.
    for node in tree.get_terminals():
        node.name = (node.name).replace("_", "")
        node.confidence = None
        node.branch_length = 1

    Phylo.write(tree, "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0151/newick/TASK_ID"+str(taskid)+"_FRACTALout.internal_named.nwk", 'newick')

    # node name -> np.array([Ntips_16S, Ntips_noise]) for the clade below it
    node2Ntips = {}

    # The shuffled sequences were written as "<name>_random"; with underscores
    # removed they end in "random". A suffix test matches the check used in the
    # classification-rate cell and avoids matching 'random' mid-name.
    for tip in tree.get_terminals():
        is_noise = tip.name.endswith('random')
        node2Ntips[tip.name] = np.array([0, 1]) if is_noise else np.array([1, 0])

    # Post-order guarantees children are counted before their parent; summing
    # over all children also handles nodes that are not strictly bifurcating.
    for nonterminal in tree.get_nonterminals(order='postorder'):
        node2Ntips[nonterminal.name] = np.sum(
            [node2Ntips[child.name] for child in nonterminal.clades], axis=0
        )

    # For every parent->child edge, the noise ratio is the larger of the noise
    # fractions on the two sides of the split (child clade vs. rest of tree).
    Ntips_root = node2Ntips[tree.clade.name]
    parent_clade_noiseratio = []
    for clade in tree.get_nonterminals():
        for child in clade.clades:
            Ntip_array            = node2Ntips[child.name]
            Ntip_array_the_other  = Ntips_root - Ntip_array
            noise_ratio           = max(Ntip_array[1] / sum(Ntip_array), Ntip_array_the_other[1] / sum(Ntip_array_the_other))
            parent_clade_noiseratio.append((clade.name, child.name, noise_ratio))

    # Only edges with noise_ratio == 1 are written; the per-edge colormap
    # lookup that used to run here was never used and has been dropped.
    with open("figures/for_itol_TASK_ID"+str(taskid)+".txt",'w') as handle:
        handle.write("TREE_COLORS\nSEPARATOR TAB\nDATASET_LABEL\t"+str(taskid)+"\nCOLOR\t#FF9300\nDATA\n")
        for parent_name, name, noise_ratio in parent_clade_noiseratio:
            if noise_ratio==1:
                handle.write(str(name) + "\tclade\t" + "#F7C845\tnormal\n")

In [None]:
# Re-write the iTOL TREE_COLORS file for the task currently held in `taskid`,
# using an alternative colour scheme.
# NOTE: depends on `taskid`, `parent_clade_noiseratio` and `cm` left behind by
# the branch-colouring cell, so it only touches the last task processed there.
with open("figures/for_itol_TASK_ID"+str(taskid)+".txt",'w') as handle:
    itol_header = "TREE_COLORS\nSEPARATOR TAB\nDATASET_LABEL\t"+str(taskid)+"\nCOLOR\t#ffff00\nDATA\n"
    handle.write(itol_header)

    for parent_name, name, noise_ratio in parent_clade_noiseratio:
        # Map the noise ratio onto the colormap and convert to 0-255 channels.
        # These values are only needed by the disabled per-branch output below.
        color_idx = int(noise_ratio * 255)
        color_vec = cm(color_idx)
        r_idx, g_idx, b_idx = (int(channel * 255) for channel in color_vec[:3])

        if noise_ratio != 1:
            continue
        handle.write(str(name) + "\tclade\t" + "#FF9300\tnormal\n")

        # Disabled alternative: colour each parent|child branch by noise ratio.
        #handle.write(str(parent_name) + "|" + str(name) + "\tbranch\t" + "rgba("+str(r_idx)+","+str(g_idx)+","+str(b_idx)+",1)\tnormal\n")