In [None]:
from Bio import Phylo, SeqIO
from matplotlib import pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import statistics as stat
import pandas as pd
import seaborn as sns
import copy
import matplotlib.colors as colors
from matplotlib.colors import LinearSegmentedColormap
import gzip

In [None]:
# Reset to matplotlib's default style FIRST. style.use('default') restores the
# default rcParams, so calling it after the assignments below (as was done
# before) silently discarded every customisation made in this cell.
plt.style.use('default')

# Notebook-wide figure defaults.
mpl.rcParams['font.family']       = 'Helvetica'
mpl.rcParams['font.sans-serif']   = ["Helvetica","Arial","DejaVu Sans","Lucida Grande","Verdana"]
mpl.rcParams['figure.figsize']    = [4,3]
mpl.rcParams['font.size']         = 9
mpl.rcParams["axes.labelcolor"]   = "#000000"
mpl.rcParams["axes.linewidth"]    = 1.0
mpl.rcParams["xtick.major.width"] = 1.0
mpl.rcParams["ytick.major.width"] = 1.0

# Discrete palettes sampled from the tab10 (10 colours) and Set3 (12 colours) colormaps.
cmap1 = plt.cm.tab10
cmap2 = plt.cm.Set3
colors1 = [cmap1(i) for i in range(0,10)]
colors2 = [cmap2(i) for i in range(0,12)]

def generate_cmap(colors):
    """Build a LinearSegmentedColormap named 'custom_cmap' from a sequence of colours.

    The i-th colour is anchored at position i / (len(colors) - 1), so the
    colours are evenly spaced over [0, 1].

    Parameters
    ----------
    colors : sequence
        Colour specifications, passed through to
        ``LinearSegmentedColormap.from_list``.

    Returns
    -------
    LinearSegmentedColormap
    """
    last_index = int(np.max(range(len(colors))))
    anchors = [
        (position * 1.0 / last_index, color)
        for position, color in enumerate(colors)
    ]
    return LinearSegmentedColormap.from_list('custom_cmap', anchors)

In [None]:
# Work from the project data directory so that the relative "figures/..."
# output paths used by the plotting cells resolve there.
os.chdir("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148")

# Create the output directory if it is missing. exist_ok=True tolerates an
# existing directory without hiding unrelated failures (e.g. permission
# errors), which the previous bare ``except`` swallowed.
os.makedirs("figures", exist_ok=True)

In [None]:
# L: accuracy and coverage as a function of the length L

def L_read_data(csvfile):
    """Load a header-less result CSV and derive percentage columns.

    The file is read with fifteen fixed column names (see ``column_names``),
    sorted by ``FRACTAL_ID``, and extended with:

    * ``Coverage`` = Ntips / Nseq * 100
    * ``Accuracy`` = (1 - NRFD) * 100

    Parameters
    ----------
    csvfile : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    column_names = [
        "FRACTAL_ID", "SIM_ID", "Method", "Mode", "Nseq",
        "mu", "alpha", "L", "sigma",
        "Memory", "Memory_unit", "Run time", "Time_unit",
        "Ntips", "NRFD",
    ]
    table = pd.read_csv(csvfile, names=column_names).sort_values("FRACTAL_ID")
    return table.assign(
        Coverage=lambda frame: frame["Ntips"] / frame["Nseq"] * 100,
        Accuracy=lambda frame: (1 - frame["NRFD"]) * 100,
    )

# For each reconstruction method: compare accuracy with and without FRACTAL as a
# function of L (upper panel) and show FRACTAL's coverage (lower panel).
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    data_fractal = L_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/L/result."+method+"_fractal.csv")
    data_original = L_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/L/result."+method+"_original.csv")

    # Pair the two runs on L; pandas suffixes the clashing 'Accuracy' columns with _x/_y.
    df_merged = pd.merge(data_fractal.loc[:,['L','Accuracy']], data_original.loc[:,['L','Accuracy']], on = 'L')
    df_merged = df_merged.rename(columns = {'Accuracy_x':'Accuracy_fractal', 'Accuracy_y':'Accuracy_original'})
    df_merged['Accuracy_diff'] = df_merged['Accuracy_fractal'] - df_merged['Accuracy_original']

    # max() over an empty selection raises ValueError, so report "none" when
    # FRACTAL is never more than 5 points worse than the original.
    lengths_worse = df_merged.loc[df_merged['Accuracy_diff'] < -5, 'L']
    print(method, "The maximum length for which the accuracy of FRACTAL was >5% smaller than original:", lengths_worse.max() if len(lengths_worse) else "none")

    # Upper panel: accuracy scatter; the x axis is hidden because the lower panel carries it.
    fig = plt.figure(figsize=(3,3))
    ax = fig.add_axes([0.1,0.1,0.8,0.5])
    ax.set_title(method)
    ax.set_ylabel('Accuracy (%)')
    ax.set_xlim(left=-150, right=1650)
    ax.set_ylim(-5,105)
    ax.scatter(x=data_original['L'],y=data_original['Accuracy'], label='without_FRACTAL',color='#FFD479', s = 7)
    ax.scatter(x=data_fractal['L'],y=data_fractal['Accuracy'], label='with_FRACTAL',color='#73CBD6', s = 7)
    ax.spines["top"].set_color("none")
    ax.spines["right"].set_color("none")
    ax.spines["bottom"].set_color("none")
    ax.tick_params(bottom= False, labelbottom=False)

    # Lower panel: coverage bars sharing the same x range; placed below the
    # figure's nominal area and kept in the PDF by bbox_inches="tight".
    ax2 = fig.add_axes([0.1,0.1-0.25,0.8,0.20])
    ax2.set_xlabel("$L$")
    ax2.set_ylabel('Coverage (%)')
    ax2.bar(x=data_fractal['L'],height=data_fractal['Coverage'], color='#429468', width=int(1500/300),)
    ax2.set_title('')
    ax2.set_xlim(left=-150, right=1650)
    ax2.set_ylim(bottom=-5, top=105)
    ax2.spines["top"].set_color("none")
    ax2.spines["right"].set_color("none")

    fig.savefig("figures/NK_0148_L_"+method+".pdf",bbox_inches="tight")
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# mu - alpha
method = "fasttreeML"
csvfile = "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha/result."+method+"_fractal.csv"

def mu_alpha_read_data(csvfile):
    """Read a header-less result CSV and pivot it onto an alpha x mu grid.

    Coverage (Ntips / Nseq * 100) and Accuracy ((1 - NRFD) * 100) are computed
    per row and placed into two matrices whose rows are the distinct ``alpha``
    values in descending order and whose columns are the distinct ``mu``
    values in ascending order. Grid cells without a matching row stay NaN.

    Parameters
    ----------
    csvfile : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_accuracy, df_coverage)``, both indexed by alpha with mu columns.
    """
    column_names = [
        "FRACTAL_ID", "SIM_ID", "Method", "Mode", "Nseq",
        "mu", "alpha", "L", "sigma",
        "Memory", "Memory_unit", "Run time", "Time_unit",
        "Ntips", "NRFD",
    ]
    records = pd.read_csv(csvfile, names=column_names).sort_values("FRACTAL_ID")
    records["Coverage"] = records["Ntips"] / records["Nseq"] * 100
    records["Accuracy"] = (1 - records["NRFD"]) * 100

    # Axis values: mu ascending (columns), alpha descending (rows).
    mu_values = sorted(set(records['mu']))
    alpha_values = sorted(set(records['alpha']))[::-1]
    col_of_mu = {value: position for position, value in enumerate(mu_values)}
    row_of_alpha = {value: position for position, value in enumerate(alpha_values)}

    grid_shape = (len(alpha_values), len(mu_values))
    accuracy_grid = np.full(grid_shape, np.nan)
    coverage_grid = np.full(grid_shape, np.nan)
    cells = zip(records['mu'], records['alpha'], records['Accuracy'], records['Coverage'])
    for mu_value, alpha_value, accuracy_value, coverage_value in cells:
        row, col = row_of_alpha[alpha_value], col_of_mu[mu_value]
        accuracy_grid[row, col] = accuracy_value
        coverage_grid[row, col] = coverage_value

    df_accuracy = pd.DataFrame(accuracy_grid, index=alpha_values, columns=mu_values)
    df_coverage = pd.DataFrame(coverage_grid, index=alpha_values, columns=mu_values)
    return df_accuracy, df_coverage

In [None]:
# For each method, draw four mu-alpha heatmaps: accuracy without FRACTAL,
# accuracy with FRACTAL, their difference, and FRACTAL's coverage.
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    df_accuracy_fractal, df_coverage_fractal   = mu_alpha_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha/result."+method+"_fractal.csv")
    df_accuracy_original, df_coverage_original = mu_alpha_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha/result."+method+"_original.csv")
    df_accuracy_diff = df_accuracy_fractal - df_accuracy_original
    for name, cmap, vmin, vmax, df in [("accuracy_original", 'plasma', 50, 100, df_accuracy_original), ("accuracy_fractal", 'plasma', 50, 100, df_accuracy_fractal), ("accuracy_diff", 'coolwarm_r', -20, 20, df_accuracy_diff), ("coverage_fractal", 'RdPu_r', 90, 100, df_coverage_fractal)]:
        fig = plt.figure(figsize=(3,3))
        ax  = fig.add_axes([0.1,0.1,0.8,0.8])
        # Horizontal colour bar placed to the right of the heatmap.
        ax2 = fig.add_axes([0.1 + 0.9, 0.45,0.4,0.1])
        # Grey background shows through wherever the heatmap has no data (NaN).
        # A second add_axes() with the same rectangle used to follow this line;
        # on current matplotlib that stacks a new axes on top and hides the
        # grey patch, so the heatmap is drawn directly on this axes instead.
        ax.patch.set_facecolor('grey')
        sns.heatmap(df, cmap=cmap, ax=ax, cbar_ax=ax2, cbar_kws={'orientation': 'horizontal'}, vmin=vmin, vmax=vmax)
        # Hard-coded tick positions/labels; assumes the grid layout of this dataset — TODO confirm.
        ax.set_xticks([4.5, 9.5, 14.5, 19.5, 24.5])
        ax.set_xticklabels([1.0,2.0,3.0,4.0,5.0])
        ax.set_yticks(23-np.array([2.5, 7.5, 12.5, 17.5, 22.5]))
        ax.set_yticklabels([0.20, 0.63, 2.00, 6.31, 19.95])
        ax.set_title(method+"_"+name)
        # Raw strings: "\m" is not a valid Python escape sequence.
        ax.set_xlabel(r"$\mu$")
        ax.set_ylabel(r"$\alpha$")
        fig.savefig("figures/NK_0148_mu_alpha_"+method+"_"+name+".pdf", bbox_inches='tight')
        plt.close(fig)

In [None]:
# sigma - mu
# NOTE(review): these two names are not used by sigma_mu_read_data below, and
# the path points at the mu_alpha results even though this cell is about
# sigma-mu — looks like a copy-paste leftover; confirm before removing.
method = "fasttreeML"
csvfile = "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha/result."+method+"_fractal.csv"

def sigma_mu_read_data(csvfile, sigma_max=3.1):
    """Read a header-less result CSV and pivot it onto a sigma x mu grid.

    Coverage (Ntips / Nseq * 100) and Accuracy ((1 - NRFD) * 100) are computed
    per row and placed into two matrices whose rows are the distinct ``sigma``
    values strictly below ``sigma_max`` in descending order and whose columns
    are the distinct ``mu`` values in ascending order. Rows of the CSV whose
    sigma is not on the grid are skipped; grid cells without data stay NaN.

    Parameters
    ----------
    csvfile : str or file-like
        Anything accepted by ``pandas.read_csv``.
    sigma_max : float, optional
        Exclusive upper bound on the sigma values kept (default 3.1, the
        previously hard-coded cut-off).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_accuracy, df_coverage)``, both indexed by sigma with mu columns.
    """
    data = pd.read_csv(csvfile, names = ["FRACTAL_ID", "SIM_ID", "Method", "Mode", "Nseq", "mu", "alpha", "L", "sigma", "Memory", "Memory_unit", "Run time", "Time_unit", "Ntips", "NRFD"])
    data = data.sort_values("FRACTAL_ID")
    data["Coverage"] = data["Ntips"] / data["Nseq"] * 100
    data["Accuracy"] = (1 - data["NRFD"]) * 100

    mu_list = sorted(set(data['mu']))
    sigma_list = [sigma for sigma in sorted(set(data['sigma']))[::-1] if sigma < sigma_max]

    mu2colidx    = {mu: idx    for idx, mu    in enumerate(mu_list   )}
    sigma2rowidx = {sigma: idx for idx, sigma in enumerate(sigma_list)}
    shape = (len(sigma_list), len(mu_list))
    mtx_accuracy = np.full(shape, np.nan)
    mtx_coverage = np.full(shape, np.nan)
    for mu, sigma, accuracy, coverage in zip(data['mu'], data['sigma'], data['Accuracy'], data['Coverage']):
        # Skip only the rows that fall off the grid (e.g. sigma >= sigma_max).
        # This replaces a bare ``except`` that also hid every other error.
        row = sigma2rowidx.get(sigma)
        col = mu2colidx.get(mu)
        if row is None or col is None:
            continue
        mtx_accuracy[row, col] = accuracy
        mtx_coverage[row, col] = coverage

    df_accuracy = pd.DataFrame(mtx_accuracy, index = sigma_list, columns = mu_list)
    df_coverage = pd.DataFrame(mtx_coverage, index = sigma_list, columns = mu_list)
    return df_accuracy, df_coverage

In [None]:
# For each method, draw sigma-mu heatmaps for result set 1, result set 2, and
# the element-wise maximum of the two FRACTAL runs ("max").
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    df_accuracy_fractal_1, df_coverage_fractal_1   = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_fractal.csv")
    df_accuracy_original_1, df_coverage_original_1 = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_original.csv")

    df_accuracy_fractal_2, df_coverage_fractal_2   = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_fractal.2.csv")
    df_accuracy_original_2, df_coverage_original_2 = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_original.2.csv")

    # Element-wise maximum that ignores missing cells: NaN is swapped for the
    # sentinel -1 before np.maximum, and cells still at -1 are turned back into NaN.
    df_accuracy_fractal_max = np.maximum(df_accuracy_fractal_1.replace(np.nan, -1), df_accuracy_fractal_2.replace(np.nan, -1)).replace(-1, np.nan)
    df_coverage_fractal_max = np.maximum(df_coverage_fractal_1.replace(np.nan, -1), df_coverage_fractal_2.replace(np.nan, -1)).replace(-1, np.nan)

    # The "max" variant is compared against the originals of result set 2.
    for df_accuracy_fractal, df_coverage_fractal, df_accuracy_original, df_coverage_original, df_label in [(df_accuracy_fractal_1, df_coverage_fractal_1, df_accuracy_original_1, df_coverage_original_1, "1"), (df_accuracy_fractal_2, df_coverage_fractal_2, df_accuracy_original_2, df_coverage_original_2, "2"), (df_accuracy_fractal_max, df_coverage_fractal_max, df_accuracy_original_2, df_coverage_original_2, "max")]:

        df_accuracy_diff = df_accuracy_fractal - df_accuracy_original
        for name, cmap, vmin, vmax, df in [("accuracy_original", 'plasma', 50, 100, df_accuracy_original), ("accuracy_fractal", 'plasma', 50, 100, df_accuracy_fractal), ("accuracy_diff", 'coolwarm_r', -40, 40, df_accuracy_diff), ("coverage_fractal", 'RdPu_r', 0, 100, df_coverage_fractal)]:
            fig = plt.figure(figsize=(3,3))
            ax  = fig.add_axes([0.1,0.1,0.8,0.8])
            # Horizontal colour bar placed to the right of the heatmap.
            ax2 = fig.add_axes([0.1 + 0.9, 0.45,0.4,0.1])
            # Grey background shows through wherever the heatmap has no data (NaN).
            # A second add_axes() with the same rectangle used to follow this
            # line; on current matplotlib that stacks a new axes on top and
            # hides the grey patch, so the heatmap is drawn on this axes instead.
            ax.patch.set_facecolor('grey')
            sns.heatmap(df, cmap=cmap, ax=ax, cbar_ax=ax2, cbar_kws={'orientation': 'horizontal'}, vmin=vmin, vmax=vmax)
            ax.set_title(method+"_"+name+"_"+df_label)
            # Raw strings: "\m" is not a valid Python escape sequence.
            ax.set_xlabel(r"$\mu$")
            ax.set_ylabel(r"$\sigma$")
            # Hard-coded tick positions/labels; assumes the grid layout of this dataset — TODO confirm.
            ax.set_xticks([0.5,5.5,10.5,15.5,20.5])
            ax.set_xticklabels(['0.01','0.032','0.10','0.32','1.0'])
            ax.set_yticks([0.5, 10.5, 20.5, 30.5])
            ax.set_yticklabels(['3.0','2.0','1.0','0.0'])
            fig.savefig("figures/NK_0148_sigma_mu_"+method+"_"+name+"_"+df_label+".pdf", bbox_inches='tight')
            plt.close(fig)

In [None]:
# mu - alpha varying FRACTAL parameters
def mu_alpha_fractal(csvfile, baseline_threshold=36204, mask_low_accuracy=False, accuracy_margin=5):
    """Pivot a FRACTAL parameter-sweep CSV onto a Threshold x Subsample grid.

    Rows are the distinct ``Threshold`` values in descending order, columns the
    distinct ``Subsample`` values in ascending order. Grid cells without a
    matching CSV row are NaN in every returned matrix except the background.

    Parameters
    ----------
    csvfile : str or file-like
        Header-less CSV with the thirteen columns named in the ``read_csv``
        call below.
    baseline_threshold : optional
        Threshold value whose row is used as the reference for the time and
        memory ratios and for the baseline accuracy (default 36204, the
        previously hard-coded value). Must occur in the data.
    mask_low_accuracy : bool, optional
        When True, time/memory ratios are set to NaN in cells whose accuracy is
        more than ``accuracy_margin`` points below the baseline-row average.
        Default False (no masking).
    accuracy_margin : float, optional
        Margin used by ``mask_low_accuracy`` (default 5).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_accuracy, df_coverage, df_time, df_memory, df_background)`` where
        ``df_time`` / ``df_memory`` hold log2(value / baseline-row mean) and
        ``df_background`` is 1 where Subsample > Threshold, else 0.

    Raises
    ------
    KeyError
        If ``baseline_threshold`` is not among the Threshold values.
    """
    data    = pd.read_csv(csvfile, names = ["TASK_ID", "SIM_ID", "Method", "None", "Nseq", "Subsample", "Threshold", "Memory", "Memory_unit", "Time", "Time_unit", "Ntips", "NRFD"])
    data    = data.sort_values("Threshold", ascending = False)

    data["Coverage"] = data["Ntips"] / data["Nseq"] * 100
    data["Accuracy"] = (1 - data["NRFD"]) * 100

    subsample_list = sorted(set(data['Subsample']))
    threshold_list = sorted(set(data['Threshold']))[::-1]

    subsample2colidx = {subsample: idx for idx, subsample in enumerate(subsample_list)}
    threshold2rowidx = {threshold: idx for idx, threshold in enumerate(threshold_list)}

    # Every measurement matrix starts as NaN so that missing cells stay missing.
    # Previously only accuracy/coverage were NaN-filled (copy-paste slip), so a
    # missing time/memory cell was 0 and turned into log2(0) = -inf.
    shape = (len(threshold_list), len(subsample_list))
    mtx_accuracy = np.full(shape, np.nan)
    mtx_coverage = np.full(shape, np.nan)
    mtx_time     = np.full(shape, np.nan)
    mtx_memory   = np.full(shape, np.nan)
    for subsample, threshold, accuracy, coverage, time, memory in zip(data['Subsample'], data['Threshold'], data['Accuracy'], data['Coverage'], data['Time'], data['Memory']):
        row, col = threshold2rowidx[threshold], subsample2colidx[subsample]
        mtx_accuracy[row, col] = accuracy
        mtx_coverage[row, col] = coverage
        mtx_time    [row, col] = time
        mtx_memory  [row, col] = memory

    baseline_row = threshold2rowidx[baseline_threshold]

    def _observed(matrix):
        # Baseline-row entries that were actually measured (NaN dropped).
        values = matrix[baseline_row]
        return values[~np.isnan(values)]

    def _log2_ratio_to_baseline(matrix):
        # log2(value / mean of the baseline row); computed once, not per threshold.
        observed = _observed(matrix)
        return np.log2(matrix / sum(observed) * len(observed))

    mtx_time_ratio   = _log2_ratio_to_baseline(mtx_time)
    mtx_memory_ratio = _log2_ratio_to_baseline(mtx_memory)

    # Background layer: 1 where the subsample size exceeds the threshold.
    mtx_background = np.zeros(shape)
    for threshold, row in threshold2rowidx.items():
        for subsample, col in subsample2colidx.items():
            mtx_background[row, col] = (1 if (subsample > threshold) else 0)

    if mask_low_accuracy:
        observed_accuracy = _observed(mtx_accuracy)
        original_ave_accuracy = sum(observed_accuracy) / len(observed_accuracy) if len(observed_accuracy) else np.nan
        with np.errstate(invalid="ignore"):
            # Comparisons against NaN are False, so missing cells are left as they are.
            too_inaccurate = (mtx_accuracy - original_ave_accuracy) < -accuracy_margin
        mtx_time_ratio[too_inaccurate] = np.nan
        mtx_memory_ratio[too_inaccurate] = np.nan

    df_accuracy = pd.DataFrame(mtx_accuracy, index = threshold_list, columns = subsample_list)
    df_coverage = pd.DataFrame(mtx_coverage, index = threshold_list, columns = subsample_list)
    df_time     = pd.DataFrame(mtx_time_ratio, index = threshold_list, columns = subsample_list)
    df_memory   = pd.DataFrame(mtx_memory_ratio, index = threshold_list, columns = subsample_list)
    df_background = pd.DataFrame(mtx_background, index = threshold_list, columns = subsample_list)

    return df_accuracy, df_coverage, df_time, df_memory, df_background

In [None]:
### To be deleted ###
# mu - alpha varying FRACTAL parameters
# NOTE(review): this cell is marked "To be deleted", but while it exists this
# definition replaces any earlier ``mu_alpha_fractal`` in the kernel, so every
# later call gets the accuracy-masking behaviour below unless
# ``mask_low_accuracy=False`` is passed.
def mu_alpha_fractal(csvfile, baseline_threshold=36204, mask_low_accuracy=True, accuracy_margin=5):
    """Pivot a FRACTAL parameter-sweep CSV onto a Threshold x Subsample grid.

    Rows are the distinct ``Threshold`` values in descending order, columns the
    distinct ``Subsample`` values in ascending order. Grid cells without a
    matching CSV row are NaN in every returned matrix except the background.

    Parameters
    ----------
    csvfile : str or file-like
        Header-less CSV with the thirteen columns named in the ``read_csv``
        call below.
    baseline_threshold : optional
        Threshold value whose row is used as the reference for the time and
        memory ratios and for the baseline accuracy (default 36204, the
        previously hard-coded value). Must occur in the data.
    mask_low_accuracy : bool, optional
        When True (default for this variant), time/memory ratios are set to NaN
        in cells whose accuracy is more than ``accuracy_margin`` points below
        the baseline-row average.
    accuracy_margin : float, optional
        Margin used by ``mask_low_accuracy`` (default 5).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_accuracy, df_coverage, df_time, df_memory, df_background)`` where
        ``df_time`` / ``df_memory`` hold log2(value / baseline-row mean) and
        ``df_background`` is 1 where Subsample > Threshold, else 0.

    Raises
    ------
    KeyError
        If ``baseline_threshold`` is not among the Threshold values.
    """
    data    = pd.read_csv(csvfile, names = ["TASK_ID", "SIM_ID", "Method", "None", "Nseq", "Subsample", "Threshold", "Memory", "Memory_unit", "Time", "Time_unit", "Ntips", "NRFD"])
    data    = data.sort_values("Threshold", ascending = False)

    data["Coverage"] = data["Ntips"] / data["Nseq"] * 100
    data["Accuracy"] = (1 - data["NRFD"]) * 100

    subsample_list = sorted(set(data['Subsample']))
    threshold_list = sorted(set(data['Threshold']))[::-1]

    subsample2colidx = {subsample: idx for idx, subsample in enumerate(subsample_list)}
    threshold2rowidx = {threshold: idx for idx, threshold in enumerate(threshold_list)}

    # Every measurement matrix starts as NaN so that missing cells stay missing.
    # Previously only accuracy/coverage were NaN-filled (copy-paste slip), so a
    # missing time/memory cell was 0 and turned into log2(0) = -inf.
    shape = (len(threshold_list), len(subsample_list))
    mtx_accuracy = np.full(shape, np.nan)
    mtx_coverage = np.full(shape, np.nan)
    mtx_time     = np.full(shape, np.nan)
    mtx_memory   = np.full(shape, np.nan)
    for subsample, threshold, accuracy, coverage, time, memory in zip(data['Subsample'], data['Threshold'], data['Accuracy'], data['Coverage'], data['Time'], data['Memory']):
        row, col = threshold2rowidx[threshold], subsample2colidx[subsample]
        mtx_accuracy[row, col] = accuracy
        mtx_coverage[row, col] = coverage
        mtx_time    [row, col] = time
        mtx_memory  [row, col] = memory

    baseline_row = threshold2rowidx[baseline_threshold]

    def _observed(matrix):
        # Baseline-row entries that were actually measured (NaN dropped).
        values = matrix[baseline_row]
        return values[~np.isnan(values)]

    def _log2_ratio_to_baseline(matrix):
        # log2(value / mean of the baseline row); computed once, not per threshold.
        observed = _observed(matrix)
        return np.log2(matrix / sum(observed) * len(observed))

    mtx_time_ratio   = _log2_ratio_to_baseline(mtx_time)
    mtx_memory_ratio = _log2_ratio_to_baseline(mtx_memory)

    # Background layer: 1 where the subsample size exceeds the threshold.
    mtx_background = np.zeros(shape)
    for threshold, row in threshold2rowidx.items():
        for subsample, col in subsample2colidx.items():
            mtx_background[row, col] = (1 if (subsample > threshold) else 0)

    if mask_low_accuracy:
        observed_accuracy = _observed(mtx_accuracy)
        original_ave_accuracy = sum(observed_accuracy) / len(observed_accuracy) if len(observed_accuracy) else np.nan
        with np.errstate(invalid="ignore"):
            # Comparisons against NaN are False, so missing cells are left as they are.
            too_inaccurate = (mtx_accuracy - original_ave_accuracy) < -accuracy_margin
        mtx_time_ratio[too_inaccurate] = np.nan
        mtx_memory_ratio[too_inaccurate] = np.nan

    df_accuracy = pd.DataFrame(mtx_accuracy, index = threshold_list, columns = subsample_list)
    df_coverage = pd.DataFrame(mtx_coverage, index = threshold_list, columns = subsample_list)
    df_time     = pd.DataFrame(mtx_time_ratio, index = threshold_list, columns = subsample_list)
    df_memory   = pd.DataFrame(mtx_memory_ratio, index = threshold_list, columns = subsample_list)
    df_background = pd.DataFrame(mtx_background, index = threshold_list, columns = subsample_list)

    return df_accuracy, df_coverage, df_time, df_memory, df_background

# The figures of this cell go to "figures_test/", which nothing else in the
# notebook creates; make sure it exists so savefig does not fail.
os.makedirs("figures_test", exist_ok=True)

for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    for dataset in ["312", "600"]:

        df_accuracy, df_coverage, df_time, df_memory, df_background = mu_alpha_fractal("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha_fractal_param/result."+method+"."+dataset+".csv")

        title = method + "_" + dataset

        # One entry per output figure:
        # (file-name tag, data, two background colours, colormap, vmin, vmax)
        panels = [
            ("a_accuracy", df_accuracy, ['#00008B','#bbbbbb'], 'plasma',     0, 100),
            ("b_coverage", df_coverage, ['#00008B','#bbbbbb'], 'RdPu_r',     0, 100),
            ("c_runtime",  df_time,     ['#DDDDDD','#bbbbbb'], 'Spectral_r', -1, 1),
            ("d_memory",   df_memory,   ['#DDDDDD','#bbbbbb'], 'Spectral_r', -1, 1),
        ]
        for panel_tag, df_panel, background_colors, panel_cmap, panel_vmin, panel_vmax in panels:
            fig = plt.figure(figsize=(3,3))
            # Two stacked axes on the same rectangle: ax1 carries the background
            # layer and the title, ax2 (transparent patch) carries the data.
            ax1 = fig.add_axes([0.1,0.1,0.8,0.8],label="a")
            ax2 = fig.add_axes([0.1,0.1,0.8,0.8],label="b")
            ax2.patch.set_alpha(0.0)
            ax1.set_title(title,fontsize=10)
            ax1.patch.set_facecolor('grey')
            sns.heatmap(df_background, ax=ax1, cmap=generate_cmap(background_colors), cbar_kws={"ticks":[]}, cbar=False)
            sns.heatmap(df_panel, ax=ax2, cmap=panel_cmap, vmin=panel_vmin, vmax=panel_vmax, cbar=False)
            fig.savefig("figures_test/NK_0148_"+panel_tag+"_"+title+".pdf", bbox_inches = 'tight')
            plt.close(fig)
In [None]:
method = "raxmlMP"
dataset = "600"
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    for dataset in ["312", "600"]:

        df_accuracy, df_coverage, df_time, df_memory, df_background = mu_alpha_fractal("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/mu_alpha_fractal_param/result."+method+"."+dataset+".csv")

        title = method + "_" + dataset

        # One entry per output figure:
        # (file-name tag, data, two background colours, colormap, vmin, vmax)
        panels = [
            ("a_accuracy", df_accuracy, ['#00008B','#bbbbbb'], 'plasma',     0, 100),
            ("b_coverage", df_coverage, ['#00008B','#bbbbbb'], 'RdPu_r',     0, 100),
            ("c_runtime",  df_time,     ['#DDDDDD','#bbbbbb'], 'Spectral_r', -1, 1),
            ("d_memory",   df_memory,   ['#DDDDDD','#bbbbbb'], 'Spectral_r', -1, 1),
        ]
        for panel_tag, df_panel, background_colors, panel_cmap, panel_vmin, panel_vmax in panels:
            fig = plt.figure(figsize=(3,3))
            # Two stacked axes on the same rectangle: ax1 carries the background
            # layer and the title, ax2 (transparent patch) carries the data.
            ax1 = fig.add_axes([0.1,0.1,0.8,0.8],label="a")
            ax2 = fig.add_axes([0.1,0.1,0.8,0.8],label="b")
            ax2.patch.set_alpha(0.0)
            ax1.set_title(title,fontsize=10)
            ax1.patch.set_facecolor('grey')
            sns.heatmap(df_background, ax=ax1, cmap=generate_cmap(background_colors), cbar_kws={"ticks":[]}, cbar=False)
            sns.heatmap(df_panel, ax=ax2, cmap=panel_cmap, vmin=panel_vmin, vmax=panel_vmax, cbar=False)
            fig.savefig("figures/NK_0148_"+panel_tag+"_"+title+".pdf", bbox_inches = 'tight')
            plt.close(fig)

In [None]:
# mu - alpha: entropy distribution

def count_sequence_fast(in_fname):
    """Count FASTA records in a gzip file and measure the first record's length.

    Parameters
    ----------
    in_fname : str
        Path to a gzip-compressed FASTA file.

    Returns
    -------
    (int, int)
        ``(k, l)``: ``k`` is the number of header lines (lines starting with
        ">"), ``l`` is the total number of sequence characters of the first
        record, summed over all of its lines with line endings excluded.
    """
    k, l = 0, 0
    with gzip.open(in_fname, 'rt') as handle:
        for line in handle:
            if line.startswith(">"):
                k += 1
            elif k == 1:
                # Strip the line ending rather than subtracting 1, so a final
                # line without a trailing newline is not undercounted.
                l += len(line.rstrip("\r\n"))
    return k, l

def count_table(fasta):
    """Tabulate per-position A/C/G/T counts over all records of a gzip FASTA.

    The number of positions is the first record's length as reported by
    ``count_sequence_fast``; every record is indexed at each of those
    positions.

    Parameters
    ----------
    fasta : str
        Path to a gzip-compressed FASTA file.

    Returns
    -------
    numpy.ndarray
        Integer matrix with one row per position and columns ordered A, C, G, T.

    Raises
    ------
    KeyError
        If a record contains a character other than 'A', 'C', 'G' or 'T'
        within the counted positions.
    """
    _, n_sites = count_sequence_fast(fasta)
    column_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    counts = np.array([[0, 0, 0, 0] for _ in range(n_sites)])

    with gzip.open(fasta, 'rt') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            sequence = record.seq
            for site in range(n_sites):
                counts[site][column_of_base[sequence[site]]] += 1
    return counts

def calculate_bit_score(fasta):
    """Compute the per-position bit score (2 - Shannon entropy in bits).

    For each row of the count matrix returned by ``count_table(fasta)`` the
    base frequencies p are formed and the score is 2 + sum(p * log2(p)) over
    the bases with p > 0.

    Parameters
    ----------
    fasta : str
        Path to a gzip-compressed FASTA file.

    Returns
    -------
    list
        One bit score per position, in position order.
    """
    scores = []
    for position_counts in count_table(fasta):
        total = sum(position_counts)
        entropy = 0
        frequencies = (float(count) / total for count in position_counts)
        for p in frequencies:
            # Bases that never occur contribute nothing (and log2(0) is undefined).
            if p > 0:
                entropy += -p * np.log2(p)
        scores.append(2 - entropy)
    return scores

def bit_score(bins, fasta_file_list):
    """Save one bit-score histogram PDF per FASTA file.

    Parameters
    ----------
    bins : int or sequence
        Passed to ``Axes.hist`` as ``bins``.
    fasta_file_list : iterable of str
        Paths of gzip-compressed FASTA files; each is announced with a
        "reading ..." message, scored with ``calculate_bit_score`` and plotted
        to ``figures/NK_0148_<basename>.Bit_score_histogram.pdf``.
    """
    for fasta in list(fasta_file_list):
        print("reading "+fasta)
        scores = calculate_bit_score(fasta)
        basename = fasta.split("/")[-1]

        # Histogram over the full bit-score range [0, 2] with fixed axis limits.
        fig = plt.figure(figsize=(2,2))
        ax = fig.add_axes([0.1,0.1,0.8,0.8])
        ax.hist(scores, range=(0, 2), color="#4294C3", bins=bins, alpha=1)

        ax.set_title(basename)
        ax.set_xlabel("Bit Score")
        ax.set_xlim(0,2)
        ax.set_ylim(0,1000)
        ax.tick_params(bottom = False, left = False)
        fig.savefig("figures/NK_0148_"+basename+".Bit_score_histogram.pdf",bbox_inches="tight")
        plt.close(fig)

# Draw 30-bin bit-score histograms for a fixed selection of task IDs.
file_list = [
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/mu_alpha_seq/downloaded/PRESUMEout."+str(task_id)+".fa.gz"
    for task_id in [1, 11, 24, 289, 312, 577, 587, 600]
]
bit_score(30, file_list)

In [None]:
# sigma -mu nhd map
# Load (task_id, mu, sigma, nhd) records and pivot log10(nhd) onto a grid with
# sigma descending along the rows and mu ascending along the columns.
df_nhd = pd.read_csv(
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/sigma_mu_nhd/taskid_mu_sigma_nhd_50percentile.csv",
    names=['task_id', 'mu', 'sigma', 'nhd'],
)

mu_list    = sorted(set(df_nhd['mu']))
sigma_list = sorted(set(df_nhd['sigma']))[::-1]
mu2colidx    = {value: position for position, value in enumerate(mu_list)}
sigma2rowidx = {value: position for position, value in enumerate(sigma_list)}

# Cells with no record remain NaN.
mtx_nhd = np.full((len(sigma_list), len(mu_list)), np.nan)
for mu, sigma, nhd in zip(df_nhd['mu'], df_nhd['sigma'], df_nhd['nhd']):
    mtx_nhd[sigma2rowidx[sigma], mu2colidx[mu]] = np.log10(nhd)
df_nhdmtx = pd.DataFrame(mtx_nhd, index=sigma_list, columns=mu_list)

In [None]:
# Colour scale shared by the NHD heatmaps in this cell (values are log10(NHD)).
cmap = "Spectral_r"
vmin = -2.5
vmax = 0

fig = plt.figure(figsize=(3,3))
ax  = fig.add_axes([0.1,0.1,0.8,0.8])
# Horizontal colour bar placed to the right of the heatmap.
ax2 = fig.add_axes([0.1 + 0.9, 0.45,0.4,0.1])
# Grey background shows through wherever the matrix has no data (NaN).
ax.patch.set_facecolor('grey')
sns.heatmap(df_nhdmtx, cmap=cmap, ax=ax, cbar_ax=ax2, cbar_kws={'orientation': 'horizontal'}, vmin=vmin, vmax=vmax)
ax.set_title("NHD")
# Raw strings: "\m" is not a valid Python escape sequence.
ax.set_xlabel(r"$\mu$")
ax.set_ylabel(r"$\sigma$")
# Hard-coded tick positions/labels; assumes the grid layout of this dataset — TODO confirm.
ax.set_xticks([0.5,5.5,10.5,15.5,20.5])
ax.set_xticklabels(['0.01','0.032','0.10','0.32','1.0'])
ax.set_yticks([0.5, 10.5, 20.5, 30.5])
ax.set_yticklabels(['3.0','2.0','1.0','0.0'])
# Pin the colour-bar tick positions before labelling them. Labelling whatever
# ticks the automatic locator happens to choose can attach the labels to the
# wrong values (or fail when the tick count differs from the label count).
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([-2.5, -2.0, -1.5, -1.0, -0.5, 0.0])
colorbar.set_ticklabels(["", "$10^{-2}$", "", "$10^{-1}$", "", "$1$"])
fig.savefig("figures/NK_0148_sigma_mu_nhd.pdf", bbox_inches='tight')
plt.close(fig)

# For each method: annotate the NHD heatmap with "*" where FRACTAL's accuracy is
# within 5 points of the original, then summarise the share of such conditions
# per log10(NHD) bin. Relies on cmap / vmin / vmax / df_nhdmtx defined earlier.
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:

    df_accuracy_fractal_1, df_coverage_fractal_1   = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_fractal.csv")
    df_accuracy_original_1, df_coverage_original_1 = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_original.csv")

    df_accuracy_fractal_2, df_coverage_fractal_2   = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_fractal.2.csv")
    df_accuracy_original_2, df_coverage_original_2 = sigma_mu_read_data("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0148/result/sigma_mu/result."+method+"_original.2.csv")

    # Element-wise maximum that ignores missing cells: NaN is swapped for the
    # sentinel -1 before np.maximum, and cells still at -1 are turned back into NaN.
    df_accuracy_fractal_max = np.maximum(df_accuracy_fractal_1.replace(np.nan, -1), df_accuracy_fractal_2.replace(np.nan, -1)).replace(-1, np.nan)
    df_coverage_fractal_max = np.maximum(df_coverage_fractal_1.replace(np.nan, -1), df_coverage_fractal_2.replace(np.nan, -1)).replace(-1, np.nan)

    for df_accuracy_fractal, df_coverage_fractal, df_accuracy_original, df_coverage_original, df_label in [(df_accuracy_fractal_1, df_coverage_fractal_1, df_accuracy_original_1, df_coverage_original_1, "1"), (df_accuracy_fractal_2, df_coverage_fractal_2, df_accuracy_original_2, df_coverage_original_2, "2"), (df_accuracy_fractal_max, df_coverage_fractal_max, df_accuracy_original_2, df_coverage_original_2, "max")]:

        df_accuracy_diff = df_accuracy_fractal - df_accuracy_original
        # "Comparable": FRACTAL at most 5 points worse; NaN differences compare as False.
        df_comparable_TF = (df_accuracy_diff >= -5)
        df_comparable = df_comparable_TF.replace(False, "").replace(True, "*")

        fig = plt.figure(figsize=(3,3))
        ax  = fig.add_axes([0.1,0.1,0.8,0.8])
        # ax2 holds the colour bar itself; ax3 sits on the same rectangle and only
        # provides a log-scaled tick axis for it.
        ax3 = fig.add_axes([0.1 + 0.9, 0.45,0.5,0.1], label = 'b')
        ax2 = fig.add_axes([0.1 + 0.9, 0.45,0.5,0.1], label = 'a')
        ax.patch.set_facecolor('grey')
        sns.heatmap(df_nhdmtx, cmap=cmap, ax=ax, cbar_ax=ax2, annot = df_comparable, fmt = '', annot_kws={'fontsize':5}, cbar_kws={'orientation': 'horizontal'}, vmin=vmin, vmax=vmax)
        ax.set_title("NHD "+method+"_"+df_label)
        # Raw strings: "\m" is not a valid Python escape sequence.
        ax.set_xlabel(r"$\mu$")
        ax.set_ylabel(r"$\sigma$")
        ax.set_xticks([0.5,5.5,10.5,15.5,20.5])
        ax.set_xticklabels(['0.01','0.032','0.10','0.32','1.0'])
        ax.set_yticks([0.5, 10.5, 20.5, 30.5])
        ax.set_yticklabels(['3.0','2.0','1.0','0.0'])
        ax2.tick_params(bottom=False, labelbottom=False)
        ax3.set_xlim(10**vmin, 10**vmax)
        ax3.set_xscale("log")
        ax3.tick_params(left=False, labelleft=False)
        ax3.spines["right"].set_color("none")
        ax3.spines["left"].set_color("none")
        ax3.spines["top"].set_color("none")
        ax3.spines["bottom"].set_color("none")

        plt.savefig("figures/NK_0148_sigma_mu_"+method+"_"+df_label+"_comparable.pdf", bbox_inches='tight')
        plt.close()

    # NOTE(review): everything below runs once per method, after the inner loop,
    # so it uses df_comparable / df_comparable_TF / df_accuracy_original /
    # df_label as left by the LAST iteration ("max"). Confirm this is intended
    # rather than an indentation slip.
    # The matrices are paired by position (iloc), which presumes df_nhdmtx and
    # the accuracy grids share the same sigma/mu layout — TODO confirm.
    mhd_comparable_list = []
    for i, sigma in enumerate(list(df_comparable.index)):
        for j, mu in enumerate(list(df_comparable.columns)):
            mhd_comparable_list.append([sigma, mu, df_comparable_TF.iloc[i,j], df_nhdmtx.iloc[i,j],df_accuracy_original.iloc[i,j]])
    df_mhd_comparable = pd.DataFrame(mhd_comparable_list, columns = ["sigma", 'mu', 'compararble', 'logmhd', 'Original'])
    df_mhd_comparable['total_datacount'] = 1
    # Bin log10(NHD) to one decimal (truncating toward zero). Non-finite values
    # (NaN for missing cells, -inf for NHD == 0) would make int() raise, so they
    # get a NaN bin, which groupby leaves out.
    df_mhd_comparable['logmhd_class'] = [int(logmhd*10)/10 if np.isfinite(logmhd) else np.nan for logmhd in df_mhd_comparable['logmhd']]
    df_mhd_comparable_bin = df_mhd_comparable.groupby('logmhd_class').sum()
    # Only the median of 'Original' is needed, so aggregate just that column.
    df_mhd_comparable_bin['Original_med'] = df_mhd_comparable.groupby('logmhd_class')['Original'].median()

    # Bars: percentage of comparable conditions per bin; line: median original accuracy.
    fig = plt.figure(figsize=(3,2))
    ax  = fig.add_axes([0.1,0.1,0.8,0.8])
    ax.bar(x = df_mhd_comparable_bin.index, height = df_mhd_comparable_bin['compararble']/df_mhd_comparable_bin['total_datacount']*100, width =0.1, color = '#3F8F92', )
    ax2 = ax.twinx()
    ax2.plot(df_mhd_comparable_bin.index, df_mhd_comparable_bin['Original_med'], color = '#1432F5')
    ax.set_ylim(0,105)
    ax2.set_ylim(0,105)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    ax.set_xticks([-2.05,-1.05,0.05])
    ax.set_xticklabels(["1","10","100"])
    ax.set_xlabel("Median of Normalized Hamming Distance")
    ax.set_ylabel("%condition")
    ax2.set_ylabel("Median\naccuracy (%)")
    ax.set_title(method+"_"+df_label)
    plt.savefig("figures/NK_0148_sigma_mu_ratio_comparable"+method+"_"+df_label+".pdf", bbox_inches='tight')
    plt.close()

    # Distribution of original accuracy per bin, split by comparability.
    fig = plt.figure(figsize=(10,2))
    ax  = fig.add_axes([0.1,0.1,0.8,0.8])
    sns.violinplot(x = 'logmhd_class', y = 'Original', data = df_mhd_comparable, hue = 'compararble',ax =ax)
    ax.set_xlabel("Log median of Normalized Hamming Distance")
    ax.set_ylabel("Median accuracy\nof original (%)")
    ax.set_title(method+"_"+df_label)
    plt.savefig("figures/NK_0148_sigma_mu_ratio_comparable_"+method+"_"+df_label+".pdf", bbox_inches='tight')
    plt.close()

    original_accuracy_threshold = 70
    df_mhd_comparable_bin['Ratio'] = df_mhd_comparable_bin['compararble']/df_mhd_comparable_bin['total_datacount']
    print(
        "For each MNHD bin in which original software accomplished accuracy of >",original_accuracy_threshold,"%, fractalized software performance was comparable with original one for >", df_mhd_comparable_bin[df_mhd_comparable_bin['Original_med'] > original_accuracy_threshold]['Ratio'].min()*100, "% of conditions"
    )

In [None]:
# Spot check: display the top-left cell of the comparability mask.
# df_comparable_TF is not defined in this cell; it must already exist in the kernel.
df_comparable_TF.iloc[0,0]

In [None]:
# Spot check: display the top-left cell of the log10(NHD) matrix.
# df_nhdmtx is not defined in this cell; it must already exist in the kernel.
df_nhdmtx.iloc[0,0]

In [None]:
# Rebuild the per-bin comparability summary from df_comparable,
# df_comparable_TF, df_nhdmtx and df_accuracy_original as they currently exist
# in the kernel (none of them is defined in this cell).
mhd_comparable_list = []
for i, sigma in enumerate(list(df_comparable.index)):
    for j, mu in enumerate(list(df_comparable.columns)):
        mhd_comparable_list.append([sigma, mu, df_comparable_TF.iloc[i,j], df_nhdmtx.iloc[i,j],df_accuracy_original.iloc[i,j]])
df_mhd_comparable = pd.DataFrame(mhd_comparable_list, columns = ["sigma", 'mu', 'compararble', 'logmhd', 'Original'])
df_mhd_comparable['total_datacount'] = 1
# Bin log10(NHD) to one decimal (truncating toward zero). Non-finite values
# (NaN for missing cells, -inf for NHD == 0) would make int() raise, so they
# get a NaN bin, which groupby leaves out.
df_mhd_comparable['logmhd_class'] = [int(logmhd*10)/10 if np.isfinite(logmhd) else np.nan for logmhd in df_mhd_comparable['logmhd']]
df_mhd_comparable_bin = df_mhd_comparable.groupby('logmhd_class').sum()
# Only the median of 'Original' is needed, so aggregate just that column.
df_mhd_comparable_bin['Original_med'] = df_mhd_comparable.groupby('logmhd_class')['Original'].median()
df_mhd_comparable_bin['Ratio'] = df_mhd_comparable_bin['compararble']/df_mhd_comparable_bin['total_datacount']
# Smallest comparable fraction among bins whose median original accuracy exceeds 70 %.
df_mhd_comparable_bin[df_mhd_comparable_bin['Original_med'] > 70]['Ratio'].min()

In [None]:
# Bars: percentage of comparable conditions per log10(NHD) bin (left axis).
# Line: median accuracy of the original software per bin (right axis).
comparable_percent = df_mhd_comparable_bin['compararble'] / df_mhd_comparable_bin['total_datacount'] * 100

fig = plt.figure(figsize=(3,2))
ax  = fig.add_axes([0.1,0.1,0.8,0.8])
ax.bar(x=df_mhd_comparable_bin.index, height=comparable_percent, width=0.1, color='#3F8F92')

ax2 = ax.twinx()
ax2.plot(df_mhd_comparable_bin.index, df_mhd_comparable_bin['Original_med'], color='#1432F5')

ax.set_ylim(0,105)
ax2.set_ylim(0,105)
# The twin axes is the current axes at this point, so its spines are the ones hidden.
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax.set_xticks([-2.05,-1.05,0.05])
ax.set_xticklabels(["1","10","100"])
ax.set_xlabel("Median of Normalized Hamming Distance (%)")
ax.set_ylabel("%condition")
ax2.set_ylabel("Median\naccuracy (%)")

fig.savefig("figures/NK_0148_sigma_mu_ratio_comparable.pdf", bbox_inches='tight')