In [None]:
import random
import os
import sys 
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
from Bio import SeqIO
import gzip
from sklearn.linear_model import LinearRegression
# Fix the seed of Python's stdlib RNG so any use of `random` is repeatable.
# Note: this does not seed NumPy's generator (np.random).
random.seed(0)

In [None]:
# Reset Matplotlib to its built-in defaults FIRST.
# plt.style.use('default') overwrites rcParams, so calling it after the
# assignments below (as this cell originally did) silently discarded every
# customisation made here.
plt.style.use('default')

# Global figure styling applied to every plot created after this cell.
mpl.rcParams['font.family']       = 'Helvetica'
mpl.rcParams['font.sans-serif']   = ["Helvetica","Arial","DejaVu Sans","Lucida Grande","Verdana"]
mpl.rcParams['figure.figsize']    = [4,3]
mpl.rcParams['font.size']         = 9
mpl.rcParams["axes.labelcolor"]   = "#000000"
mpl.rcParams["axes.linewidth"]    = 1.0
mpl.rcParams["xtick.major.width"] = 1.0
mpl.rcParams["ytick.major.width"] = 1.0

# Qualitative colour palettes expanded into explicit RGBA lists
# (10 entries from tab10, 12 entries from Set3).
cmap1 = plt.cm.tab10
cmap2 = plt.cm.Set3
colors1 = [cmap1(i) for i in range(0,10)]
colors2 = [cmap2(i) for i in range(0,12)]

In [None]:
# Work from the project data directory so that the relative 'figures/...'
# output paths used by the plotting cells resolve inside it.
os.chdir("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0149")

# Create the output directories. exist_ok=True makes re-running the cell a
# no-op when the directory is already there, while genuine failures (e.g.
# permission errors) are raised instead of being hidden by a bare `except`.
for dirname in ["figures"]:
    os.makedirs(dirname, exist_ok=True)

In [None]:
# Load the whole-dataset FRACTAL benchmark table. The file carries no header
# row, so column names are supplied explicitly.
# Derived columns:
#   Coverage - Ntips expressed as a percentage of Nseq
#   Accuracy - (1 - NRFD) expressed as a percentage
#              (NRFD is presumably a normalised Robinson-Foulds distance; confirm)
df_fractal_all = pd.read_csv(
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0149/result/result.fractal.all.txt",
    names=[
        "TASK_ID", "Nseq", "Method", "rep",
        "Memory", "Mem_unit", "Time", "Time_unit",
        "Ntips", "NRFD",
    ],
).assign(
    Coverage=lambda d: d["Ntips"] / d["Nseq"] * 100,
    Accuracy=lambda d: (1 - d["NRFD"]) * 100,
)
df_fractal_all

In [None]:
# Average every numeric column per reconstruction method.
# numeric_only=True is required on pandas >= 2.0: without it, non-numeric
# columns (Mem_unit / Time_unit presumably hold unit strings - confirm) make
# .mean() raise TypeError. Older pandas dropped such columns silently, so the
# resulting table is the same as before.
df_fractal_all_mean = df_fractal_all.groupby("Method").mean(numeric_only=True)

# Tag the rows so they can be told apart after being concatenated with the
# subclade tables, and expose the group key as a regular column as well.
df_fractal_all_mean["Category"] = "fractal_100_nodes"
df_fractal_all_mean["Method"] = df_fractal_all_mean.index

In [None]:
# For each reconstruction method, draw a two-panel figure (Accuracy on the
# left, Coverage on the right) with one point per trial, and save it as a PDF
# under figures/.
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    df_fractal_all_ext = df_fractal_all[df_fractal_all["Method"] == method]

    fig = plt.figure(figsize=(1.7, 1.8))
    ax = fig.add_axes([0.1, 0.1, 0.35, 0.8])
    # The right panel extends past the figure edge; bbox_inches='tight' at
    # save time expands the output to include it.
    ax2 = fig.add_axes([0.8, 0.1, 0.35, 0.8])

    # Both panels share identical styling and differ only in the plotted column.
    for panel, metric in ((ax, "Accuracy"), (ax2, "Coverage")):
        panel.scatter(x="rep", y=metric, data=df_fractal_all_ext,
                      color='#7F33FF', alpha=1, s=20)
        panel.set_xlabel("Trials")
        panel.set_ylabel(metric)
        panel.set_ylim(70, 101)
        for side in ("top", "right"):
            panel.spines[side].set_color("none")
        panel.set_xticks([1, 2, 3, 4, 5])
        panel.set_xlim(0, 6)

    # Only the left panel carries the method name as its title.
    ax.set_title(method)

    fig.savefig('figures/NK_0149_fractal_all_' + method + '.pdf', bbox_inches='tight')
    plt.close(fig)

In [None]:
# Load the single-node FRACTAL subclade results (header-less file, so column
# names are supplied), keep only runs with Time below 86400 (presumably a
# 24 h limit in seconds - confirm against Time_unit) and add derived columns.
#
# The filter and the new columns are built in one chain: .assign() returns a
# fresh DataFrame, so no values are written onto a filtered slice (which
# previously triggered pandas' SettingWithCopyWarning).
# Coverage is intentionally not computed for this table.
df_fractal_subclades = (
    pd.read_csv(
        "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0149/result/result.fractal.subclades.manual.txt",
        names=["TASK_ID","Nseq","Method","rep","Memory","Mem_unit","Time","Time_unit","Ntips","NRFD"],
    )
    .loc[lambda d: d["Time"] < 86400]
    .assign(
        Accuracy=lambda d: (1 - d["NRFD"]) * 100,
        Category="fractal_1_node",
    )
)
df_fractal_subclades

In [None]:
# Load the subclade results of the original (non-FRACTAL) tools. The file has
# no header row; the "ignore" placeholder column is dropped straight away.
# This table has no Nseq column, so no Coverage is derived here.
# Derived / adjusted columns:
#   Accuracy - (1 - NRFD) expressed as a percentage
#   Time     - tree-building time plus the Time_mafft column
#   Category - constant tag used to separate data sources after concatenation
df_original_subclades = (
    pd.read_csv(
        "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0149/result/result.original.subclades.txt",
        names=[
            "Memory_mafft", "Mem_unit_mafft", "Time_mafft", "Time_unit_mafft",
            "TASK_ID", "SUBCLADE_ID", "Method", "ignore",
            "Memory", "Mem_unit", "Time", "Time_unit", "Ntips", "NRFD",
        ],
    )
    .drop(columns=["ignore"])
    .assign(
        Accuracy=lambda d: (1 - d["NRFD"]) * 100,
        Time=lambda d: d["Time"] + d["Time_mafft"],
        Category="original_1_node",
    )
)
df_original_subclades

In [None]:
# Stack the three result sources into one table and keep only rows with
# Time below 86400 and more than 3000 tips.
df_subclades = pd.concat(
    [df_fractal_subclades, df_original_subclades, df_fractal_all_mean]
).loc[lambda d: (d["Time"] < 86400) & (d["Ntips"] > 3000)]

# One accuracy-vs-tree-size scatter plot per method, saved as a PDF.
for method in ["rapidnjNJ", "raxmlMP", "fasttreeML"]:
    df_subclades_ext = df_subclades[df_subclades["Method"] == method]

    # Split the method's rows by data source.
    df_subclades_ext_original = df_subclades_ext[df_subclades_ext["Category"] == "original_1_node"]
    df_subclades_ext_fractal_1node = df_subclades_ext[df_subclades_ext["Category"] == "fractal_1_node"]
    df_subclades_ext_fractal_100nodes = df_subclades_ext[df_subclades_ext["Category"] == "fractal_100_nodes"]

    fig = plt.figure(figsize=(2.2, 1.8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Draw order is significant: later series are painted over earlier ones.
    for series, colour, size in (
        (df_subclades_ext_fractal_1node, "#88C9D4", 20),
        (df_subclades_ext_fractal_100nodes, '#7F33FF', 20),
        (df_subclades_ext_original, "#F8D686", 5),
    ):
        ax.scatter(x=series["Ntips"], y=series["Accuracy"], color=colour, s=size)

    ax.set_xlim(3000, 1200000)
    ax.set_xscale("log")
    ax.set_ylim(70, 101)
    for side in ("top", "right"):
        ax.spines[side].set_color("none")
    ax.set_xlabel("#tips of reconstructed tree")
    ax.set_ylabel("Accuracy (%)")
    ax.set_title(method)

    fig.savefig('figures/NK_0149_subclades_' + method + '.pdf', bbox_inches='tight')
    plt.close(fig)

In [None]:
# For each method, plot how many computing nodes were in use over the course
# of the run and save the figure as a PDF under figures/.
for method in ["rapidnj", "raxmlmp", "fasttreeml"]:

    df_time = pd.read_table(
        "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0149/result/node_count." + method + ".1.txt",
        names=["date", "date_sec", "running", "running+waiting"],
    )
    # Elapsed time relative to the first record in the file.
    df_time["run_time"] = df_time["date_sec"] - df_time["date_sec"].iloc[0]
    # Keep only the samples where at least one job is running or waiting.
    df_time = df_time[df_time["running+waiting"] > 0]
    print(df_time.max())

    fig = plt.figure(figsize=(2, 1.8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    x = list(df_time['run_time'])

    # Two filled areas drawn from zero: the total (running + waiting) first,
    # then the running-only count painted on top of it in a lighter shade.
    for column, shade in (("running+waiting", "#7638F5"), ("running", "#A57FF9")):
        ax.stackplot(x, df_time[column], color=shade, alpha=1)

    ax.set_xlabel("Time (sec)")
    ax.set_ylabel("Number of computing\nnodes used")
    ax.set_title(method)
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    fig.savefig("figures/NK_0149_time_node." + method + ".pdf", bbox_inches='tight')
    plt.close(fig)

In [None]:
# Display the node-count table left in `df_time` by the final iteration of
# the per-method loop in the previous cell (i.e. the "fasttreeml" run), after
# the `running+waiting > 0` filter has been applied.
df_time