In [None]:
import random
import os
import sys 
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
from Bio import SeqIO, Phylo
import gzip
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy import stats
random.seed(0)

In [None]:
# Work inside the NK_0150 data directory so that the relative "table/" and
# "figures/" output paths used throughout the notebook resolve there.
os.chdir("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150")

# Create the output directories. exist_ok=True makes re-running the cell a
# no-op while still surfacing real failures (e.g. permission errors), which
# the previous bare `except` silently discarded.
for dirname in ["table", "figures"]:
    os.makedirs(dirname, exist_ok=True)

In [None]:
# Tab-separated table with a header row; later cells read its 'Tip', 'Clade'
# and 'Entropy' columns.
clade_entropy_tsv = "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_yusuke/entropy.clade_to_tiplist.all.long.tsv"
df_clade_seq_entropy = pd.read_table(clade_entropy_tsv)
df_clade_seq_entropy

In [None]:
# Header-less tab-separated table: one row per tip with its mutation counts.
seqdist_columns = ['Tip', 'Nmutations', 'Nsubstitutions', 'Ndeletions']
df_seq_Nmutations = pd.read_table(
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_yusuke/result/name_seqdist.bcat.txt",
    names=seqdist_columns,
)

In [None]:
# Inner-join the clade/entropy table with the per-tip mutation counts on the tip name.
df_merge = df_clade_seq_entropy.merge(df_seq_Nmutations, on='Tip')

In [None]:
# Read the root FASTA; when the file holds several records the last one wins.
root_fasta = "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/fasta/root.bcat.fa"
for root_record in SeqIO.parse(root_fasta, 'fasta'):
    root_seq = str(root_record.seq)

In [None]:
# Load every candidate parent sequence, keyed by its FASTA record name.
# The context manager closes the gzip handle once parsing is finished
# (it was previously left open).
with gzip.open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_ikaken/bcat_parent_delsub.SEQUENCE.fa.gz", 'rt') as parent_handle:
    name2parentseq = {
        record.name: str(record.seq)
        for record in SeqIO.parse(parent_handle, 'fasta')
    }

# Keep the most abundant read of each well: record names are split on "_",
# the first field is used as the well and only records whose second field
# is "1" are kept.
well2parentseq = {}
for name, parentseq in name2parentseq.items():
    name_fields = name.split("_")
    if name_fields[1] == "1":
        well2parentseq[name_fields[0]] = parentseq

# In B10 and E3, deletions occupied the majority of positions.
# For B10, every parent that could be found had deletions at the majority of positions.
# For E3, E3_5 was a sequence with few deletions, so it was swapped in instead
# (the swap is currently disabled):

#well2parentseq['E3'] =name2parentseq['E3_5']

In [None]:
# One row per non-root tip: [name, Nparentmut, Ninheritmut, Nmismatch2parent, Nrevertant].
name_Nparentmutation_Ninheritedmutation_Nmismatchtoparent = []

name2seq = {}
# Parse inside a context manager so the gzip handle is closed afterwards
# (it was previously left open).
with gzip.open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_idenken/fractalin.bcat.fa.gz", 'rt') as fractal_handle:
    for record in SeqIO.parse(fractal_handle, 'fasta'):
        name2seq[record.name] = str(record.seq)
        if record.name == "root":
            continue

        seq       = name2seq[record.name]
        # The well is the 4th "_"-separated field of the tip name.
        well      = record.name.split("_")[3]
        parentseq = well2parentseq[well]

        count_parentmutation    = 0  # positions where the parent differs from the root
        count_inheritedmutation = 0  # ... of those, positions where the tip equals the parent
        count_mismatchtoparent  = 0  # positions where the tip differs from the parent
        count_revertant         = 0  # parent-mutated positions where the tip equals the root
        # Position-wise comparison over the root length; the three sequences
        # are indexed in parallel, so they must be at least that long.
        for i in range(len(root_seq)):
            if root_seq[i] != parentseq[i]:
                count_parentmutation += 1
                if seq[i] == parentseq[i]:
                    count_inheritedmutation += 1
                elif seq[i] == root_seq[i]:
                    count_revertant += 1
            if seq[i] != parentseq[i]:
                count_mismatchtoparent += 1
        name_Nparentmutation_Ninheritedmutation_Nmismatchtoparent.append(
            [record.name, count_parentmutation, count_inheritedmutation, count_mismatchtoparent, count_revertant]
        )

df_parentdist = pd.DataFrame(
    name_Nparentmutation_Ninheritedmutation_Nmismatchtoparent,
    columns = ["Tip", "Nparentmut", "Ninheritmut", "Nmismatch2parent", "Nrevertant"],
)

In [None]:
# Display the per-tip parent-comparison table (Tip, Nparentmut, Ninheritmut,
# Nmismatch2parent, Nrevertant) for inspection.
df_parentdist

In [None]:
# Columns this cell adds to df_merge. Stale copies are dropped before merging
# so that re-running the cell does not create "_x"/"_y" duplicates (which made
# the column lookups below fail with KeyError on a second execution).
parentdist_columns = [col for col in df_parentdist.columns if col != 'Tip']
ratio_columns = ["Reverted_position_ratio", "Normalized_Reverted_position_ratio"]
df_merge = pd.merge(
    df_merge.drop(columns=parentdist_columns + ratio_columns, errors='ignore'),
    df_parentdist,
    on='Tip',
)

# Fraction of parent-mutated positions at which the tip matches the root again.
# Rows with Nparentmut == 0 give NaN/inf rather than raising.
df_merge["Reverted_position_ratio"] = df_merge['Nrevertant'] / df_merge['Nparentmut']

# The same ratio divided by the tip's mismatch-to-parent fraction.
# NOTE(review): 198 is hard-coded; presumably the aligned sequence length
# (the histogram cell also spans 0-198) -- TODO confirm.
df_merge["Normalized_Reverted_position_ratio"] = (
    df_merge["Reverted_position_ratio"] / (df_merge['Nmismatch2parent'] / 198)
)
df_merge

In [None]:
# For every per-sequence count column, save a linear-scale and a log-scale histogram.
for target in ('Nmutations', 'Nsubstitutions', 'Ndeletions'):
    fig = plt.figure(figsize=(3, 2))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Unit-width bins with edges 0, 1, ..., 197.
    ax.hist(df_merge[target], bins=range(0, 198))
    ax.set_xlabel(target)
    ax.set_ylabel("# seq")
    ax.set_xlim(0, 198)

    # Linear y-axis version first ...
    fig.savefig(f"figures/histogram_{target}.pdf", bbox_inches='tight')

    # ... then the same axes switched to a log y-axis.
    ax.set_yscale("log")
    ax.set_ylim(1, 2000000)
    fig.savefig(f"figures/histogram_{target}_log.pdf", bbox_inches='tight')

In [None]:
# Per-clade mean of every numeric column. numeric_only=True skips string
# columns such as 'Tip', which pandas >= 2.0 refuses to average (TypeError).
df_clade = df_merge.groupby('Clade').mean(numeric_only=True)
# Number of rows per clade with a non-missing Nmutations value.
df_clade['Ndata'] = df_merge.groupby('Clade')['Nmutations'].count()

df_clade.to_csv("table/clade_entropy_aveNmutations_Ndata.txt", sep = "\t")

# From here on, only clades containing more than X sequences are analysed.
X = 1000
# .copy() gives an independent frame, so later cells can add columns to it
# without triggering SettingWithCopyWarning.
df_clade = df_clade[df_clade['Ndata'] > X].copy()

In [None]:
# Scatter each clade-level mean count against the clade entropy and save one PDF per column.
for target in ('Nmutations', 'Nsubstitutions', 'Ndeletions'):
    fig = plt.figure(figsize=(2, 2))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    ax.scatter(df_clade['Entropy'], df_clade[target], alpha=0.5, s=1)
    ax.set_ylim(0, 50)
    ax.set_xlabel("Entropy")
    ax.set_ylabel(target)
    ax.set_title(f"Clade size > {X}", fontsize=10)

    fig.savefig(f"figures/Entropy_{target}_{X}.pdf", bbox_inches='tight')

In [None]:
entropy_threshold = 1

# Flag clades whose entropy exceeds the threshold. Computed once here instead
# of being rebuilt inside every plotting loop iteration.
df_clade["Mixed"] = df_clade['Entropy'] > entropy_threshold


def plot_mixed_violin(target, ylim=None):
    """Violin plot of a df_clade column split by the 'Mixed' flag, saved as a PDF.

    Replaces three copy-pasted loops that differed only in the y-axis limits.

    Parameters
    ----------
    target : str
        Name of the df_clade column drawn on the y axis.
    ylim : tuple of (low, high), optional
        Fixed y-axis limits; the axis is autoscaled when omitted.

    Returns
    -------
    The matplotlib Axes that was drawn on.
    """
    fig = plt.figure(figsize=(2, 2))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    sns.violinplot(data=df_clade, y=target, x='Mixed', ax=ax)
    ax.set_xlabel("Entropy > " + str(entropy_threshold))
    ax.set_ylabel("Mean " + target)
    ax.set_title("Clade size > " + str(X), fontsize=10)
    if ylim is not None:
        ax.set_ylim(*ylim)
    fig.savefig("figures/" + target + "_distribution_" + str(X) + ".pdf", bbox_inches='tight')
    return ax


# Same figures, in the same order, as the original three loops.
for target in ['Nmutations', 'Nsubstitutions', 'Ndeletions']:
    plot_mixed_violin(target, ylim=(0, 50))
plot_mixed_violin('Reverted_position_ratio', ylim=(0, 1))
plot_mixed_violin('Normalized_Reverted_position_ratio')

In [None]:
# The well id is the 4th "_"-separated field of the tip name.
df_merge['Well'] = [tip.split("_")[3] for tip in df_merge['Tip']]

# One entropy histogram per well, A1 ... H12: the letter index i sets the
# horizontal offset and the number index j the vertical offset of each panel.
fig = plt.figure(figsize=(2,1))
for i, idx_alphabet in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
    for j in range(12):
        well = idx_alphabet + str(j+1)

        # A histogram does not depend on row order, so no sorting is needed.
        df_merge_ext = df_merge[df_merge['Well'] == well]
        ax = fig.add_axes([0.1+i*1.2,0.1+j*1.4,0.8,0.8])

        ax.hist(df_merge_ext['Entropy'], range = (0,6.5), bins = 13)

        # Title each panel with its own well. The previous code used `plate`,
        # which is only defined in a later cell: a NameError on a fresh
        # kernel, and the same wrong title on every panel otherwise.
        ax.set_title(well, fontsize = 10)
        if (j == 0): ax.set_xlabel("Entropy")
        if (i == 0): ax.set_ylabel("# seq")
        ax.set_xlim(0,6.5)
plt.savefig("figures/entropy_histogram_wells.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Grid of per-well histograms (wells A1 ... H12) of the number of positions
# inherited from the parent sequence.
fig = plt.figure(figsize=(2, 1))
for i, row_letter in enumerate("ABCDEFGH"):
    for j in range(12):
        well = f"{row_letter}{j + 1}"
        in_well = df_merge['Well'] == well
        df_merge_ext = df_merge[in_well]

        # Panel position: letter index -> x offset, number index -> y offset.
        ax = fig.add_axes([0.1 + i * 1.2, 0.1 + j * 1.4, 0.8, 0.8])
        ax.hist(df_merge_ext['Ninheritmut'], bins=31, range=(0, 30))
        ax.set_title(well, fontsize=10)

        # Axis labels only on the first panel of each direction.
        if j == 0:
            ax.set_xlabel("# inherited from parent")
        if i == 0:
            ax.set_ylabel("# seq")
        ax.set_xlim(0, 31)
fig.savefig("figures/Ninherited_histogram_wells.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Per-well histograms (wells A1 ... H12) of the inherited ratio
# Ninheritmut / Nparentmut; the subset of tips with Entropy > 1 is overlaid
# on the same axes.
fig = plt.figure(figsize=(2,1))
for i, idx_alphabet in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
    for j in range(12):
        well = idx_alphabet + str(j+1)

        df_merge_ext = df_merge[df_merge['Well'] == well]
        ax = fig.add_axes([0.1+i*1.2,0.1+j*1.4,0.8,0.8])

        # Fixed column name: it was misspelled 'Nparen tmut', which raised KeyError.
        ax.hist(df_merge_ext['Ninheritmut']/df_merge_ext['Nparentmut'], range = (0, 1), bins = 20)

        # Overlay: only the tips with entropy above 1.
        df_merge_ext_messy = df_merge_ext[df_merge_ext['Entropy'] > 1]

        ax.hist(df_merge_ext_messy['Ninheritmut']/df_merge_ext_messy['Nparentmut'], range = (0, 1), bins = 20)

        ax.set_title(well, fontsize = 10)
        if (j == 0): ax.set_xlabel("Inherited ratio")
        if (i == 0): ax.set_ylabel("# seq")
        ax.set_xlim(0,1)
plt.savefig("figures/Ninheritedratio_histogram_wells.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Per-well histograms (wells A1 ... H12) of Nrevertant / Nparentmut, with the
# Entropy > 1 subset drawn on top of the full distribution.
ratio_hist_options = dict(range=(0, 1), bins=20)

fig = plt.figure(figsize=(2, 1))
for i, row_letter in enumerate("ABCDEFGH"):
    for j in range(12):
        well = f"{row_letter}{j + 1}"
        df_merge_ext = df_merge[df_merge['Well'] == well]
        df_merge_ext_messy = df_merge_ext[df_merge_ext['Entropy'] > 1]

        ax = fig.add_axes([0.1 + i * 1.2, 0.1 + j * 1.4, 0.8, 0.8])
        # Full well first, high-entropy subset second (drawn over it).
        for subset in (df_merge_ext, df_merge_ext_messy):
            ax.hist(subset['Nrevertant'] / subset['Nparentmut'], **ratio_hist_options)

        ax.set_title(well, fontsize=10)
        if j == 0:
            ax.set_xlabel("Reverted position ratio")
        if i == 0:
            ax.set_ylabel("# seq ")
        ax.set_xlim(0, 1)
fig.savefig("figures/Nrevertantratio_histogram_wells.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Histogram of Nrevertant / Nparentmut over all tips, with the Entropy > 1
# subset drawn on top.
fig = plt.figure(figsize=(2, 1))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

df_merge_messy = df_merge[df_merge['Entropy'] > 1]
for subset in (df_merge, df_merge_messy):
    ax.hist(subset['Nrevertant'] / subset['Nparentmut'], range=(0, 1), bins=20)

ax.set_title("All", fontsize=10)
ax.set_xlabel("Reverted position ratio")
ax.set_ylabel("# seq ")
ax.set_xlim(0, 1)
fig.savefig("figures/Nrevertantratio_histogram_total.pdf", bbox_inches='tight')
plt.close()

In [None]:
plate = 'A1'

# Export the rows of the chosen well, ordered by entropy.
rows_of_well = df_merge[df_merge['Well'] == plate]
df_merge_ext = rows_of_well.sort_values("Entropy")
df_merge_ext.to_csv(f"table/{plate}_seq_table.txt", index=False, sep='\t')

# Reference sequence: the last record of the root FASTA.
for record in SeqIO.parse("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/fasta/root.bcat.fa", 'fasta'):
    root_seq = str(record.seq)

# Integer code for a character that differs from the root; 0 is used for a match.
ATGC2int = {'A':1, 'T':2, 'G':3, 'C':4, '-':5}

# One row per sampled sequence, one column per position.
seq_table = []
for record in SeqIO.parse(f"/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/fasta/{plate}_sample.fa", 'fasta'):
    seq = str(record.seq)
    seq_row = [
        0 if base == root_seq[pos] else ATGC2int[base]
        for pos, base in enumerate(seq)
    ]
    seq_table.append(seq_row)

df_seq_table = pd.DataFrame(seq_table)

# Heat map of the encoded sequences with a small colour bar on the right.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax2 = fig.add_axes([1.0, 0.1, 0.05, 0.2])
ax.set_title(plate)
sns.heatmap(df_seq_table, cmap='terrain_r', ax=ax, cbar_ax=ax2)
fig.savefig(f"figures/sequence_heatmap_{plate}.pdf", bbox_inches='tight')
plt.close()

In [None]:
clade_name = 'Clade3977'

# Reload all tip sequences keyed by record name; the context manager closes
# the gzip handle afterwards (it was previously left open).
with gzip.open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_idenken/fractalin.bcat.fa.gz", 'rt') as fractal_handle:
    name2seq = {
        record.name: str(record.seq)
        for record in SeqIO.parse(fractal_handle, 'fasta')
    }

# NOTE(review): the original filtered on df_merge["Plate"], but the only
# well-id column created in this notebook is 'Well'. "Plate" is still used
# when the input table provides it; otherwise fall back to 'Well' -- confirm.
well_column = "Plate" if "Plate" in df_merge.columns else "Well"
in_clade = df_merge['Clade'] == clade_name

# For every well holding more than 10 tips of the clade, draw 10 of them at random.
seq_to_visualize_list = []
well_list = []
for idx_alphabet in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']:
    for j in range(12):
        well = idx_alphabet + str(j+1)
        # Single combined mask instead of chained df[mask1][mask2], which
        # indexed the filtered frame with a mask built from the full frame.
        df_merge_ext = df_merge[in_clade & (df_merge[well_column] == well)]
        tips_in_well = list(df_merge_ext['Tip'])
        if len(tips_in_well) > 10:
            seqnames = random.sample(tips_in_well, 10)
            seq_to_visualize_list.extend([name2seq[name] for name in seqnames])
            well_list.append(well)

In [None]:


# Integer code for a character that differs from the root; 0 is used for a match.
ATGC2int = {'A':1, 'T':2, 'G':3, 'C':4, '-':5}

# Encode every sampled sequence position by position against root_seq.
seq_table = [
    [0 if base == root_seq[pos] else ATGC2int[base] for pos, base in enumerate(seq)]
    for seq in seq_to_visualize_list
]
df_seq_table = pd.DataFrame(seq_table)

fig = plt.figure(figsize=(6, 6))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax2 = fig.add_axes([1.0, 0.1, 0.05, 0.2])
sns.heatmap(df_seq_table, cmap='terrain_r', ax=ax, cbar_ax=ax2)

ax.set_xlabel("Sequence position")
ax.set_ylabel("Sequences")
ax.set_title(clade_name)

# One tick every 10 rows, labelled with the well each group of 10 came from.
n_groups = int(len(seq_to_visualize_list) / 10)
ax.set_yticks(list(range(0, 10 * n_groups, 10)))
ax.set_yticklabels(well_list, fontsize=5)

fig.savefig(f"figures/sequence_heatmap_{clade_name}.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Compute the distance between sequences within each clade.
def _mean_sampled_hamming(seq_names, n_pairs=1000):
    """Mean Hamming distance over randomly sampled pairs of sequences.

    Draws ``n_pairs`` pairs of distinct names from ``seq_names`` with
    ``random.sample``, looks both sequences up in ``name2seq`` and counts the
    positions at which they differ (compared over the length of the first).

    Parameters
    ----------
    seq_names : list of str
        Candidate sequence names; needs at least two entries.
    n_pairs : int
        Number of random pairs to average over.

    Returns
    -------
    float
        Average number of differing positions per sampled pair.
    """
    total = 0
    for _ in range(n_pairs):
        name_a, name_b = random.sample(seq_names, 2)
        seq_a, seq_b = name2seq[name_a], name2seq[name_b]
        total += sum(seq_a[i] != seq_b[i] for i in range(len(seq_a)))
    return total / n_pairs


cladename_meanhd_list = []

# Iterate the index directly rather than through set(): set order of strings
# changes between interpreter runs, which made the seeded sampling assign
# different random pairs to each clade on every run.
for clade_name in df_clade.index.unique():
    seq_name_list = list(df_merge[df_merge['Clade']==clade_name].Tip)
    cladename_meanhd_list.append([clade_name, _mean_sampled_hamming(seq_name_list)])

# All clades pooled. This is now an independent estimate: the previous code
# kept appending to the last clade's count_list, so the reported mean mixed
# 1000 pairs from that clade with the 1000 pooled pairs.
mean_hd_all_clades = _mean_sampled_hamming(list(df_merge.Tip))

In [None]:
# Table of [clade name, mean Hamming distance], also indexed by clade name.
clade_labels = [clade for clade, _ in cladename_meanhd_list]
df_cladename_meanhd = pd.DataFrame(
    cladename_meanhd_list,
    index=clade_labels,
    columns=['Clade_', 'meanhd'],
)

In [None]:
# Expose the clade index as a regular column so it can serve as the join key,
# then attach the per-clade statistics to the mean Hamming distances.
df_clade['Clade_'] = df_clade.index
df_clade_hd = df_cladename_meanhd.merge(df_clade, on='Clade_')
df_clade_hd

In [None]:
# Scatter of the mean intra-clade Hamming distance against clade entropy.
for target in ['meanhd']:
    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    ax.scatter(df_clade_hd['Entropy'], df_clade_hd[target], alpha = 0.5, s = 1)

    ax.set_ylim(0,50)

    ax.set_xlabel("Entropy")
    ax.set_ylabel(target)
    ax.set_title("Clade size > "+str(X), fontsize = 10)

    plt.savefig("figures/Entropy_"+target+"_"+str(X)+".pdf", bbox_inches='tight')

# Compute the flag on the frame that is actually plotted and tested. The
# previous code refreshed df_clade["Mixed"] but then read df_clade_hd["Mixed"],
# which only existed as a copy made when the two frames were merged.
df_clade_hd["Mixed"] = df_clade_hd['Entropy'] > entropy_threshold

for target in ['meanhd']:

    fig = plt.figure(figsize=(2,2))
    ax = fig.add_axes([0.1,0.1,0.8,0.8])

    sns.violinplot(data = df_clade_hd, y = target, x = 'Mixed', ax = ax)
    ax.set_xlabel("Entropy > "+str(entropy_threshold))
    ax.set_ylabel(target)
    ax.set_title("Clade size > "+str(X), fontsize = 10)

    # Reference line: mean distance between random pairs drawn from all tips.
    ax.axhline(mean_hd_all_clades)

    plt.savefig("figures/"+target+"_distribution_"+str(X)+".pdf", bbox_inches='tight')

    # (A second, identical violinplot used to be drawn here after saving; it
    # only over-painted the displayed axes and has been removed.)

    # Mann-Whitney U test of meanhd between mixed and non-mixed clades.
    is_mixed = df_clade_hd["Mixed"]
    print(stats.mannwhitneyu(df_clade_hd.loc[is_mixed, 'meanhd'], df_clade_hd.loc[~is_mixed, 'meanhd']))

In [None]:
def seqdist(str1, str2):
    """Count the positions of ``str1`` at which ``str2`` holds a different character.

    Every index of ``str1`` is compared, so ``str2`` must be at least as long
    as ``str1`` (IndexError otherwise); any extra trailing characters of
    ``str2`` are ignored.
    """
    return sum(1 for pos in range(len(str1)) if str1[pos] != str2[pos])

# Keep only the parent sequences that still have more than 150 characters
# once the gap characters ("-") are removed.
parentname_list = [
    parentname
    for parentname in name2parentseq
    if len(name2parentseq[parentname].replace("-", "")) > 150
]

In [None]:
# Pairwise distance (seqdist) between all retained parent sequences.
n_parents = len(parentname_list)
dist_matrix = np.zeros((n_parents, n_parents))
for i, parent_i in enumerate(parentname_list):
    for j, parent_j in enumerate(parentname_list):
        dist_matrix[i,j] = seqdist(name2parentseq[parent_i], name2parentseq[parent_j])

df_dist_matrix = pd.DataFrame(dist_matrix, index = parentname_list, columns = parentname_list)

# Plain heat map in the original parent order, colour bar on the right.
fig = plt.figure(figsize=(100,100))

ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax2 = fig.add_axes([1.0,0.1,0.05,0.2])
sns.heatmap(df_dist_matrix, cmap = 'cividis_r', ax = ax, cbar_ax = ax2)

plt.savefig("figures/heatmap_parent_dist_matrix.pdf", bbox_inches='tight', )

plt.close()

# Hierarchically clustered version. clustermap always creates its own figure,
# so the size has to be passed to it directly: the previous
# plt.figure(figsize=(100,100)) only leaked an empty figure while the
# clustermap was saved at its default size.
cluster_grid = sns.clustermap(df_dist_matrix, cmap = 'cividis_r', figsize = (100,100))

cluster_grid.savefig("figures/clustermap_parent_dist_matrix.pdf", bbox_inches='tight')

In [None]:

cladename = "Clade3977"
# 10 random tips of the clade, each compared with every retained parent sequence.
clade_random_sample = random.sample(list(df_merge[df_merge['Clade']==cladename].Tip), 10)
name_dist_list = [
    [seqname, tipname, seqdist(name2parentseq[seqname], name2seq[tipname])]
    for seqname in parentname_list
    for tipname in clade_random_sample
]
df_name_dist           = pd.DataFrame(name_dist_list, columns = ['name', 'tipname', 'dist'])
# The well is the first "_"-separated field of the parent name.
df_name_dist['Well']   = [name.split("_")[0] for name in df_name_dist['name']]
# Average only the numeric 'dist' column: a bare .mean() also tries to average
# the string columns 'tipname' and 'Well', which raises TypeError on pandas >= 2.0.
df_name_dist_mean_tips = df_name_dist.groupby('name')[['dist']].mean()

# One bar per parent sequence (in sorted name order, as produced by groupby).
fig = plt.figure(figsize=(15,2))
ax = fig.add_axes([0.1,0.1,0.8,0.8])
ax.set_xlim(0, len(df_name_dist_mean_tips.index)+1)
ax.tick_params(axis='x',rotation=90,labelsize=3)
ax.bar(x = df_name_dist_mean_tips.index, height = df_name_dist_mean_tips['dist'])
ax.set_ylabel("Mean Hamming Distance")
ax.set_xlabel("Parental sequence")

ax.set_title(cladename)
plt.savefig("figures/parent_MHD_"+cladename+".pdf", bbox_inches='tight', )

In [None]:

cladename = "Clade3977"
# Draws a fresh random sample of 10 tips from the clade and recomputes the
# mean distance of every retained parent sequence to those tips (no plot).
clade_random_sample = random.sample(list(df_merge[df_merge['Clade']==cladename].Tip), 10)
name_dist_list = [
    [seqname, tipname, seqdist(name2parentseq[seqname], name2seq[tipname])]
    for seqname in parentname_list
    for tipname in clade_random_sample
]
df_name_dist           = pd.DataFrame(name_dist_list, columns = ['name', 'tipname', 'dist'])
# The well is the first "_"-separated field of the parent name.
df_name_dist['Well']   = [name.split("_")[0] for name in df_name_dist['name']]
# Average only the numeric 'dist' column: a bare .mean() also tries to average
# the string columns 'tipname' and 'Well', which raises TypeError on pandas >= 2.0.
df_name_dist_mean_tips = df_name_dist.groupby('name')[['dist']].mean()

In [None]:
threshold = 6

# One row per high-entropy clade; each row holds the mean distance from 10
# random tips of that clade to every retained parent sequence.
dist_matrix = []

# unique() keeps first-appearance order. The previous list(set(...)) order
# changed between interpreter runs, so the seeded sampling below was not
# reproducible.
messy_clade_list = list(df_merge.loc[df_merge['Entropy'] > threshold, 'Clade'].unique())

for cladename in messy_clade_list:

    clade_random_sample = random.sample(list(df_merge[df_merge['Clade']==cladename].Tip), 10)
    name_dist_list = [
        [seqname, tipname, seqdist(name2parentseq[seqname], name2seq[tipname])]
        for seqname in parentname_list
        for tipname in clade_random_sample
    ]
    df_name_dist           = pd.DataFrame(name_dist_list, columns = ['name', 'tipname', 'dist'])
    # The well is the first "_"-separated field of the parent name.
    df_name_dist['Well']   = [name.split("_")[0] for name in df_name_dist['name']]
    # Average only the numeric 'dist' column (string columns make a bare
    # .mean() raise on pandas >= 2.0), then restore parentname_list order:
    # groupby sorts by name, which left the values out of step with the
    # parentname_list column labels applied when the matrix is turned into a
    # DataFrame.
    df_name_dist_mean_tips = (
        df_name_dist.groupby('name')[['dist']].mean().reindex(parentname_list)
    )
    dist_matrix.append(list(df_name_dist_mean_tips['dist']))

In [None]:
# Rows: high-entropy clades; columns: retained parent sequences.
df_clade_parent = pd.DataFrame(
    data=dist_matrix,
    index=messy_clade_list,
    columns=parentname_list,
)

fig = plt.figure(figsize=(50, 50))
# Main heat-map axes first, then the narrow colour-bar axes to its right.
ax, ax2 = (
    fig.add_axes(rect)
    for rect in ([0.1, 0.1, 0.8, 0.8], [1.0, 0.1, 0.05, 0.2])
)
sns.heatmap(df_clade_parent, cmap='cividis_r', ax=ax, cbar_ax=ax2)
fig.savefig("figures/messyclade_parent.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Compute the distance between E4_1 and every sequence.

# Looked up once instead of on every loop iteration.
e4_1_seq = name2parentseq["E4_1"]

name_dist_list = []
name2seq = {}
# The context manager closes the gzip handle after parsing (it was previously
# left open).
with gzip.open("/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_idenken/fractalin.bcat.fa.gz", 'rt') as fractal_handle:
    for record in SeqIO.parse(fractal_handle, 'fasta'):
        name2seq[record.name] = str(record.seq)
        if (record.name != "root"):
            seq  = name2seq[record.name]
            name_dist_list.append([record.name, seqdist(seq, e4_1_seq)])

In [None]:
# Per-tip distance to E4_1, then inner-joined onto the main per-tip table.
df_name_dist_E4_1 = pd.DataFrame(data=name_dist_list, columns=['Tip', 'Distance_to_E4_1'])
df_name_dist_E4_1_merge = df_merge.merge(df_name_dist_E4_1, on='Tip')

In [None]:
# The well id is the 4th "_"-separated field of the tip name.
df_merge['Well'] = [tip.split("_")[3] for tip in df_merge['Tip']]

# Well of every row of the frame that is actually filtered below. The previous
# code indexed df_name_dist_E4_1_merge with a mask built from df_merge, which
# selects the wrong rows (or fails) whenever the two frames are not
# row-for-row identical.
e4_1_wells = df_name_dist_E4_1_merge['Tip'].map(lambda tip: tip.split("_")[3])

# Per-well histograms (wells A1 ... H12) of the distance to E4_1, with the
# Entropy > 1 subset overlaid.
fig = plt.figure(figsize=(2,1))
for i, idx_alphabet in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
    for j in range(12):
        well = idx_alphabet + str(j+1)

        df_merge_ext = df_name_dist_E4_1_merge[e4_1_wells == well]
        ax = fig.add_axes([0.1+i*1.2,0.1+j*1.4,0.8,0.8])

        ax.hist(df_merge_ext['Distance_to_E4_1'], range = (0, 50), bins = 51)

        df_merge_ext_messy = df_merge_ext[df_merge_ext['Entropy'] > 1]

        ax.hist(df_merge_ext_messy['Distance_to_E4_1'], range = (0, 50), bins = 51)

        ax.set_title(well, fontsize = 10)
        if (j == 0): ax.set_xlabel("Distance to E4_1")
        if (i == 0): ax.set_ylabel("# seq")
        ax.set_xlim(0,50)
plt.savefig("figures/Dist_to_E4_1_wells.pdf", bbox_inches='tight')
plt.close()

In [None]:
# Parse the Newick tree file with Bio.Phylo.
tree_newick_path = "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_yusuke/FRACTALout.ext.nwk"
tree = Phylo.read(tree_newick_path, 'newick')


In [None]:
# Names of all terminal nodes of the tree, as a set for fast membership tests.
old_tip_names = {terminal.name for terminal in tree.get_terminals()}

In [None]:
# Header-less two-column table: tip name in this dataset and the name stored as 'Old_Tip'.
df_old_tips = pd.read_table(
    "/Users/nk/Documents/backupped/Research/YachieLabLocal/FRACTAL/data/NK_0150/from_yusuke/col1Merged.col2Previous.tsv",
    names=["Tip", "Old_Tip"],
)

In [None]:
# True where the old tip name is one of the tree's terminal names.
df_old_tips["included"] = [old_name in old_tip_names for old_name in df_old_tips["Old_Tip"]]

In [None]:
# Attach the old tip name and its 'included' flag to the main per-tip table.
df_old_tips_included = df_merge.merge(df_old_tips, on='Tip')
df_old_tips_included

In [None]:
# Entropy distribution of tips split by the 'included' flag.
fig = plt.figure(figsize=(2, 2))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

sns.violinplot(x='included', y='Entropy', data=df_old_tips_included, ax=ax)
ax.set_xlabel("Included in the previous version")
ax.set_ylabel("Entropy")
fig.savefig("figures/tree_included.pdf", bbox_inches='tight')

In [None]:
# Column totals of the numeric and boolean columns (for 'included' this is the
# number of True rows). numeric_only=True avoids "summing" the string columns,
# which concatenated every tip name into one huge string.
df_old_tips_included.sum(numeric_only=True)