In [1]:
#!pip install dendropy
#!/usr/bin/python
import dendropy
from pandas import Series, DataFrame
import numpy as np
import function_1

In [2]:
#import dendropy
#from pandas import Series, DataFrame
#import numpy as np


def make_distance_matrix(distances, sort=False):
    """Turn a long-format pairwise distance table into a square matrix.

    Parameters
    ----------
    distances : DataFrame
        Table that, once ``reset_index()`` is applied, exposes the columns
        ``0`` and ``1`` (the two labels of each pair) and ``'distance'``.
    sort : bool, default False
        When True, rows and columns are ordered by label.

    Returns
    -------
    DataFrame
        Matrix with the labels on both axes. Known values are mirrored across
        the diagonal; cells that are still undefined afterwards are set to 0.
    """
    long_table = distances.reset_index()
    matrix = long_table.pivot_table(index=0, columns=1, values='distance')
    matrix.index.name = None
    matrix.columns.name = None
    # Fill holes from the transposed cell wherever that cell exists.
    matrix = matrix.fillna(matrix.T)

    # Labels present only as rows get a matching column ...
    row_only = [label for label in matrix.index if label not in matrix.columns]
    for label in row_only:
        matrix[label] = matrix.loc[label]

    # ... and labels present only as columns get a matching row.
    column_only = [label for label in matrix.columns if label not in matrix.index]
    for label in column_only:
        matrix.loc[label] = matrix[label]

    if not sort:
        return matrix.fillna(0)
    return matrix.sort_index().sort_index(axis=1).fillna(0)



def d(start, end, distances):
    """Return the 'distance' value stored for the ordered pair (start, end).

    The pair is looked up exactly in the order given; `distances` must be
    indexed by (start, end) pairs and carry a 'distance' column.
    """
    pair = (start, end)
    return distances.loc[pair, 'distance']



def get_distance_dataframe(tree):
    """Tabulate the pairwise distances reported by a tree's distance matrix.

    Parameters
    ----------
    tree : object
        Must provide ``phylogenetic_distance_matrix()``; the returned matrix
        must provide ``distinct_taxon_pair_iter()`` and ``distances()``, which
        are paired up positionally.

    Returns
    -------
    DataFrame
        One row per taxon pair, indexed by the two taxon labels (index levels
        named 0 and 1), with a single 'distance' column.
    """
    pdm = tree.phylogenetic_distance_matrix()

    rows = []
    for (first, second), distance in zip(pdm.distinct_taxon_pair_iter(), pdm.distances()):
        rows.append([first, second, distance])
    table = DataFrame(rows)

    # Swap the taxon objects in both pair columns for their labels.
    for position in (0, 1):
        table[position] = table[position].map(lambda taxon: taxon.label)

    table = table.rename(columns={2: 'distance'})
    return table.set_index([0, 1])


def get_distance_topology(tree):
    """Tabulate the number of edges separating every pair of taxa in a tree.

    Parameters
    ----------
    tree : object
        Must provide ``phylogenetic_distance_matrix()``; the returned matrix
        must provide ``distinct_taxon_pair_iter()`` and
        ``path_edge_count(taxon1=..., taxon2=...)``.

    Returns
    -------
    DataFrame
        One row per taxon pair, indexed by the two taxon labels (index levels
        named 0 and 1), with a single 'distance' column holding the edge
        count. A tree that yields no pairs gives an empty table with the same
        layout.
    """
    pdm = tree.phylogenetic_distance_matrix()

    # Only the edge counts are needed, so they are queried directly for each
    # pair instead of first computing the weighted distances and then
    # overwriting them.
    rows = [
        [first.label, second.label, pdm.path_edge_count(taxon1=first, taxon2=second)]
        for first, second in pdm.distinct_taxon_pair_iter()
    ]

    # Naming the columns up front keeps the layout well-defined even when
    # there are no pairs at all.
    table = DataFrame(rows, columns=[0, 1, 'distance'])
    return table.set_index([0, 1])



def get_ratio(distances1, distances2, common_leaves=None):
    """Ratio of the summed pairwise distances of two matrices over shared leaves.

    Parameters
    ----------
    distances1, distances2 : DataFrame
        Square distance matrices labelled by leaf on both axes.
    common_leaves : list-like of labels, optional
        Leaves to restrict both matrices to. When omitted, the notebook-level
        variable ``common_leaves`` is used, as earlier versions of this
        function did.

    Returns
    -------
    float
        ``total1 / total2`` where each total is half of the sum of the
        selected sub-matrix (each pair appears twice in a symmetric matrix).
        A zero total for `distances2` is not guarded against.

    Raises
    ------
    NameError
        If `common_leaves` is not passed and no notebook-level variable of
        that name exists.
    """
    if common_leaves is None:
        # Backward compatibility: callers used to rely on a global
        # `common_leaves` instead of passing the labels explicitly.
        if 'common_leaves' not in globals():
            raise NameError("name 'common_leaves' is not defined")
        common_leaves = globals()['common_leaves']

    total1 = distances1.loc[common_leaves, common_leaves].sum().sum() / 2
    total2 = distances2.loc[common_leaves, common_leaves].sum().sum() / 2
    return total1 / total2

def noisify_distances(distances):
    """Scale every pairwise distance by a small symmetric random factor.

    Each off-diagonal pair (i, j) is multiplied by one factor drawn from
    ``1 + U[0, 1) / 10`` and rounded to two decimals, and the same factor is
    used for (j, i). Diagonal entries are multiplied by exactly 1, so they are
    left unchanged.

    Parameters
    ----------
    distances : DataFrame or ndarray
        Square distance matrix.

    Returns
    -------
    Same type as the product ``distances * noise``.
    """
    # k=1 keeps only the strict upper triangle; including the diagonal here
    # would make it count twice once the transpose is added.
    upper = np.triu(1 + np.random.random(distances.shape) / 10, k=1).round(2)
    noise = upper + upper.T
    np.fill_diagonal(noise, 1.0)
    return distances * noise


#def temp_paralog(taxalist):
    


def get_paralogs(taxa_list1, taxa_list2):
    """Split species into paralogous ones and single-copy ones shared by two trees.

    Every leaf label must have the form "<species> <gene>", separated by a
    single space; only the first two fields are used. A label without a space
    raises IndexError.
    TODO: validate the label format when the program is launched, since not
    every input is guaranteed to separate species and gene this way.

    A species is reported as
    * paralogous when it has at least two copies in `taxa_list1`, or exactly
      one copy in `taxa_list1` and at least two in `taxa_list2`;
    * a common leaf when it has exactly one copy in each list.
    Species matching neither rule are left out of both results.

    Parameters
    ----------
    taxa_list1, taxa_list2 : iterable of str
        Leaf labels of the first and second tree.

    Returns
    -------
    (dict, dict)
        ``(paralogs, common_leaves)``; each maps a species to the list of its
        gene identifiers, those from `taxa_list1` first, then those from
        `taxa_list2`.
    """
    def split_label(label):
        # Both fields are read before anything is recorded for the label.
        fields = label.split(' ')
        return fields[0], fields[1]

    genes_by_species = {}
    copies_in_1 = {}
    copies_in_2 = {}
    paralog_species = []

    for label in taxa_list1:
        species, gene = split_label(label)
        genes_by_species.setdefault(species, []).append(gene)
        copies_in_1[species] = copies_in_1.get(species, 0) + 1
        # Record the species once, when its second copy shows up. Counting
        # (rather than toggling membership) keeps a third copy from making
        # the species look single-copy again.
        if copies_in_1[species] == 2:
            paralog_species.append(species)

    for label in taxa_list2:
        species, gene = split_label(label)
        genes_by_species.setdefault(species, []).append(gene)
        copies_in_2[species] = copies_in_2.get(species, 0) + 1
        # Single-copy in tree 1 but duplicated in tree 2.
        if copies_in_1.get(species, 0) == 1 and copies_in_2[species] == 2:
            paralog_species.append(species)

    paralogs = {species: genes_by_species[species] for species in paralog_species}
    common_leaves = {
        species: genes_by_species[species]
        for species, copies in copies_in_2.items()
        if copies == 1 and copies_in_1.get(species, 0) == 1
    }
    return paralogs, common_leaves
            

In [3]:
def get_paralogs_variant(taxa1, taxa2, common_leaves, paralogs):
    """Build per-tree leaf tables and classify the species shared by both trees.

    Every label is split on single spaces; the first field is the species and
    the second the gene identifier.

    Parameters
    ----------
    taxa1, taxa2 : iterable of str
        Leaf labels of the first and second tree.
    common_leaves : list
        Extended in place with the shared species that occur exactly once in
        each tree.
    paralogs : list
        Extended in place with every other shared species.

    Returns
    -------
    (dict, dict)
        One table per tree with the keys "Specie", "Identifier", "Alias"
        (empty strings) and "Distance" (zeros), one entry per label.
    """
    def tabulate(labels):
        table = {"Specie":[], "Identifier":[], "Alias":[], "Distance":[]}
        for label in labels:
            fields = label.split(' ')
            species, gene = fields[0], fields[1]
            for key, value in (("Specie", species), ("Identifier", gene),
                               ("Alias", ''), ("Distance", 0)):
                table[key].append(value)
        return table

    dict_T1 = tabulate(taxa1)
    dict_T2 = tabulate(taxa2)

    species_1 = dict_T1["Specie"]
    species_2 = dict_T2["Specie"]
    # np.intersect1d yields the shared species once each, in sorted order.
    for species in np.intersect1d(species_1, species_2):
        single_copy = species_1.count(species) == 1 and species_2.count(species) == 1
        destination = common_leaves if single_copy else paralogs
        destination.append(species)

    return dict_T1, dict_T2

In [4]:
# Load the two trees to compare from Newick files in the working directory,
# keeping their edge lengths (suppress_edge_lengths=False).
tree1 = dendropy.Tree.get(path='sim_tree_TOPO1', schema='newick', suppress_edge_lengths = False)
# Change `schema` if the files are stored in a format other than Newick.
tree2 = dendropy.Tree.get(path='sim_tree_TOPO2', schema='newick', suppress_edge_lengths = False)



In [5]:
# Print a text rendering of tree1 as a quick visual check of what was loaded.
print(tree1.as_ascii_plot())
# Scratch experiments, left commented out (not executed):
#essai = tree.phylogenetic_distance_matrix().distinct_taxon_pair_iter()
#essai
#tree.taxon()

                                                /------------------------ 9 b 
                        /-----------------------+                             
/-----------------------+                       \------------------------ 50 x
|                       |                                                     
|                       \------------------------------------------------ 12 b
+                                                                             
|                                               /------------------------ 9 a 
|                       /-----------------------+                             
|                       |                       \------------------------ 12 a
\-----------------------+                                                     
                        |                       /------------------------ 67 x
                        \-----------------------+                             
                                                \---

In [6]:
# Edge-count distance between every pair of leaves, computed for each tree.
distances1 = get_distance_topology(tree1)
distances2 = get_distance_topology(tree2)
# Uncomment to display either table:
#distances1
#distances2

In [8]:
# Display the pairwise distance table obtained for tree1.
distances1

Unnamed: 0_level_0,Unnamed: 1_level_0,distance
0,1,Unnamed: 2_level_1
50 x,12 a,6
9 b,50 x,2
9 a,12 a,2
9 a,67 x,4
12 a,42 x,4
9 b,12 a,6
12 b,9 a,5
9 b,42 x,6
12 b,42 x,5
50 x,9 a,6


In [10]:
# Square, label-sorted distance matrix for tree1; its column index lists the
# leaf labels, the first of which is kept separately in `taxon`.
test = make_distance_matrix(distances1, sort=True)
taxa = test.columns
taxon = taxa[0]
taxa

Index(['12 a', '12 b', '42 x', '50 x', '67 x', '9 a', '9 b'], dtype='object')

In [53]:
# NOTE(review): the two empty dicts assigned here are overwritten by the call
# below, so these initial values are never used.
common_leaves = {}
CL = []
paralogs = {}
para = []
# get_paralogs_variant returns the two per-tree tables and appends the species
# classification to CL (single copy in both inputs) and para (all other shared
# species) in place. After this line `paralogs` therefore holds the table built
# from the first argument and `common_leaves` the table built from the second;
# the classified species themselves are in CL and para.
# NOTE(review): `taxa` (the leaf labels of tree1) is passed for both inputs, so
# tree1 is compared with itself — confirm this is intended.
paralogs, common_leaves = get_paralogs_variant(taxa, taxa, CL, para)
#common_leaves = common_leaves[:5]

#print(common_leaves)
paralogs