In [1]:
# pip install ete3

In [2]:
from ete3 import Tree 
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt


# python tree_to_graph.py C:\Users\harsh\s\PrimConsTree\datasets\simulated\Trex_trees20.txt

In [1]:
sets = [frozenset({"specie 1", "specie 2", "specie 3"}), frozenset({"specie 2", "specie 3", "specie 1"}), frozenset({"specie 1", "specie 2"}), frozenset({"specie 2", "specie 1"}), frozenset({"specie 12", "specie 3"})]
 
# option 1
for s in sets:
   print(s, "mapped to", s.__hash__())
 
# option 2
mapping = dict()
for s in sets:
   if s in mapping:
       print(str(s), "already mapped to", mapping[s])
   else:
       mapping[s] = len(mapping)
       print(str(s), "mapped to", mapping[s])

frozenset({'specie 2', 'specie 1', 'specie 3'}) mapped to -6338323216877714290
frozenset({'specie 2', 'specie 1', 'specie 3'}) mapped to -6338323216877714290
frozenset({'specie 2', 'specie 1'}) mapped to -3544678295479905766
frozenset({'specie 2', 'specie 1'}) mapped to -3544678295479905766
frozenset({'specie 3', 'specie 12'}) mapped to 2745075874086891383
frozenset({'specie 2', 'specie 1', 'specie 3'}) mapped to 0
frozenset({'specie 2', 'specie 1', 'specie 3'}) already mapped to 0
frozenset({'specie 2', 'specie 1'}) mapped to 1
frozenset({'specie 2', 'specie 1'}) already mapped to 1
frozenset({'specie 3', 'specie 12'}) mapped to 2


### Algorithm 1: Supergraph Construction 

#### First step: define Tree class with leaf nodes and internal nodes: 

In [3]:
def read_tree(input_file: str) -> Tree:
    """Read the tree from the input file in Newick format using the ete3 library.

    Args:
        input_file (str): Path to the input file.

    Returns:
        Tree: The tree object.
    """
    with open(input_file, 'r') as file:
        # Read the tree from the file
        tree = Tree(file.readline().strip(), format=1)
    return tree

class TreeStructure:
    def __init__(self, tree: Tree):
        self.tree = tree
        self.leaf_nodes = self.identify_nodes()
        self.internal_nodes = self.update_internal_node_names()

    def identify_nodes(self) -> list:
        """Identify leaf nodes in the tree.

        Returns:
            list: List of leaf nodes in the tree.
        """
        leaf_nodes = []
        # Use ete3 Tree.traverse() method to traverse the tree in postorder: children before parents
        for node in self.tree.traverse("postorder"):
            if node.is_leaf():
                leaf_nodes.append(node.name)
        return leaf_nodes

    def update_internal_node_names(self) -> list:
        """Update the internal node names in the tree by concatenating the names of their children.

        Returns:
            list: List of internal nodes in the tree.
        """
        internal_nodes = []

        def generate_unique_name(name: str) -> str:
            """Generate a unique name for an internal node by adding a suffix.

            Args:
                name (str): The name of the internal node.

            Returns:
                str: The unique name of the internal node with a suffix.
            """
            suffix = 1
            while name + str(suffix) in self.leaf_nodes:
                suffix += 1
            return name + str(suffix)

        def update_internal_node(node):
            """Update the name of an internal node by concatenating the names of its children.

            Args:
                node (TreeNode): The internal node to update.
            """
            if not node.is_leaf():
                children_names = sorted([child.name for child in node.children])
                new_name = ''.join(children_names)
                if new_name in self.leaf_nodes:
                    new_name = generate_unique_name(new_name)
                node.name = new_name
                internal_nodes.append(node.name)

        for node in self.tree.traverse("postorder"):
            update_internal_node(node)

        return internal_nodes
    
input_file = r'C:\Users\harsh\s\PrimConsTree\datasets\simulated\trex_treestest.txt'
input_tree = read_tree(input_file)
tree_structure = TreeStructure(input_tree)
print(tree_structure.leaf_nodes)
print(tree_structure.internal_nodes)
print(input_tree)

['4', '1', '2', '3']
['23', '123', '1234']

   /-4
--|
  |   /-1
   \-|
     |   /-2
      \-|
         \-3


In [4]:
# empty list for nodes:
vertices = []
vertices_no = 0
# Would it be better to use a matrix to represent the graph?
graph = []
edge_frequencies = defaultdict(int)
all_internal_nodes = []
in_degrees = []

def identify_nodes(tree):
    leaf_nodes = []
    for node in tree.traverse("postorder"):
        if node.is_leaf():
            leaf_nodes.append(node.name)
    return leaf_nodes


def add_vertex(v):
    global graph
    global vertices_no
    global vertices
    global in_degrees
    if v not in vertices:
      vertices_no = vertices_no + 1
      vertices.append(v)
      in_degrees.append(0)
      if vertices_no > 1:
          for vertex in graph:
              vertex.append(0)
      graph.append([0] * vertices_no)


def add_edge(v1, v2, e):
    global graph
    global vertices_no
    global vertices
    global in_degrees
    # edge_frequencies[(v1, v2)] += 1
    # edge_frequencies[(v2, v1)] += 1  # Assuming an undirected graph
    if v1 <= v2:
        edge_frequencies[(v1, v2)] += 1
    else:
        edge_frequencies[(v2, v1)] += 1

    index1 = vertices.index(v1)
    index2 = vertices.index(v2)
    # Check if an edge already exists between v1 and v2
    if graph[index1][index2] != 0:
        graph[index1][index2] = graph[index1][index2] + e
        in_degrees[index2] += 1
        # print(v1," to ", v2, " more than once\n\n")
    else:
        graph[index1][index2] = e
        in_degrees[index2] += 1
        # print(v1," to ", v2, "\n\n")
    # print("graph is now as: ",graph)


def update_internal_node_names(tree, leaf_nodes):
    internal_nodes = []
    def generate_unique_name(name):
        # Generate a unique name by adding a suffix
        suffix = 1
        while name + str(suffix) in leaf_nodes:
            suffix += 1
        return name + str(suffix)

    def update_internal_node(node):
        if not node.is_leaf():
            children_names = sorted([child.name for child in node.children])
            new_name = ''.join(children_names)
            new_name = ''.join(sorted(new_name))
            if new_name in leaf_nodes:
                new_name = generate_unique_name(new_name)
            node.name = new_name
            internal_nodes.append(node.name)
            if node.name not in all_internal_nodes:
              all_internal_nodes.append(node.name)

    for node in tree.traverse("postorder"):
        update_internal_node(node)
    return internal_nodes


def check_direct_connection(tree, v1, v2):
    # Find the nodes by name
    node1 = tree.search_nodes(name=v1)
    node2 = tree.search_nodes(name=v2)
    # print("\n", v1, node1,node1[0], " and ",v2,node2,"\n")
    if not node1 or not node2:
        print("they does not exits")
        return False  # One of the nodes does not exist

    node1 = node1[0]
    node2 = node2[0]
    #print("\ncheck_direct_connection :", node1, "-", node1[0],"\n")
    # Check if node1 is the parent of node2
    if node2.up == node1: # or node1.up == node2
        # print()
        # print(node1, "is parent of ",node2)
        # print()
        return True
    return False


def calculate_distances(tree):
    distances = {}
    all_nodes = list(tree.traverse())  # Collect all nodes
    for i in range(len(all_nodes)):
        for j in range(i + 1, len(all_nodes)):
            node1 = all_nodes[i]
            node2 = all_nodes[j]
            distance = node1.get_distance(node2)  # Calculate distance using the tree library function
            distance = round(distance, 2)
            distances[(node1.name, node2.name)] = distance
            distances[(node2.name, node1.name)] = distance  # Optionally, for undirected graphs
    return distances


def tree_to_graph(tree):
    # Get a list of all unique nodes (leaves and internal nodes)
    # print(tree)
    unique_nodes = set(tree.iter_leaves())
    unique_nodes.update(tree.traverse("preorder"))

    # Create a matrix to represent the distances
    node_names = [node.name for node in unique_nodes]  # Define node_names here
    node_names.sort(key=lambda x: (tree & x).get_distance(tree))
    print("node_names : ",node_names)

    distances = calculate_distances(tree)

    # add vertex
    for node in node_names:
        add_vertex(node)

    # add edge
    for i in range(0, len(node_names)):
      for j in range(i + 1, len(node_names)):
        v1 = node_names[i]
        v2 = node_names[j]
        if check_direct_connection(tree, v1,v2):
          distance = round(distances.get((v1, v2), 0), 2)
          #print("\n\nTree nodes conection : ",v1,"-",v2,"-",distance,"\n")
          add_edge(v1, v2, distance)


input_file = r'C:\Users\harsh\s\PrimConsTree\datasets\simulated\trex_treestest.txt'
with open(input_file, 'r') as file:
    for idx, line in enumerate(file, 1):
        tree = Tree(line.strip(), format=1)
        leaf_nodes = identify_nodes(tree)
        # print(leaf_nodes)
        internal_nodes = update_internal_node_names(tree,leaf_nodes)
        # print(internal_nodes)
        tree_to_graph(tree)


print("Combined Graph In-degrees:", in_degrees)

# print('\nnumber of internal nodes:',len(internal_nodes))
print('\n number of all internal nodes:',len(all_internal_nodes))
print('\n all internal nodes:',all_internal_nodes)
print('\n all vertices:',vertices)
print('\n number of vertices:',vertices_no)

for i in range(vertices_no):
  print('vertices',vertices[i],end="-")
  print('in degree', in_degrees[i])

# Extract unique node names
nodes = sorted(set(node for edge in edge_frequencies for node in edge))
num_nodes = len(nodes)
print('\n nodes:',nodes)
print('\n number of nodes:',len(nodes))

# Populate the frequency matrix with the values from the dictionary
# frequency_matrix = [[0] * num_nodes for _ in range(num_nodes)]
# for i in range(num_nodes):
#     for j in range(num_nodes):
#         node_i, node_j = nodes[i], nodes[j]
#         if (node_i, node_j) in edge_frequencies:
#             frequency_matrix[i][j] = edge_frequencies[(node_i, node_j)]
#         elif (node_j, node_i) in edge_frequencies:
#             frequency_matrix[i][j] = edge_frequencies[(node_j, node_i)]  # Assuming an undirected graph

# frequency_matrix = [[0] * num_nodes for _ in range(num_nodes)]
# for i in range(num_nodes):
#     for j in range(num_nodes):
#         node_i, node_j = nodes[i], nodes[j]
#         if node_i <= node_j:
#             freq = edge_frequencies[(node_i, node_j)]
#             frequency_matrix[i][j] = frequency_matrix[j][i] = freq
#         else:
#             freq = edge_frequencies[(node_j, node_i)]
#             frequency_matrix[i][j] = frequency_matrix[j][i] = freq

#         # Print the node names and the frequency between them
#         if i <= j:  # To avoid printing the same pair twice in an undirected graph
#             print(f"Frequency between {node_i} and {node_j}: {freq}")


# Generate nodes list and initialize matrix
nodes = sorted(set(node for edge in edge_frequencies for node in edge))
num_nodes = len(nodes)
print('\nNodes:', nodes)
print('\nNumber of nodes:', num_nodes)

# Initialize the frequency matrix
frequency_matrix = [[0] * num_nodes for _ in range(num_nodes)]

# Mapping node names to indices
node_index_map = {node: index for index, node in enumerate(nodes)}

# Populate the frequency matrix
for (node1, node2), freq in edge_frequencies.items():
    index1 = node_index_map[node1]
    index2 = node_index_map[node2]
    # Since it's an undirected graph, symmetrize the entries
    frequency_matrix[index1][index2] = frequency_matrix[index2][index1] = freq

# Print frequencies between each pair
for i in range(num_nodes):
    for j in range(num_nodes):  # Start from i to avoid duplicating prints
      if frequency_matrix[i][j] != 0:
        print(f"Frequency between {nodes[i]} and {nodes[j]}: {frequency_matrix[i][j]}")


for i in range(vertices_no):
  for j in range(vertices_no):
    if graph[i][j] != 0 and frequency_matrix[i][j] != 0:
       graph[i][j] = round(graph[i][j] / frequency_matrix[i][j], 2)
       #print("Hi ",graph[i][j])

print("frequency_matrix : ",frequency_matrix)
for i in graph:
  print(i)

node_names :  ['1234', '123', '4', '23', '1', '2', '3']
node_names :  ['1234', '123', '4', '2', '13', '1', '3']
node_names :  ['1234', '234', '1', '23', '4', '2', '3']
Combined Graph In-degrees: [0, 2, 3, 2, 3, 3, 3, 1, 1]

 number of all internal nodes: 5

 all internal nodes: ['23', '123', '1234', '13', '234']

 all vertices: ['1234', '123', '4', '23', '1', '2', '3', '13', '234']

 number of vertices: 9
vertices 1234-in degree 0
vertices 123-in degree 2
vertices 4-in degree 3
vertices 23-in degree 2
vertices 1-in degree 3
vertices 2-in degree 3
vertices 3-in degree 3
vertices 13-in degree 1
vertices 234-in degree 1

 nodes: ['1', '123', '1234', '13', '2', '23', '234', '3', '4']

 number of nodes: 9

Nodes: ['1', '123', '1234', '13', '2', '23', '234', '3', '4']

Number of nodes: 9
Frequency between 1 and 123: 1
Frequency between 1 and 1234: 1
Frequency between 1 and 13: 1
Frequency between 123 and 1: 1
Frequency between 123 and 1234: 2
Frequency between 123 and 13: 1
Frequency between

In [5]:
print("Leaf nodes:", leaf_nodes)
for leaf in leaf_nodes:
    leaf_index = node_index_map[leaf]
    print(leaf)
    for i in all_internal_nodes:
        i_index = node_index_map[i]
        distance = graph[leaf_index][i_index]
        print(i,"-",distance)

Leaf nodes: ['1', '4', '2', '3']
1
23 - 0
123 - 1.0
1234 - 1.0
13 - 0
234 - 0
4
23 - 0
123 - 0
1234 - 0.25
13 - 0.5
234 - 0
2
23 - 0
123 - 0
1234 - 0
13 - 0
234 - 0
3
23 - 0
123 - 0
1234 - 0
13 - 0
234 - 0.5


In [6]:
def modified_prims(graph, frequencies, internal_nodes, degrees):
    num_nodes = len(graph)
    print("graph len: ", num_nodes)
    mst = []
    in_mst = [False] * num_nodes

    # Create a mapping from node names to indices
    node_name_to_index = {name: index for index, name in enumerate(vertices)}

    # Debugging: Print the mappings and input lists
    print("Vertices:", vertices)
    print("Internal Nodes:", internal_nodes)
    print("Node Name to Index Mapping:", node_name_to_index)

    # Start from the first node of interest.
    start_node = internal_nodes[3]
    print("start_node", start_node)
    # Handle case where start_node might not be in the mapping
    if start_node not in node_name_to_index:
        print(f"Start node '{start_node}' not found in node_name_to_index mapping.")
        return []  # or handle this case as needed

    # Convert start_node from a name to an index
    start_node_index = node_name_to_index[start_node]
    print("start_node_index", start_node_index)
    # Use this index in your MST algorithm
    in_mst[start_node_index] = True

    while len(mst) < len(internal_nodes) - 1:
        max_degree = float('-inf')
        max_frequency = float('-inf')
        min_branch_length = float('inf')
        chosen_edge = (None, None, float('inf'))  # Initialize with placeholder values

        for node_name in internal_nodes:
            node_index = node_name_to_index[node_name]
            if in_mst[node_index]:
                for neighbor_index, edge_weight in enumerate(graph[node_index]):
                  if edge_weight >= 0:
                    neighbor_name = vertices[neighbor_index]  # Get the name of the neighbor
                    if neighbor_name in internal_nodes and not in_mst[neighbor_index]:
                        frequency = frequencies[node_index][neighbor_index]
                        # if frequency > 0:
                        degree = degrees[neighbor_index]
                        # branch_length = graph [node_index][neighbor_index]

                        # if frequency > max_frequency:
                        #     max_frequency = frequency
                        #     chosen_edge = (node_index, neighbor_index, edge_weight)
                        # elif frequency == max_frequency and degree > max_degree:
                        #     max_degree = degree
                        #     max_frequency = frequency
                        #     chosen_edge = (node_index, neighbor_index, edge_weight)
                        # elif frequency == max_frequency and degree == max_degree and edge_weight < min_branch_length:
                        #     max_degree = degree
                        #     max_frequency = frequency
                        #     min_branch_length = edge_weight
                        #     chosen_edge = (node_index, neighbor_index, edge_weight)
                        # Update chosen edge based on a clear priority of criteria
                        if (frequency > max_frequency or
                            (frequency == max_frequency and degree > max_degree) or
                            (frequency == max_frequency and degree == max_degree and edge_weight < min_branch_length)):
                            chosen_edge = (node_index, neighbor_index, edge_weight)
                            max_degree = degree
                            max_frequency = frequency
                            min_branch_length = edge_weight



        # After the loop, update in_mst and mst using indices
        print("\n\nChosen edge: ", chosen_edge)
        # print("Type of chosen_edge[1]:", type(chosen_edge[1]))
        if chosen_edge[0] is not None and chosen_edge[1] is not None:
            # Use index for in_mst
            in_mst[chosen_edge[1]] = True
            # Convert indices back to names for mst
            # mst.append((vertices[chosen_edge[0]], vertices[chosen_edge[1]]))
            mst.append((vertices[chosen_edge[0]], vertices[chosen_edge[1]], chosen_edge[2]))

    return mst



def draw_mst(mst_G):
    # Draw the graph
    pos = nx.spring_layout(mst_G)  # Layout algorithm (you can choose another one)

    # Increase the figure size (adjust the width and height as needed)
    plt.figure(figsize=(15, 15))

    # Customize node and edge visual properties for clarity
    nx.draw(mst_G, pos, with_labels=True, node_size=30, node_color='skyblue', font_size=8, font_color='black', edge_color='gray', width=0.8)

    # Display the graph
    plt.axis('off')
    plt.show()



# Call the modified_prims function with the specified nodes
mst_result = modified_prims(graph, frequency_matrix, all_internal_nodes, in_degrees)
print('\nMST :',mst_result)

mst_G = nx.Graph()
for edge in mst_result:
    node1, node2, distance = edge
    mst_G.add_edge(node1, node2, weight=distance)
    # mst_G.add_edge(edge[0], edge[1])
    # print(node1, node2,'->', distance)
# draw_mst(mst_G)
print("\n\nmst_G: ", mst_G)


graph len:  9
Vertices: ['1234', '123', '4', '23', '1', '2', '3', '13', '234']
Internal Nodes: ['23', '123', '1234', '13', '234']
Node Name to Index Mapping: {'1234': 0, '123': 1, '4': 2, '23': 3, '1': 4, '2': 5, '3': 6, '13': 7, '234': 8}
start_node 13
start_node_index 7


Chosen edge:  (7, 3, 0)


Chosen edge:  (3, 1, 0)


Chosen edge:  (3, 0, 0)


Chosen edge:  (3, 8, 0)

MST : [('13', '23', 0), ('23', '123', 0), ('23', '1234', 0), ('23', '234', 0)]


mst_G:  Graph with 5 nodes and 4 edges


In [7]:
# Create a mapping from node names to indices based on the list of vertices used to construct the frequency matrix and graph
# node_name_to_index = {name: index for index, name in enumerate(sorted(vertices))}

node_index_map = {node: index for index, node in enumerate(nodes)}
print(node_index_map)
print(graph)
# Initialize the graph
G = nx.Graph()
G.add_nodes_from(nodes)  # Ensure all vertices are added to the graph

# Add edges from the MST result using the consistent index
for node1, node2, distance in mst_result:
    index1 = node_index_map[node1]
    index2 = node_index_map[node2]
    # index1 = node_name_to_index[node1]
    # index2 = node_name_to_index[node2]
    G.add_edge(node1, node2, weight=distance)
    # Update graph matrix if needed
    graph[index1][index2] = graph[index2][index1] = distance

print(all_internal_nodes)
print("Leaf nodes:", leaf_nodes)
for leaf in leaf_nodes:
    leaf_index = node_index_map[leaf]
    nearest_internal_node = None
    min_distance = float('inf')
    max_freq = -1
    # print(leaf)
    for i in all_internal_nodes:
        i_index = node_index_map[i]
        distance = graph[leaf_index][i_index]
        freq = frequency_matrix[leaf_index][i_index]
        print(f" {leaf} and {i} with distance {distance} and freq is {freq}")
        if freq > 0:  # Ensure valid, direct connection and non-zero frequency
          if freq > max_freq or (freq == max_freq and distance < min_distance):
              max_freq = freq
              min_distance = distance
              nearest_internal_node = i
          # else:
          #   print(f" no {leaf} and {nearest_internal_node} with distance {min_distance} and fres is {freq}")

    if nearest_internal_node:
        G.add_edge(leaf, nearest_internal_node, weight=min_distance)
        print(f"  Added edge between {leaf} and {nearest_internal_node} with distance {min_distance} and freq is {max_freq}")
    else:
        print(f"  No suitable connection found for {leaf}")

print("Final graph with leaves:", G)
# Optionally, print other matrices or data structures for debugging

{'1': 0, '123': 1, '1234': 2, '13': 3, '2': 4, '23': 5, '234': 6, '3': 7, '4': 8}
[[0, 1.0, 1.0, 0, 0.5, 0, 0, 0, 0.5], [0, 0, 0, 0.5, 0.5, 0.5, 0, 0.5, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1.0, 1.0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0.5, 0, 0.5, 0, 0], [0, 0, 0.25, 0.5, 0, 0, 0, 0, 0]]
['23', '123', '1234', '13', '234']
Leaf nodes: ['1', '4', '2', '3']
 1 and 23 with distance 0 and freq is 0
 1 and 123 with distance 1.0 and freq is 1
 1 and 1234 with distance 1.0 and freq is 1
 1 and 13 with distance 0 and freq is 1
 1 and 234 with distance 0 and freq is 0
  Added edge between 1 and 13 with distance 0 and freq is 1
 4 and 23 with distance 0 and freq is 0
 4 and 123 with distance 0 and freq is 0
 4 and 1234 with distance 0.25 and freq is 2
 4 and 13 with distance 0.5 and freq is 0
 4 and 234 with distance 0 and freq is 1
  Added edge between 4 and 1234 with distance 0.25 and freq is 2
 2 and 23 with di

In [8]:
# Iterate over a copy of the internal nodes list to avoid modification during iteration
for internal_node in all_internal_nodes[:]:
    # Check the neighbors of the internal node
    neighbors = list(G.neighbors(internal_node))

    # Check if the internal node is connected only to one other internal node and no leaf nodes
    if len(neighbors) == 1 and neighbors[0] in all_internal_nodes and not any(neighbor in leaf_node_set for neighbor in neighbors):
        G.remove_node(internal_node)
        all_internal_nodes.remove(internal_node)

    # New condition: if the internal node is connected to exactly one leaf node and one internal node
    elif len(neighbors) == 2 and any(neighbor in leaf_node_set for neighbor in neighbors) and any(neighbor in all_internal_nodes for neighbor in neighbors):
        # Identify the leaf node and the other internal node
        leaf_node = next(neighbor for neighbor in neighbors if neighbor in leaf_node_set)
        other_internal_node = next(neighbor for neighbor in neighbors if neighbor in all_internal_nodes)

        # Calculate the new branch length as the sum of distances from the distance matrix
        new_branch_length = graph[node_name_to_index[internal_node]][node_name_to_index[leaf_node]] + graph[node_name_to_index[internal_node]][node_name_to_index[other_internal_node]]

        # Remove the internal node
        G.remove_node(internal_node)

        # Add an edge between the leaf node and the other internal node with the new branch length
        # Note: You might need to adjust this part if your graph G does not store weights
        G.add_edge(leaf_node, other_internal_node, weight=new_branch_length)

        all_internal_nodes.remove(internal_node)  # Optional: update the internal nodes list
print("After removing G: ", G)
for i in graph:
  print(i)
# Now G contains only the internal nodes that are connected to leaf nodes or more than one internal node

NameError: name 'leaf_node_set' is not defined

In [None]:
# Print all edges and their weights
for edge in G.edges(data=True):
    node1, node2, weight = edge
    print(f"Edge: {node1} - {node2}, Weight: {weight['weight']}")

Edge: 00111111111111112222222333445566778899 - 0011111111111122222223334455667789, Weight: 0
Edge: 00111111111111112222222333445566778899 - 019, Weight: 0
Edge: 00111111111111112222222333445566778899 - 111123, Weight: 0
Edge: 001111111111111122222223334567899 - 0222, Weight: 0
Edge: 001111111111111122222223334567899 - 1111156789, Weight: 0
Edge: 001111111111111122222223334567899 - 5678, Weight: 0
Edge: 0011111111111122222223334455667789 - 111789, Weight: 0
Edge: 0011111111111122222223334455667789 - 56, Weight: 0
Edge: 0011111111111122222223334455667789 - 78, Weight: 0
Edge: 0011111111111122222223334455667789 - 011222223, Weight: 0
Edge: 01111111111111222222233344556677889 - 111123, Weight: 0
Edge: 011111111111122222233456789 - 0222, Weight: 0
Edge: 011222223 - 23, Weight: 0
Edge: 011222223 - 1, Weight: 0
Edge: 011222223 - 21, Weight: 0
Edge: 011222223 - 20, Weight: 0
Edge: 019 - 231, Weight: 0
Edge: 019 - 10, Weight: 0
Edge: 019 - 9, Weight: 0
Edge: 0222 - 231, Weight: 0
Edge: 0222 - 2

In [None]:
def networkx_to_ete3(G, root_node):
    if root_node not in G:
        raise ValueError(f"Root node '{root_node}' not found in the graph.")

    visited = set()  # To keep track of visited nodes

    def add_children(node, ete3_node):
        visited.add(node)  # Mark the current node as visited
        for child in G.neighbors(node):
            if child not in visited:
                # Create the child node
                child_node = ete3_node.add_child(name=str(child))

                # Set the branch length (edge weight)
                branch_length = G[node][child].get('weight', 1.0)  # Default to 1.0 if no weight is found
                child_node.dist = branch_length

                # Recursively add children of this node
                add_children(child, child_node)

    root = Tree(name=str(root_node))
    add_children(root_node, root)
    return root

# Example usage
root_node = '00111111111111112222222333445566778899'
ete3_tree = networkx_to_ete3(G, root_node)
print(ete3_tree.write(format=5))

((((((14:0):0,1112:0):0):0,17:0):0,56:0,((4:0.01):0):0,(23:0,1:0,21:0,20:0):0):0,(((18:0,19:0):0,(011111111111122222233456789:0,((11114789:0):0,(5:0,7:0,6:0,8:0):0):0,22:0):0,3:0,2:0):0,10:0,9:0):0,(01111111111111222222233344556677889:0,(15:0,16:0):0,11:0,12:0,13:0):0);


In [None]:
(((14:0,17:0):0,56:0,((4:0.01):0):0,(23:0,1:0,21:0,20:0):0):0,(((18:0,19:0):0,(((11114789:0):0,(5:0,7:0,6:0,8:0):0):0,22:0):0,3:0,2:0):0,10:0,9:0):0,((15:0,16:0):0,11:0,12:0,13:0):0);

In [None]:
from ete3 import Tree

renaming_dict = {
    "1": "Guangxi_Pangolin_P2V",
    "2": "Guangdong_Pangolin_P2S_2019",
    "3": "Guangdong_Pangolin_1_2019",
    "4": "RaTG13",
    "5": "Hu_Australia_VIC231_2020",
    "6": "Hu_Wuhan_2020",
    "7": "Hu_USA_UT_00346_2020",
    "8": "Hu_Italy_TE4836_2020",
    "9": "Bat_CoVZXC21",
    "10": "Bat_CoVZC45",
    "11": "SARS",
    "12": "Tor2",
    "13": "SARS-CoV_BJ182-4",
    "14": "Rs3367",
    "15": "BtCoV_273_2005",
    "16": "Rf1",
    "17": "BtCoV_279_2005",
    "18": "HKU3-12",
    "19": "HKU3-6",
    "20": "Guangxi_Pangolin_P1E",
    "21": "Guangxi_Pangolin_P4L",
    "22": "Guangxi_Pangolin_P5L",
    "23": "Guangxi_Pangolin_P5E"
}

# Function to rename leaves of the tree
def rename_leaves(tree, renaming_dict):
    for leaf in tree:
        if leaf.name in renaming_dict:
            leaf.name = renaming_dict[leaf.name]

# Example Newick format string (replace this with your actual Newick string)
newick_str = "((((((14:0):0,1112:0):0):0,17:0):0,56:0,((4:0.01):0):0,(23:0,1:0,21:0,20:0):0):0,(((18:0,19:0):0,(011111111111122222233456789:0,((11114789:0):0,(5:0,7:0,6:0,8:0):0):0,22:0):0,3:0,2:0):0,10:0,9:0):0,(01111111111111222222233344556677889:0,(15:0,16:0):0,11:0,12:0,13:0):0);"

# Load the tree
tree = Tree(newick_str, format=1)

# Rename the leaves
rename_leaves(tree, renaming_dict)

# Print the new Newick format string
print(tree.write(format=3))

((((((Rs3367:0)NoName:0,1112:0)NoName:0)NoName:0,BtCoV_279_2005:0)NoName:0,56:0,((RaTG13:0.01)NoName:0)NoName:0,(Guangxi_Pangolin_P5E:0,Guangxi_Pangolin_P2V:0,Guangxi_Pangolin_P4L:0,Guangxi_Pangolin_P1E:0)NoName:0)NoName:0,(((HKU3-12:0,HKU3-6:0)NoName:0,(011111111111122222233456789:0,((11114789:0)NoName:0,(Hu_Australia_VIC231_2020:0,Hu_USA_UT_00346_2020:0,Hu_Wuhan_2020:0,Hu_Italy_TE4836_2020:0)NoName:0)NoName:0,Guangxi_Pangolin_P5L:0)NoName:0,Guangdong_Pangolin_1_2019:0,Guangdong_Pangolin_P2S_2019:0)NoName:0,Bat_CoVZC45:0,Bat_CoVZXC21:0)NoName:0,(01111111111111222222233344556677889:0,(BtCoV_273_2005:0,Rf1:0)NoName:0,SARS:0,Tor2:0,SARS-CoV_BJ182-4:0)NoName:0);


In [None]:
((((((Rs3367:0):0,1112:0):0):0,BtCoV_279_2005:0):0,56:0,((RaTG13:0.01):0):0,(Guangxi_Pangolin_P5E:0,Guangxi_Pangolin_P2V:0,Guangxi_Pangolin_P4L:0,Guangxi_Pangolin_P1E:0):0):0,(((HKU3-12:0,HKU3-6:0):0,(011111111111122222233456789:0,((11114789:0):0,(Hu_Australia_VIC231_2020:0,Hu_USA_UT_00346_2020:0,Hu_Wuhan_2020:0,Hu_Italy_TE4836_2020:0):0):0,Guangxi_Pangolin_P5L:0):0,Guangdong_Pangolin_1_2019:0,Guangdong_Pangolin_P2S_2019:0):0,Bat_CoVZC45:0,Bat_CoVZXC21:0):0,(01111111111111222222233344556677889:0,(BtCoV_273_2005:0,Rf1:0)NoName:0,SARS:0,Tor2:0,SARS-CoV_BJ182-4:0):0);




(((Rs3367:0,BtCoV_279_2005:0):0,((RaTG13:0.01):0):0,(Guangxi_Pangolin_P5E:0,Guangxi_Pangolin_P2V:0,Guangxi_Pangolin_P4L:0,Guangxi_Pangolin_P1E:0):0):0,(((HKU3-12:0,HKU3-6:0):0,(((Hu_Australia_VIC231_2020:0,Hu_USA_UT_00346_2020:0,Hu_Wuhan_2020:0,Hu_Italy_TE4836_2020:0):0):0,Guangxi_Pangolin_P5L:0):0,Guangdong_Pangolin_1_2019:0,Guangdong_Pangolin_P2S_2019:0):0,Bat_CoVZC45:0,Bat_CoVZXC21:0):0,((BtCoV_273_2005:0,Rf1:0):0,SARS:0,Tor2:0,SARS-CoV_BJ182-4:0):0);

In [None]:
import re

# Your Newick format string
newick_string = "((((((Rs3367:0)NoName:0,1112:0)NoName:0)NoName:0,BtCoV_279_2005:0)NoName:0,56:0,((RaTG13:0.01)NoName:0)NoName:0,(Guangxi_Pangolin_P5E:0,Guangxi_Pangolin_P2V:0,Guangxi_Pangolin_P4L:0,Guangxi_Pangolin_P1E:0)NoName:0)NoName:0,(((HKU3-12:0,HKU3-6:0)NoName:0,(011111111111122222233456789:0,((11114789:0)NoName:0,(Hu_Australia_VIC231_2020:0,Hu_USA_UT_00346_2020:0,Hu_Wuhan_2020:0,Hu_Italy_TE4836_2020:0)NoName:0)NoName:0,Guangxi_Pangolin_P5L:0)NoName:0,Guangdong_Pangolin_1_2019:0,Guangdong_Pangolin_P2S_2019:0)NoName:0,Bat_CoVZC45:0,Bat_CoVZXC21:0)NoName:0,(01111111111111222222233344556677889:0,(BtCoV_273_2005:0,Rf1:0)NoName:0,SARS:0,Tor2:0,SARS-CoV_BJ182-4:0)NoName:0);"

# Regular expression to remove internal node names following a closing parenthesis with any number of ":0"
cleaned_newick = re.sub(r'\)NoName(:\d+)?', ')', newick_string)

print(cleaned_newick)


((((((Rs3367:0),1112:0)),BtCoV_279_2005:0),56:0,((RaTG13:0.01)),(Guangxi_Pangolin_P5E:0,Guangxi_Pangolin_P2V:0,Guangxi_Pangolin_P4L:0,Guangxi_Pangolin_P1E:0)),(((HKU3-12:0,HKU3-6:0),(011111111111122222233456789:0,((11114789:0),(Hu_Australia_VIC231_2020:0,Hu_USA_UT_00346_2020:0,Hu_Wuhan_2020:0,Hu_Italy_TE4836_2020:0)),Guangxi_Pangolin_P5L:0),Guangdong_Pangolin_1_2019:0,Guangdong_Pangolin_P2S_2019:0),Bat_CoVZC45:0,Bat_CoVZXC21:0),(01111111111111222222233344556677889:0,(BtCoV_273_2005:0,Rf1:0),SARS:0,Tor2:0,SARS-CoV_BJ182-4:0));
