## Build a dendrogram based on vConTACT similarity 

In [6]:
# imports 
import pandas as pd 
from Bio import SeqIO
import numpy as np 
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, to_tree
import matplotlib.pyplot as plt
import re

In [3]:
# Load the vConTACT network as a header-less, space-separated edge list
# (columns are addressed by position: 0 and 1 hold node names).
vcontact = pd.read_csv(
    '../data/assembly_processes/vcontact_long_term.phables.c1.ntw',
    sep=' ',
    header=None,
)

# Keep only edges whose two endpoints both carry the 'phage_comp_' tag.
# The second filter runs on the output of the first, one step after the other.
vcontact_filtered = (
    vcontact
    .loc[lambda edges: edges[0].str.contains('phage_comp_')]
    .loc[lambda edges: edges[1].str.contains('phage_comp_')]
)

In [None]:
# Record IDs of the resolved-paths FASTA; their order defines `contigs`.
handle = '../data/assembly_processes/resolved_paths.fasta'
seq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
contigs = [*seq_dict]

# Drop every edge that has an endpoint missing from the FASTA
# (column 0 first, then column 1 on what is left).
vcontact_filtered = (
    vcontact_filtered
    .loc[lambda edges: edges[0].isin(contigs)]
    .loc[lambda edges: edges[1].isin(contigs)]
)

# NOTE(review): to_csv is called with pandas defaults, so this file is
# comma-separated and includes a header row and the index column, whereas the
# input network was read as space-separated without a header. Confirm that
# whatever consumes this file expects that layout.
vcontact_filtered.to_csv('../data/assembly_processes/vcontact_long_term.phables.filtered.c1.ntw')

### Build the similarity matrix, cluster with UPGMA, and export a Newick tree

In [6]:
import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import squareform

# --- Build a symmetric contig-by-contig similarity matrix from the edge list ---
# Resolve each contig name to its row/column position once, so every edge costs
# a constant-time dict lookup instead of a linear `contigs.index(...)` scan.
# Iterating in reverse makes the first occurrence of a repeated name win, which
# is what `list.index` returned. A name absent from `contigs` raises KeyError.
contig_index = {
    name: position
    for position, name in reversed(list(enumerate(contigs)))
}

# Pairs with no edge in the network keep a similarity of 0.
similarity = np.zeros((len(contigs), len(contigs)), dtype=float)

# Walk the first three columns (node, node, score) in row order; if a pair is
# listed more than once, the last row wins.
edge_columns = zip(
    vcontact_filtered.iloc[:, 0],
    vcontact_filtered.iloc[:, 1],
    vcontact_filtered.iloc[:, 2],
)
for contig1, contig2, score in edge_columns:
    score = float(score)
    idx1 = contig_index[contig1]
    idx2 = contig_index[contig2]
    similarity[idx1, idx2] = score
    similarity[idx2, idx1] = score  # ensure symmetry

# --- Guard: clustering needs at least two contigs ---
# Without this check an empty or 1x1 matrix fails further down with an opaque
# "zero-size array to reduction operation" ValueError.
n = similarity.shape[0]
if n < 2:
    raise ValueError(
        f"Need at least 2 contigs to build a dendrogram, got {n}"
    )

# Ensure diagonal reflects self-similarity (largest observed score, or 1.0 when
# no positive score exists).
largest_score = similarity.max()
np.fill_diagonal(similarity, largest_score if largest_score > 0 else 1.0)

# --- Standardize similarities between 0 and 1 using only off-diagonals ---
off_diag_mask = ~np.eye(n, dtype=bool)
off_vals = similarity[off_diag_mask]
sim_min = off_vals.min()
sim_max = off_vals.max()

if sim_max > sim_min:
    standardized = (similarity - sim_min) / (sim_max - sim_min)
else:
    # All off-diagonal values identical: no pair is closer than any other,
    # so every off-diagonal similarity becomes 0.
    standardized = np.zeros_like(similarity)

# Keep perfect self-similarity (applies to both branches above)
np.fill_diagonal(standardized, 1.0)

# --- Convert to dissimilarities (distance = 1 - similarity) ---
dissimilarities = 1.0 - standardized
np.fill_diagonal(dissimilarities, 0.0)

# --- SciPy condensed distance matrix & UPGMA (average linkage) ---
# checks=True makes squareform validate the square matrix before condensing it.
condensed = squareform(dissimilarities, checks=True)
Z = linkage(condensed, method="average")

# --- Tree object used for the Newick export ---
# rd=True also returns the list of all nodes alongside the root.
root, nodes = to_tree(Z, rd=True)

def get_newick(node, parent_dist, names, newick=""):
    if node.is_leaf():
        return f"{names[node.id]}:{parent_dist - node.dist:.6f}{newick}"
    else:
        if newick:
            newick = f"):{parent_dist - node.dist:.6f}{newick}"
        else:
            newick = ");"
        newick = get_newick(node.get_left(),  node.dist, names, newick=newick)
        newick = get_newick(node.get_right(), node.dist, names, newick="," + newick)
        newick = "(" + newick
        return newick

# Leaf labels: get_newick looks a label up by leaf id, so the i-th entry must
# name the i-th row of the similarity matrix, which follows `contigs` order.
leaf_names = [*contigs]

# Root call: the root's own dist is passed as its parent distance and the
# suffix is left empty, so the string ends with ");".
newick_str = get_newick(node=root, parent_dist=root.dist, names=leaf_names)

# Write the tree next to the other output files.
with open("../output_files/vcontact_long_term.phables.upgma_tree.nwk", mode="w") as f:
    f.write(newick_str)

print("Saved Newick to upgma_tree.nwk")


Saved Newick to upgma_tree.nwk


In [16]:
newick_str


'((((((((((((((((((((((((phage_comp_121_cycle_1:1.000000,phage_comp_41_cycle_1:1.000000):0.000000,phage_comp_146_cycle_1:1.000000):0.000000,phage_comp_155_cycle_1:1.000000):0.000000,phage_comp_166_cycle_1:1.000000):0.000000,phage_comp_187_cycle_1:1.000000):0.000000,phage_comp_189_cycle_1:1.000000):0.000000,phage_comp_193_cycle_1:1.000000):0.000000,phage_comp_219_cycle_1:1.000000):0.000000,phage_comp_227_cycle_1:1.000000):0.000000,phage_comp_294_cycle_1:1.000000):0.000000,((phage_comp_290_cycle_1:0.359215,phage_comp_268_cycle_1:0.359215):0.472202,phage_comp_299_cycle_1:0.831417):0.168583):0.000000,phage_comp_311_cycle_1:1.000000):0.000000,phage_comp_418_cycle_1:1.000000):0.000000,phage_comp_1621_cycle_1:1.000000):0.000000,((phage_comp_1666_cycle_1:0.993447,phage_comp_1615_cycle_1:0.993447):0.004367,phage_comp_184_cycle_1:0.997813):0.002187):0.000000,phage_comp_1684_cycle_1:1.000000):0.000000,((phage_comp_1709_cycle_1:0.100537,phage_comp_278_cycle_1:0.100537):0.711310,phage_comp_65_cycle

In [20]:
similarity.shape

(89, 89)