In [1]:
# Make the project's top-level packages (e.g. `src`) importable from this notebook.
import sys
from pathlib import Path

# Absolute path to the project root.
# - When run as a script, `__file__` exists and the root is two levels above it.
# - Inside a notebook kernel there is no `__file__`, so fall back to the parent
#   of the current working directory (the notebook is expected to be launched
#   from a directory directly under the project root).
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd().parent

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

In [2]:
import json
import gzip
from collections import deque, defaultdict
from src.utils.path_utils import get_data_dir
from src.data.genome_node import CovidGenomeSequence
from src.data.phylogenetic_graph import PhylogeneticGraph
from src.data.tree_to_csv import tree_to_csv, ancestor_union_to_csv
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd

In [3]:
def _read_gzipped_json(path):
    """Decompress the gzip file at `path` as UTF-8 text and parse it as JSON."""
    with gzip.open(path, mode="rt", encoding="utf-8") as handle:
        return json.load(handle)


# Phylogenetic tree export; show its top-level keys.
data = _read_gzipped_json("../data_json/tree.json.gz")
print(data.keys())

# Root sequences; show which keys are present.
root_seq = _read_gzipped_json("../data_json/root-sequence.json.gz")
print(set(root_seq.keys()))

dict_keys(['meta', 'tree', 'version'])
{'ORF7b', 'ORF3a', 'S', 'nuc', 'ORF7a', 'ORF8', 'M', 'N', 'E', 'ORF1b', 'ORF1a', 'ORF6', 'ORF9b'}


In [4]:
# Root node of the phylogeny, as stored under the 'tree' key of the loaded JSON.
tree = data['tree']

# Display the branch attributes of one node two levels below the root to see
# how mutations are stored. (Only the last expression of a cell is rendered,
# so this is the single thing the cell shows.)
tree['children'][1]['children'][0]['branch_attrs']

{'labels': {'aa': 'ORF1a: L2235I, N3833K'},
 'mutations': {'ORF1a': ['L2235I', 'N3833K'], 'nuc': ['C6968A', 'T11764A']}}

In [5]:
# Walk the whole tree breadth-first and tally the mutations recorded on every
# branch whose node has division_exposure == TARGET_DIVISION.
TARGET_DIVISION = "Massachusetts"

# Key of branch_attrs["mutations"] that holds nucleotide-level substitutions.
# The remaining keys are gene names holding amino-acid substitutions, e.g.
# {'ORF1a': ['L2235I', 'N3833K'], 'nuc': ['C6968A', 'T11764A']} -- the same
# events appear under both, so the two kinds must not be summed together.
NUC_KEY = "nuc"

q = deque([tree])

# Last-seen value for every attribute key met anywhere in the tree; useful for
# discovering which keys exist, not for per-node lookups.
node_attrs = {}
branch_attrs = {}

# Number of nodes that passed the filter below (NOT the number of nodes walked).
counter = 0

# Mutation counts keyed by the entries of branch_attrs["mutations"]
# (gene names plus NUC_KEY).
protein_mutation_counts = defaultdict(int)

while q:
    node = q.popleft()
    node_attrs.update(node["node_attrs"])
    branch_attrs.update(node["branch_attrs"])

    # Nodes without a division_exposure attribute compare as "" and are skipped.
    if (
        "mutations" in node["branch_attrs"]
        and node["node_attrs"].get("division_exposure", {"value": ""})["value"]
        == TARGET_DIVISION
    ):
        for protein, mutation_list in node["branch_attrs"]["mutations"].items():
            protein_mutation_counts[protein] += len(mutation_list)
        counter += 1

    # Leaf nodes carry no "children" key.
    q.extend(node.get("children", []))

print(f"Visited {counter} sequences")
print("\nMutation counts per protein:")
print("-" * 40)

# Sort by count (descending) and display
sorted_proteins = sorted(
    protein_mutation_counts.items(), key=lambda x: x[1], reverse=True
)
for protein, count in sorted_proteins:
    print(f"{protein:10s}: {count:7,d} mutations")

print("-" * 40)

# Report nucleotide and amino-acid totals separately: a single grand total
# would count each coding substitution twice (once under NUC_KEY, once under
# its gene).
nuc_total = protein_mutation_counts.get(NUC_KEY, 0)
aa_total = sum(protein_mutation_counts.values()) - nuc_total
print(f"Total nucleotide mutations: {nuc_total:,}")
print(f"Total amino-acid mutations: {aa_total:,}")

Visited 1487 sequences

Mutation counts per protein:
----------------------------------------
nuc       :   1,983 mutations
ORF1a     :     468 mutations
ORF1b     :     179 mutations
S         :     101 mutations
N         :      74 mutations
ORF3a     :      71 mutations
ORF7a     :      55 mutations
ORF8      :      41 mutations
M         :      21 mutations
ORF9b     :      17 mutations
ORF6      :      14 mutations
E         :       7 mutations
ORF7b     :       6 mutations
----------------------------------------
Total mutations: 3,037


In [6]:
# Wrap the raw tree dict in the project's CovidGenomeSequence type and ask it
# to generate sequences from the root sequences for three proteins.
# NOTE(review): presumably this reconstructs per-node sequences by applying
# branch mutations to root_seq -- confirm against src/data/genome_node.py.
tree_obj = CovidGenomeSequence(tree)
tree_obj.generate_sequences(root_seq, proteins=["N", "S", "ORF1a"])

# Build the graph view used for the attribute and ancestry queries below.
covid_phylo_graph = PhylogeneticGraph(tree_obj)

# Select the tips whose "division_exposure" attribute is "Massachusetts".
ma_genomes = covid_phylo_graph.find_tips_by_attribute(
    "division_exposure", "Massachusetts"
)

# Ancestor union of the selected tips.
# NOTE(review): presumably the set of all nodes lying on any root-to-tip path
# of those tips -- confirm against src/data/phylogenetic_graph.py.
ma_genome_ancestors = covid_phylo_graph.get_ancestor_union(ma_genomes)

# Export the ancestor union to CSV under the given file name; the helper
# decides the output directory (the recorded run wrote into the project's
# data/ folder).
ancestor_union_to_csv(ma_genome_ancestors, "ma_sequences-v2.csv")

Processing 4070 ancestor nodes...


Processing nodes: 100%|██████████| 4070/4070 [00:00<00:00, 259383.67 nodes/s]



Writing CSV with 4070 nodes...


Writing CSV: 100%|██████████| 4070/4070 [00:00<00:00, 22592.61 rows/s]

✓ CSV written to /Users/vnagpal/Desktop/fa-2025/cse-598-ai4sci/gen-cov-abm/data/ma_sequences-v2.csv (ancestor union)





In [7]:
# Export the full tree (tree_obj, built in the previous cell) to CSV under the
# given file name; the helper decides the output directory.
tree_to_csv(tree_obj, "all_sequences-v2.csv")

Traversing tree: 14761 nodes [00:00, 455149.17 nodes/s]



Writing CSV with 14761 rows...


Writing CSV: 100%|██████████| 14761/14761 [00:00<00:00, 22796.23 rows/s]

CSV written to /Users/vnagpal/Desktop/fa-2025/cse-598-ai4sci/gen-cov-abm/data/all_sequences-v2.csv



