In [1]:
# Make the project root importable so that `src.*` modules resolve from here.
import sys
from pathlib import Path

# When __file__ is defined, the root is the parent of the directory holding this
# file; otherwise (e.g. in a notebook kernel, where __file__ is not set) fall back
# to the parent of the current working directory.
ROOT = Path(__file__).resolve().parents[1] if "__file__" in locals() else Path.cwd().parents[0]

# Only register the root once: re-running this cell must not pile up duplicate
# sys.path entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))


In [2]:
import json
import gzip
from collections import deque, defaultdict
from src.utils.path_utils import get_data_dir
from src.data.genome_node import CovidGenomeSequence
from src.data.phylogenetic_graph import PhylogeneticGraph
from src.data.tree_to_csv import tree_to_csv, ancestor_union_to_csv
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd

In [3]:
def _read_gz_json(path):
    """Open a gzip-compressed, UTF-8 encoded JSON file and return the parsed object."""
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        return json.load(handle)


# Tree export; its top-level keys are printed as a quick sanity check.
data = _read_gz_json("data_json/tree.json.gz")
print(data.keys())

# Root sequence file; the key set is printed (as a set, so ordering is arbitrary).
root_seq = _read_gz_json("data_json/root-sequence.json.gz")
print(set(root_seq.keys()))

dict_keys(['meta', 'tree', 'version'])
{'ORF1a', 'N', 'ORF7a', 'ORF8', 'S', 'nuc', 'ORF7b', 'M', 'ORF9b', 'ORF1b', 'ORF6', 'E', 'ORF3a'}


In [4]:
# Root node of the loaded tree export.
tree = data["tree"]
# Bare lookup: the value is not displayed (only the last expression of a cell is),
# but it still raises KeyError if the root lacks "node_attrs".
tree["node_attrs"]
# Cell result: branch attributes of the first child under the root's second child.
root_second_child = tree["children"][1]
root_second_child["children"][0]["branch_attrs"]

{'labels': {'aa': 'ORF1a: L2235I, N3833K'},
 'mutations': {'ORF1a': ['L2235I', 'N3833K'], 'nuc': ['C6968A', 'T11764A']}}

In [5]:
# Breadth-first walk over the whole tree that (a) counts the nodes and
# (b) records which attribute keys occur anywhere in it.
q = deque([tree])
# Union of every key seen under "node_attrs" / "branch_attrs"; for a key that
# appears on several nodes, the value kept is the one from the last node visited.
node_attrs = {}
branch_attrs = {}
counter = 0
while q:
    node = q.popleft()
    node_attrs.update(node["node_attrs"])
    branch_attrs.update(node["branch_attrs"])
    counter += 1
    # Nodes without a "children" key enqueue nothing further.
    q.extend(node.get("children", []))

print(f"Visited {counter} sequences")

Visited 14761 sequences


In [6]:
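# Uncomment the next line to display the merged branch attributes collected by the traversal above.
# branch_attrs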

In [7]:
# Wrap the raw tree and generate sequences for the N and S proteins from the root
# sequence (the recorded output shows mutations being applied along the way).
tree_obj = CovidGenomeSequence(tree)
tree_obj.generate_sequences(root_seq, proteins=["N", "S"])

covid_phylo_graph = PhylogeneticGraph(tree_obj)

# Tips selected on division_exposure == "Massachusetts"
# (NOTE(review): exact matching semantics live in find_tips_by_attribute — confirm).
ma_genomes = covid_phylo_graph.find_tips_by_attribute("division_exposure", "Massachusetts")

# Ancestor union of those tips, then exported to ma_sequences.csv.
ma_genome_ancestors = covid_phylo_graph.get_ancestor_union(ma_genomes)
ancestor_union_to_csv(ma_genome_ancestors, "ma_sequences.csv")

Protein S: Expected E at substitution position 470 (1-based: 471), found S. Applying mutation anyway.
Protein S: Expected T at substitution position 314 (1-based: 315), found Q. Applying mutation anyway.
Processing 4070 ancestor nodes...


Processing nodes: 100%|██████████| 4070/4070 [00:00<00:00, 338599.20 nodes/s]



Writing CSV with 4070 nodes...


Writing CSV: 100%|██████████| 4070/4070 [00:00<00:00, 60870.90 rows/s]

✓ CSV written to /Users/vnagpal/Desktop/fa-2025/cse-598-ai4sci/gen-cov-abm/data/ma_sequences.csv (ancestor union)





In [8]:
# Export the tree held by tree_obj to all_sequences.csv; the recorded run reports
# 14761 rows written into the project's data directory.
tree_to_csv(tree_obj, "all_sequences.csv")

Traversing tree: 14761 nodes [00:00, 540830.58 nodes/s]



Writing CSV with 14761 rows...


Writing CSV: 100%|██████████| 14761/14761 [00:00<00:00, 69806.54 rows/s]

CSV written to /Users/vnagpal/Desktop/fa-2025/cse-598-ai4sci/gen-cov-abm/data/all_sequences.csv





In [9]:
# Reload the full export from the directory returned by get_data_dir().
all_seqs = pd.read_csv(get_data_dir() / "all_sequences.csv")

# Rows whose "division" column is Massachusetts.
ma_seqs = all_seqs.loc[all_seqs["division"].eq("Massachusetts")]

# Rows whose "division_exposure" column is Massachusetts.
ma_orig_seqs = all_seqs.loc[all_seqs["division_exposure"].eq("Massachusetts")]