In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Maps each `location` label used in the sequence metadata to a numeric county
# code. The values look like 5-digit county FIPS codes with the Massachusetts
# state prefix 25 -- TODO confirm against the consumer of these codes.
#
# The trailing number on each line is the genome count noted for that label
# when the map was written. NOTE(review): these are presumably counts taken
# before the `division_exposure` filter in the next cell, since the filtered
# output there shows fewer Worcester genomes and no Hampshire ones -- confirm.
county_code_map = dict(
    [
        ("Barnstable County", 25001),  # 151
        ("Suffolk County MA", 25025),  # 61
        ("Bristol County", 25005),     # 34
        ("Norfolk County", 25021),     # 19
        ("Plymouth County", 25023),    # 15
        ("Worcester County", 25027),   # 14
        ("Essex County MA", 25009),    # 10
        ("Hampden County", 25013),     # 4
        ("Berkshire County", 25003),   # 1
        ("Hampshire County", 25015),   # 1
    ]
)

In [2]:
# Step 1: read the full sequence metadata table, then narrow it down
df = pd.read_csv("../data/all_sequences-v2.csv")

# Keep a row only when BOTH conditions hold:
#   - its exposure division is Massachusetts
#   - its location is one of the counties we have a code for
in_massachusetts = df['division_exposure'] == 'Massachusetts'
in_mapped_county = df['location'].isin(list(county_code_map))
df = df.loc[in_massachusetts & in_mapped_county]

# Report how many genomes survived the filter, overall and per county
print(f"{len(df)} filtered genomes")
print(df['location'].value_counts())


296 filtered genomes
location
Barnstable County    151
Suffolk County MA     61
Bristol County        34
Norfolk County        19
Plymouth County       15
Essex County MA       10
Hampden County         4
Worcester County       1
Berkshire County       1
Name: count, dtype: int64


In [3]:
# Load the orf1a sequence IDs, one per line. A line's position in the file is
# the index later used to pick that sequence's row out of the embedding matrix,
# so every line is kept (including any blank ones) to preserve positions.
with open("../data/orf1a_sequence_ids.txt", "r") as f:
    orf1a_ids = [line.strip() for line in f]

# sequence ID -> position in the IDs file (a repeated ID keeps its last position)
id_to_index = {seq_id: idx for idx, seq_id in enumerate(orf1a_ids)}

# Pull the per-genome columns straight from the filtered frame; the three
# lists below stay aligned element-by-element with the rows of `df`.
genome_names = df["name"].tolist()
counties = df["location"].tolist()

# Fail up front with an actionable message instead of a bare KeyError on the
# first unknown name encountered mid-loop.
missing_ids = [name for name in genome_names if name not in id_to_index]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} genome name(s) in the filtered table are absent from "
        f"../data/orf1a_sequence_ids.txt, e.g. {missing_ids[:5]}"
    )

genome_indices = [id_to_index[name] for name in genome_names]

In [4]:
# Read the full embedding array from disk
embeddings_path = "../data/orf1a_sequence_embeddings.npy"
all_embeddings = np.load(embeddings_path)

# Keep only the entries for the selected genomes, in the same order as
# `genome_indices` (and therefore as `genome_names` / `counties`)
selected_embeddings = all_embeddings[genome_indices]

In [5]:
# Normalize each embedding (row) to unit L2 length.
norms = np.linalg.norm(selected_embeddings, axis=1, keepdims=True)

# An all-zero row has norm 0; dividing by it would silently produce NaN/inf
# values that would then be written to disk. Divide such rows by 1 instead so
# they stay all-zero. Rows with a non-zero norm are divided exactly as before.
safe_norms = norms.copy()
safe_norms[safe_norms == 0] = 1
normalized_embeddings = selected_embeddings / safe_norms

In [7]:
# Save each normalized embedding to its own .npy file, grouped into one
# directory per county code: <base_dir>/<county_code>/<sanitized genome name>.npy
base_dir = Path("../embeddings")

# NOTE(review): initialized but never populated in this cell; kept as-is in
# case a later cell relies on the name -- confirm whether it is still needed.
county_counts = {}
for genome_name, embedding, county in zip(genome_names, normalized_embeddings, counties):
    # numeric code for this genome's county
    county_code = county_code_map[county]

    # Create the county directory if it doesn't exist. parents=True also
    # creates base_dir itself, so the cell works when ../embeddings is absent
    # (previously that case raised FileNotFoundError).
    county_dir = base_dir / str(county_code)
    county_dir.mkdir(parents=True, exist_ok=True)

    # Genome names contain "/" (and may contain spaces); replace both so the
    # name is a single path component.
    filename = genome_name.replace("/", "_").replace(" ", "_")
    filepath = county_dir / f"{filename}.npy"
    np.save(filepath, embedding)

    print(f'For {county}, Saved to {filepath}')


For Norfolk County, Saved to ../embeddings/25021/USA_MA-CDCBI-CRSP_VOCC2O65IXRDAJ3N_2021.npy
For Barnstable County, Saved to ../embeddings/25001/USA_MA-CDCBI-CRSP_IVAMC57PDYCJZSCZ_2021.npy
For Barnstable County, Saved to ../embeddings/25001/USA_MA-CDCBI-CRSP_FUCT5MIL5QEPWLM3_2021.npy
For Worcester County, Saved to ../embeddings/25027/USA_COVID20-0588_2020.npy
For Norfolk County, Saved to ../embeddings/25021/USA_MA-UW-633_2020.npy
For Norfolk County, Saved to ../embeddings/25021/USA_MA-UW-673_2020.npy
For Hampden County, Saved to ../embeddings/25013/USA_MA-CDCBI-CRSP_CT4A5QLV5GTOWDBC_2021.npy
For Bristol County, Saved to ../embeddings/25005/USA_MA-CDCBI-CRSP_QF7RDI6WFSZCZC4D_2021.npy
For Barnstable County, Saved to ../embeddings/25001/USA_MA-CDCBI-CRSP_JQTDRO7IPXC2KV5N_2021.npy
For Barnstable County, Saved to ../embeddings/25001/USA_MA-CDCBI-CRSP_54X2E2BLRHTYXPY4_2021.npy
For Barnstable County, Saved to ../embeddings/25001/USA_MA-CDCBI-CRSP_SBSSULH5XXXIY3SF_2021.npy
For Barnstable Count