In [1]:
import os
import pandas as pd
from pathlib import Path
# from datetime import date

In [2]:
# Tab-separated mutation table for the sampled subset; loaded into `mut` below.
MUT_PATH = '/home/yutianc/muninn_sc2/inputs/escape_1000/mutations.tsv'

# Directory holding the raw input tables (DMS scores, lineage report, EVEscape,
# metadata). The cleaned metadata/lineage files are also written back here.
IN_DIR = '/home/yutianc/bjorn_rep/data/sc2/raw_data'
# Directory receiving the subset outputs (DMS, lineage, EVEscape tables).
OUT_DIR = '/home/yutianc/muninn_sc2/inputs/escape_1000'

# Directory scanned (non-recursively) for FASTA files when mapping file names
# to sequence IDs.
FASTA_DIR = '/home/yutianc/bjorn_rep/data/sc2/consensus_sequences'

In [None]:
# Load the input tables used throughout the notebook.
mut = pd.read_csv(MUT_PATH, sep='\t')
dms = pd.read_csv(os.path.join(IN_DIR, "final_variant_scores.txt"), sep=',', header=0)
lineage = pd.read_csv(os.path.join(IN_DIR, "lineage_report.csv"))
evescape = pd.read_csv(os.path.join(IN_DIR, "full_spike_evescape.csv"))

# the unique samples in the subset, will be used to subset other files
sra = list(mut["sra"].unique())

# Build a per-row mutation label: ref_aa + position + alt_aa.
# `str(series)` returns the printed repr of the *whole* column, so the position
# has to be converted element-wise with `.astype(str)` instead.
# NOTE(review): if `pos_aa` is parsed as float (e.g. because of missing values)
# the label will contain a trailing ".0" -- confirm the column is integer-typed.
mut['mutation'] = mut['ref_aa'] + mut['pos_aa'].astype(str) + mut['alt_aa']


## DMS Data

In [4]:
# Sanity check: do the 'Wuhan-Hu-1_v1' and 'Wuhan-Hu-1_v2' targets describe
# the same reference?
dms_v1 = dms.loc[dms['target'].eq('Wuhan-Hu-1_v1')]
dms_v2 = dms.loc[dms['target'].eq('Wuhan-Hu-1_v2')]

# Outer-join the two targets on position; `_merge` records which side each
# position came from.
m = dms_v1.merge(dms_v2, how="outer", on=["position"], indicator=True)

# No joined row may disagree on the wildtype residue ...
assert not m["wildtype_x"].ne(m["wildtype_y"]).any()
# ... and no position may exist in only one of the two targets.
assert not m["_merge"].ne('both').any()

# When both assertions hold the two targets are equivalent, so both are
# treated as Hu-1 from here on.

In [None]:
# Re-load the DMS scores from the BA.1/BA.2 file; this rebinds `dms`.
dms = pd.read_csv(
    os.path.join(IN_DIR, "final_variant_scores_BA.1_BA.2.txt"),
    sep=',',
    header=0,
)

# DMS target name -> reference suffix appended to the GFF feature id.
ref_map = {
    'Omicron_BA1': 'NC_045512.2_escape_BA.1_rbd',
    'Omicron_BA2': 'NC_045512.2_escape_BA.2_rbd',
    'Wuhan-Hu-1_v1': 'NC_045512.2',
    'Wuhan-Hu-1_v2': 'NC_045512.2',
    'Beta': 'Beta',
    'Alpha': 'Alpha',
    'Delta': 'Delta',
}

# this is rbd region only, so YP.. is used here
# (targets missing from `ref_map` end up with a NaN feature)
gff_suffix = dms['target'].map(ref_map)
dms['GFF_FEATURE'] = 'YP_009724390.1_' + gff_suffix

# check if gffs are correct: join DMS rows to the mutation table on the
# mutation label and require identical GFF features on both sides.
# NOTE(review): with how='inner' every row has _merge == "both", so that
# clause never filters anything.
merged = dms.merge(mut, how='inner', on=['mutation'], indicator=True)
gff_mismatch = (
    (merged["_merge"] == "both") &
    (merged["GFF_FEATURE_x"] != merged["GFF_FEATURE_y"])
)
assert not gff_mismatch.any()

# currently only for these ref genomes
kept_targets = ['Omicron_BA1', 'Omicron_BA2', 'Wuhan-Hu-1_v1', 'Wuhan-Hu-1_v2']
dms = dms.loc[dms['target'].isin(kept_targets)]

dms.to_csv(os.path.join(OUT_DIR, "dms_sampled.tsv"), sep='\t', index=False)

## Metadata

In [3]:
def _sample_id_from_header(header):
    """Return the third '/'-separated field of a four-field header, else the input unchanged."""
    fields = str(header).split("/")
    if len(fields) == 4:
        return fields[2]
    return header


metadata = pd.read_csv(os.path.join(IN_DIR, "metadata.csv"))
print(len(metadata))

# Sample id as embedded in the FASTA header, e.g.
# "hCoV-19/USA/CA-SEARCH-65663/2022" -> "CA-SEARCH-65663".
metadata["mut_sra"] = metadata['fasta_hdr'].apply(_sample_id_from_header)

# The accession must be unique: drop repeated IDs, keeping the first
# occurrence so the gisaid accession is retained.
metadata = metadata.drop_duplicates(subset=["ID"], keep="first")
print(len(metadata))
metadata

107675
106972


Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,fasta_hdr,host,zipcode,Coverage,mut_sra
0,SEARCH-65663,,EPI_ISL_9805989,2022-01-03,North America/USA/California/San Diego,99.9048,,"Chip Schooley, Natasha Martin, Cheryl Anderson...",EXCITE Lab,hCoV-19/USA/CA-SEARCH-65663/2022,Human,1721,,CA-SEARCH-65663
1,SEARCH-65927,,EPI_ISL_9806245,2022-01-05,North America/USA/California/San Diego,99.8844,,"Chip Schooley, Natasha Martin, Cheryl Anderson...",EXCITE Lab,hCoV-19/USA/CA-SEARCH-65927/2022,Human,7030,,CA-SEARCH-65927
2,SEARCH-65942,,EPI_ISL_9806260,2022-01-05,North America/USA/California/San Diego,99.8844,,"Chip Schooley, Natasha Martin, Cheryl Anderson...",EXCITE Lab,hCoV-19/USA/CA-SEARCH-65942/2022,Human,7030,,CA-SEARCH-65942
3,SEARCH-67218,,EPI_ISL_9807421,2022-01-06,North America/USA/California/San Diego,99.6940,,"Chip Schooley, Natasha Martin, Cheryl Anderson...",EXCITE Lab,hCoV-19/USA/CA-SEARCH-67218/2022,Human,5658,,CA-SEARCH-67218
4,SEARCH-68516,,EPI_ISL_9808486,2022-01-10,North America/USA/California/San Diego,99.8504,,"Chip Schooley, Natasha Martin, Cheryl Anderson...",EXCITE Lab,hCoV-19/USA/CA-SEARCH-68516/2022,Human,6612,,CA-SEARCH-68516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107670,SEARCH-143382,,,2025-04-23,North America/USA/California/San Diego,99.3471,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-143382/2025,Human,92111,99.3,CA-SEARCH-143382
107671,SEARCH-143383,,,2025-04-23,North America/USA/California/San Diego,99.8776,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-143383/2025,Human,92111,99.9,CA-SEARCH-143383
107672,SEARCH-143384,,,2025-04-22,North America/USA/California/San Diego,98.2862,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-143384/2025,Human,92040,98.3,CA-SEARCH-143384
107673,SEARCH-143385,,,2025-04-04,North America/USA/California/San Diego,98.9799,,SEARCH Alliance San Diego with Ashleigh Murphy...,San Diego County Public Health Laboratory,hCoV-19/USA/CA-SEARCH-143385/2025,Human,92114,99.0,CA-SEARCH-143385


In [5]:
# Where the sample id parsed from fasta_hdr (`mut_sra`) differs from `ID`,
# overwrite `ID` with the parsed id (fasta_hdr itself is left untouched).
# NOTE(review): duplicates were dropped on the *old* ID values; uniqueness of
# the rewritten IDs is not re-checked here.
mask = metadata["mut_sra"].notna() & metadata["mut_sra"].ne(metadata["ID"])
metadata.loc[mask, "ID"] = metadata.loc[mask, "mut_sra"]

# metadata_sampled = metadata[metadata["mut_sra"].isin(sra)]
# metadata_sampled = metadata_sampled.drop(columns={"mut_sra"})
# metadata_sampled.to_csv(os.path.join(OUT_DIR, "metadata_sampled.tsv"), sep="\t", index=False)

# The helper column is no longer needed; write the cleaned table next to the
# raw inputs.
metadata = metadata.drop(columns=["mut_sra"])
cleaned_metadata_path = os.path.join(IN_DIR, "metadata_cleaned.tsv")
metadata.to_csv(cleaned_metadata_path, sep='\t', index=False)


## Lineage

In [7]:
def fasta_first_seq_id(path: Path):
    """Return the sequence ID from the first header line of a FASTA file.

    The first non-blank line must be a ``>`` header. Its first
    whitespace-delimited token is taken as the ID; when that token has exactly
    four ``/``-separated fields (e.g. ``hCoV-19/USA/CA-SEARCH-65663/2022``)
    only the third field is returned.

    Args:
        path: FASTA file to read (decoded as UTF-8, undecodable bytes replaced).

    Returns:
        The sequence ID as a string, or ``None`` when the file has no
        non-blank line, its first non-blank line is not a header, or the
        header carries no ID token.
    """
    with path.open("r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if not line.startswith(">"):
                # First content line is not a header: nothing to extract.
                return None
            tokens = line[1:].split()
            if not tokens:
                # Bare ">" header: indexing [0] here used to raise IndexError.
                return None
            seq_id = tokens[0]
            fields = seq_id.split("/")
            return fields[2] if len(fields) == 4 else seq_id
    return None

def find_fname_sid_mapping(dir_path: str, exts=(".fasta",)):
    """Collect (file stem, first sequence ID) pairs for the FASTA files in a directory.

    Only regular files directly inside ``dir_path`` are considered (no
    recursion into sub-directories).

    Args:
        dir_path: Directory to scan.
        exts: Allowed file suffixes, compared case-insensitively. May be an
            iterable of suffixes or a single suffix string. A falsy value
            disables the suffix filter.

    Returns:
        A tuple ``(fname, sid)`` of two parallel lists: the file stems and the
        value returned by ``fasta_first_seq_id`` for each file (``None`` when
        no ID could be read).
    """
    # A bare string would make `suffix in exts` a substring test, letting
    # suffix-less files ("" in ".fasta") and ".fa" through; wrap it in a tuple.
    if isinstance(exts, str):
        exts = (exts,)
    allowed = {ext.lower() for ext in exts} if exts else None

    fname, sid = [], []

    for p in Path(dir_path).iterdir():  # no subdirs
        if not p.is_file():
            # Directories cannot be opened as FASTA files.
            continue
        if allowed is not None and p.suffix.lower() not in allowed:
            continue

        fname.append(p.stem)
        sid.append(fasta_first_seq_id(p))

    return fname, sid


In [8]:
# Similarly, overwrite the mismatched taxon with the ID read from the FASTA file.
fname, sid = find_fname_sid_mapping(FASTA_DIR)
dic_lineage = dict(zip(fname, sid))

# Series.map(dict) turns every taxon that is not a key of the dict into NaN
# (and files without a readable header map to None). Fall back to the original
# taxon in those cases so rows without a FASTA match keep their identifier
# instead of being blanked out in the cleaned file.
lineage["taxon"] = lineage["taxon"].map(dic_lineage).fillna(lineage["taxon"])

# Keep only the samples present in the mutation subset.
lineage_sampled = lineage[lineage["taxon"].isin(sra)]

lineage_sampled.to_csv(os.path.join(OUT_DIR, "lineage_sampled.csv"), index=False)
lineage.to_csv(os.path.join(IN_DIR, "lineage_cleaned.csv"), index=False)

## EVEscape

In [4]:
# Display the EVEscape table as loaded (pandas truncates the rendered rows).
evescape

Unnamed: 0,i,wt,mut,fitness_eve,dissimilarity_charge_hydro,accessibility_wcn,evescape
0,1,M,A,,-2.143469,,-2.437632
1,1,M,C,,-1.756578,,-2.362357
2,1,M,D,,1.435655,,-1.871061
3,1,M,E,,1.251421,,-1.893057
4,1,M,F,,-1.517074,,-2.317395
...,...,...,...,...,...,...,...
24182,1273,T,R,,2.614752,,-1.747551
24183,1273,T,S,,-2.014505,,-2.412182
24184,1273,T,V,,-0.835408,,-2.196479
24185,1273,T,W,,-1.148606,,-2.250724


In [8]:
# evescape is Hu-1 only
# and this file is for spike only, so the corresponding gff is YP_009724390.1

evescape["GFF_FEATURE"] = 'YP_009724390.1_NC_045512.2'

# Temporary join key: wt + position + mut. The position column must be
# converted element-wise; `str(series)` would embed the printed repr of the
# whole column in every label, so the merge below would never match and the
# assertion would pass vacuously.
evescape["mutation"] = evescape["wt"] + evescape["i"].astype(str) + evescape["mut"]

# check if GFFs match with the ones in mut file
# NOTE(review): the join uses the mutation label only, so identically named
# mutations from other features in `mut` are compared as well -- confirm that
# is the intended scope of this check.
merged = pd.merge(mut, evescape, on="mutation", how="inner")

assert (merged["GFF_FEATURE_x"] == merged["GFF_FEATURE_y"]).all()

# The join key is not part of the exported table.
evescape = evescape.drop(columns=["mutation"])
evescape.to_csv(os.path.join(OUT_DIR, "evescape.csv"), index=False)

In [9]:
# Display the table again to confirm the GFF_FEATURE column was added.
evescape

Unnamed: 0,i,wt,mut,fitness_eve,dissimilarity_charge_hydro,accessibility_wcn,evescape,GFF_FEATURE
0,1,M,A,,-2.143469,,-2.437632,YP_009724390.1_NC_045512.2
1,1,M,C,,-1.756578,,-2.362357,YP_009724390.1_NC_045512.2
2,1,M,D,,1.435655,,-1.871061,YP_009724390.1_NC_045512.2
3,1,M,E,,1.251421,,-1.893057,YP_009724390.1_NC_045512.2
4,1,M,F,,-1.517074,,-2.317395,YP_009724390.1_NC_045512.2
...,...,...,...,...,...,...,...,...
24182,1273,T,R,,2.614752,,-1.747551,YP_009724390.1_NC_045512.2
24183,1273,T,S,,-2.014505,,-2.412182,YP_009724390.1_NC_045512.2
24184,1273,T,V,,-0.835408,,-2.196479,YP_009724390.1_NC_045512.2
24185,1273,T,W,,-1.148606,,-2.250724,YP_009724390.1_NC_045512.2
