In [1]:
import os
import shutil
from collections import defaultdict

import pandas as pd
from Bio import SeqIO

# Folder names used by this notebook; all are relative to the current working directory.
METADATA_DIR = "Metadata"          # holds the metadata CSV read below
SNP_DIR = "SNP"                    # holds the input FASTA file
CLUSTER_TRIM_DIR = "ClusterTrim"   # output folder, one sub-folder per clade

# Start from an empty output folder on every run.
# WARNING: this deletes everything already in CLUSTER_TRIM_DIR.
if os.path.exists(CLUSTER_TRIM_DIR):
    shutil.rmtree(CLUSTER_TRIM_DIR)
os.mkdir(CLUSTER_TRIM_DIR)

In [2]:
# Load the metadata table; the first CSV column becomes the index
# (it is used as the accession key when building the clade lookup).
metadata_path = os.path.join(METADATA_DIR, "SARS_CoV_2.csv")
metadata = pd.read_csv(metadata_path, index_col=0)

# Rows without a clade assignment get the explicit label "unknown".
metadata["Nextstrain_clade"] = metadata["Nextstrain_clade"].fillna("unknown")

In [3]:
# Show the distinct clade labels present in the metadata (missing values appear as "unknown").
metadata["Nextstrain_clade"].unique()

array(['19A', '20A', '20C', '20B', '19B', '20G', '20D', '20F',
       '20E (EU1)', '20A.EU2', '20I/501Y.V1', '20H/501Y.V2', 'unknown'],
      dtype=object)

In [4]:
# Map every accession (the metadata index) to its Nextstrain clade label.
# No NaN check is needed: missing clades were replaced by "unknown" when the
# metadata was loaded, and groupby drops NaN keys by default, so a NaN clade
# can never reach this loop.
# Iteration order is kept (clade groups first, then the accessions inside each
# group), so a duplicated accession resolves to the same clade as before.
ac2clade = {
    ac: clade
    for clade, info in metadata.groupby("Nextstrain_clade")
    for ac in info.index
}

# 20aa cluster grouping: split the selected spike sequences by Nextstrain clade

In [5]:
# Bucket the records of spike_AA_selected_20.fasta by the clade of their accession.
# A record whose id is not a key of ac2clade raises KeyError.
groupedSeqs = defaultdict(list)

fasta_path = os.path.join(SNP_DIR, "spike_AA_selected_20.fasta")
for record in SeqIO.parse(fasta_path, "fasta"):
    groupedSeqs[ac2clade[record.id]].append(record)

In [6]:
# Write one FASTA file per clade: <CLUSTER_TRIM_DIR>/<clade>/sequences.fasta
for clade, seqs in groupedSeqs.items():
    # Clade labels can contain '/' and ' ' (e.g. "20I/501Y.V1", "20E (EU1)");
    # replace them so the label is usable as a single directory name.
    # A separate name is used so the loop variable keeps the original label.
    clade_dirname = clade.replace('/', '_').replace(' ', '_')
    outdir = os.path.join(CLUSTER_TRIM_DIR, clade_dirname)
    # exist_ok=True replaces the manual existence check; makedirs also creates
    # CLUSTER_TRIM_DIR itself if it is missing.
    os.makedirs(outdir, exist_ok=True)
    SeqIO.write(seqs, os.path.join(outdir, "sequences.fasta"), "fasta")