## Figure 2C: Phylogenetic tree of viruses with clade-defining substitution(s).

#### This notebook prepares additional metadata (local and context clade assignments) for Auspice.

In [13]:
import os
import pandas as pd
from Bio import SeqIO

In [14]:
# Metadata table for the NYC Omicron sequences, indexed by sequence name.
meta = pd.read_csv(
    '../data/rawdata/nyc_omicron_metadata.csv',
    index_col='seqName',
)

In [15]:
# Nextclade results (tab-separated) for the NYC sequences, indexed by
# sequence name.
nextclade = pd.read_csv(
    '../data/nextclade_nyc/nextclade.tsv',
    delimiter='\t',
    index_col='seqName',
)

In [16]:
# Assign every sequence in `nextclade` to exactly one clade based on its
# nucleotide substitutions. Markers are tested in the order A, B, C, D and the
# first clade whose markers are ALL present wins; anything else is 'Others'.
clade_markers = (
    ('A', {'G5515T'}),
    ('B', {'G5924A'}),
    ('C', {'T10135C', 'C25708T', 'A29301G'}),
    ('D', {'C2470T', 'G22599A'}),
)

# Key order (A, B, C, D, Others) is kept because later cells iterate this dict.
clades = {label: [] for label, _ in clade_markers}
clades['Others'] = []

# Iterating the column directly avoids one `.loc` lookup per row and keeps
# working if a sequence name occurs more than once in the index.
for seq_name, raw_substitutions in nextclade['substitutions'].items():
    # An empty field is read by pandas as NaN (a float), which has no
    # `.split`; treat it as "no substitutions" so the sequence ends up in
    # 'Others' instead of raising AttributeError.
    if isinstance(raw_substitutions, str):
        substitutions = set(raw_substitutions.split(','))
    else:
        substitutions = set()

    for label, required in clade_markers:
        if required <= substitutions:
            clades[label].append(seq_name)
            break
    else:
        clades['Others'].append(seq_name)

In [17]:
# Names (full FASTA description lines) of global sequences that contain at
# least 29,000 unambiguous bases, counting uppercase A/T/G/C only.
with open('../data/rawdata/global_sequences.fasta') as fasta_handle:
    seqs_gl = [
        record.description
        for record in SeqIO.parse(fasta_handle, 'fasta')
        if sum(str(record.seq).count(base) for base in 'ATGC') >= 29000
    ]

# Global metadata, restricted to rows with a date and complete exposure
# location fields.
required_meta_cols = ['date', 'region_exposure', 'country_exposure', 'division_exposure']
meta_gl = pd.read_csv(
    '../data/rawdata/global_metadata.tsv',
    delimiter='\t',
    index_col='strain',
)
meta_gl = meta_gl.dropna(subset=required_meta_cols)

# Global Nextclade results: drop rows without substitutions, then keep only
# sequences that pass Nextclade QC, pass the length filter above, and still
# have a metadata row.
nextclade_gl = pd.read_csv(
    '../data/nextclade_global/nextclade.tsv',
    delimiter='\t',
    index_col='seqName',
)
nextclade_gl = nextclade_gl.dropna(subset=['substitutions'])

passes_qc = nextclade_gl['qc.overallStatus'] == 'good'
passes_length = nextclade_gl.index.isin(seqs_gl)
has_metadata = nextclade_gl.index.isin(meta_gl.index)
nextclade_gl = nextclade_gl[passes_qc & passes_length & has_metadata]

In [18]:
# Marker substitutions per clade; a sequence belongs to a clade when it
# carries ALL of that clade's markers.
markers_gl = {
    'A': {'G5515T'},
    'B': {'G5924A'},
    'C': {'T10135C', 'C25708T', 'A29301G'},
    'D': {'C2470T', 'G22599A'},
}

clades_gl = {label: [] for label in markers_gl}

# Unlike a first-match assignment, the clades are tested independently here,
# so one global sequence may be listed under several clades.
for strain in nextclade_gl.index:
    found = set(nextclade_gl.loc[strain, 'substitutions'].split(','))
    for label, required in markers_gl.items():
        if required <= found:
            clades_gl[label].append(strain)

In [19]:
# Collect global "context" sequences: for each clade, keep the global
# sequences carrying that clade's markers whose date is earlier than the
# earliest collection date among the local sequences of the same clade.
# NOTE(review): the two date columns are compared exactly as stored in their
# tables; this is only chronological if both share a sortable format
# (e.g. ISO 8601 strings) — confirm.
seqs_context = []
seen_context = set()

for c in clades_gl:
    local_dates = meta[meta.index.isin(clades[c])]['Date of Collection'].sort_values()
    if local_dates.empty:
        # No local sequence in this clade, so there is no cut-off date.
        continue
    # `.iloc[0]` makes the positional access explicit; plain `[0]` on a
    # label-indexed Series relies on a deprecated positional fallback.
    earliest_date = local_dates.iloc[0]
    for i in clades_gl[c]:
        # A global sequence can appear in several clade lists; record it once
        # so the exported metadata has no duplicated strain rows.
        if i not in seen_context and meta_gl.loc[i, 'date'] < earliest_date:
            seen_context.add(i)
            seqs_context.append(i)

In [20]:
# Hex colour used for each clade label in the exported metadata.
cols = {
    'A': '#6098C5',
    'B': '#FD9E31',
    'C': '#9370DB',
    'D': '#EE6553',
    'Global': '#B0C4DE'
}

# Write a three-column CSV (strain, clade, clade__color): first the local
# sequences of clades A-D, then the global context sequences.
out_path = '../../nextstrain_workflow/auspice_metadata/introduction_clades.csv'
with open(out_path, 'w') as out:
    out.write('strain,clade,clade__color\n')

    for label, members in clades.items():
        if label == 'Others':
            # Unassigned local sequences get no clade annotation.
            continue
        for strain in members:
            out.write(','.join((strain, label, cols[label])) + '\n')

    for strain in seqs_context:
        # The first 8 characters of the global name are dropped.
        # NOTE(review): presumably a fixed prefix such as 'hCoV-19/' — confirm
        # that every global strain name starts with it.
        out.write(','.join((strain[8:], 'Global', cols['Global'])) + '\n')