# Results
- See `results/influenza_qnet_predictions_YEAR.csv` and `results/dominant_sequences_YEAR.csv`
- Results are saved as CSV tables in the `tables` directory
- See this [link](https://www.fludb.org/brc/vaccineRecommend.spg?decorator=influenza#:~:text=From%20these%20data%2C%20the%20WHO,recommendation%20are%20also%20usually%20suggested.) for WHO recommendations and this [link](https://platform.epicov.org/epi3/frontend#507f8c) to search the sequences
- Tables 4 - 15 in the paper

In [239]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from Bio import SeqIO
import Levenshtein as lev

In [240]:
# Folders holding the raw FASTA files, one per source database.
NCBI_PATH = 'raw_data/ncbi/'
GISAID_PATH = 'raw_data/gisaid/'

# Keys of the single-cluster result tables (hemisphere_subtype_segment).
FILES = [
    'north_h1n1_ha', 'north_h1n1_na', 'north_h3n2_ha', 'north_h3n2_na',
    'south_h1n1_ha', 'south_h1n1_na', 'south_h3n2_ha', 'south_h3n2_na',
]

# Keys of the three-cluster result tables.
FILES_3CLUSTER = [
    'north_h1n1_na', 'north_h3n2_na',
    'south_h1n1_na', 'south_h3n2_na',
]

# Load every existing result table from disk, keyed by its file stem.
table_dict = {}
for FILE in FILES:
    table_dict[FILE] = pd.read_csv('tables/{}.csv'.format(FILE))

table_dict_3cluster = {}
for FILE in FILES_3CLUSTER:
    table_dict_3cluster[FILE] = pd.read_csv('tables/{}_3cluster.csv'.format(FILE))

In [241]:
def add_new_row(df, year, multi_cluster = False):
    """Append a placeholder row for `year` to `df` in place, if none exists.

    The new row holds `year` in the 'year' column and -1 in every other
    column. The 'year' column is located by name, so the row is correct
    whatever the column order of the table.

    Parameters:
        df: results table; must contain a 'year' column. Modified in place.
        year: season label to look for / insert (e.g. '2021_2022').
        multi_cluster: kept for backward compatibility. It used to select
            whether 'year' was assumed to be the first or the last column;
            it no longer affects the result.

    Returns:
        None.

    Raises:
        KeyError: if `df` has no 'year' column.
    """
    # Nothing to do when the season is already present.
    if (df['year'] == year).any():
        return
    # The new row is stored under the label df.shape[0], i.e. the next label
    # of a default 0..n-1 index.
    df.loc[df.shape[0]] = [year if column == 'year' else -1 for column in df.columns]

        
def add_dominant_sequence(df, name, year):
    """Copy the dominant strain of `name` for season `year` into `df` in place.

    Reads 'results/dominant_sequences_<year>.csv', takes the first row whose
    'strain' equals `name`, and writes its 'name' and 'sequence' values into
    the 'dominant_strain_accession_name' and 'dominant_strain_sequence'
    columns of every row of `df` whose 'year' equals `year`.

    Raises:
        IndexError: if the CSV has no row for `name`.
    """
    dominant = pd.read_csv('results/dominant_sequences_' + year + '.csv')
    hit = dominant.loc[dominant['strain'] == name]
    accession = hit['name'].iloc[0]
    sequence = hit['sequence'].iloc[0]

    in_season = df['year'] == year
    df.loc[in_season, ['dominant_strain_accession_name']] = accession
    df.loc[in_season, ['dominant_strain_sequence']] = sequence
    

def add_predicted_sequence(df, name, year, multi_cluster = False):
    """Copy the qnet prediction(s) of `name` for season `year` into `df` in place.

    Single cluster: reads 'results/influenza_qnet_predictions_<year>.csv' and
    copies its 'name' / 'sequence' values into
    'qdistance_recommendation_accession_name' /
    'qdistance_recommendation_sequence'.

    Multi cluster: reads 'results/influenza_qnet_predictions_3cluster_<year>.csv'
    and copies 'name k' / 'sequence k' (k = 0, 1, 2) into the same target
    columns suffixed with '_k'.

    In both cases the first CSV row whose 'strain' equals `name` is used and
    the values are written to every row of `df` whose 'year' equals `year`.

    Raises:
        IndexError: if the CSV has no row for `name`.
    """
    if multi_cluster:
        source = 'results/influenza_qnet_predictions_3cluster_' + year + '.csv'
        # (suffix of the CSV column, suffix of the table column)
        suffixes = [(' ' + str(k), '_' + str(k)) for k in range(3)]
    else:
        source = 'results/influenza_qnet_predictions_' + year + '.csv'
        suffixes = [('', '')]

    predictions = pd.read_csv(source)
    hit = predictions.loc[predictions['strain'] == name]

    # Look up every value before writing anything, names first and then
    # sequences, so a failed lookup leaves `df` untouched.
    field_map = (('name', 'qdistance_recommendation_accession_name'),
                 ('sequence', 'qdistance_recommendation_sequence'))
    pending = []
    for csv_field, table_field in field_map:
        for csv_suffix, table_suffix in suffixes:
            value = hit[csv_field + csv_suffix].values[0]
            pending.append((table_field + table_suffix, value))

    for column, value in pending:
        df.loc[df['year'] == year, [column]] = value
    
    
def add_who_qnet_errors(df, year, multi_cluster = False):
    """Store WHO and qnet Levenshtein errors for season `year` in `df` in place.

    Takes the first row of `df` whose 'year' equals `year` and reads its
    dominant, WHO and qnet recommendation sequences (one qnet sequence, or
    the three '_0' / '_1' / '_2' sequences when `multi_cluster` is true).
    All sequences are cut to the length of the shortest one, then the
    Levenshtein distance from the dominant sequence to each recommendation is
    written to 'ldistance_WHO' and 'ldistance_Qnet_recommendation[_k]'.

    Raises:
        IndexError: if `df` has no row for `year`.
        TypeError: if one of the sequence cells is not sized, e.g. still the
            -1 placeholder.
    """
    cluster_tags = ['_0', '_1', '_2'] if multi_cluster else ['']

    season = df.loc[df['year'] == year]
    dominant = season['dominant_strain_sequence'].values[0]
    who = season['WHO_recommendation_sequence'].values[0]
    recommended = [season['qdistance_recommendation_sequence' + tag].values[0]
                   for tag in cluster_tags]

    # Compare on a common prefix so length differences alone do not count
    # as edits.
    cut = min(len(seq) for seq in [dominant] + recommended + [who])
    reference = dominant[:cut]
    who_distance = lev.distance(reference, who[:cut])
    qnet_distances = [lev.distance(reference, seq[:cut]) for seq in recommended]

    df.loc[df['year'] == year, ['ldistance_WHO']] = who_distance
    for tag, distance in zip(cluster_tags, qnet_distances):
        df.loc[df['year'] == year, ['ldistance_Qnet_recommendation' + tag]] = distance
    
    
def add_qnet_sample_size(df, name, year):
    """Store the qnet sample size of `name` for season `year` in `df` in place.

    Counts the records of the GISAID FASTA file
    GISAID_PATH + '<name>_<yy>.fasta', where <yy> is characters 2-3 of `year`
    read as an integer, minus one (e.g. '2021_2022' -> '20'). The count,
    capped at 1000, is written to 'qnet_sample_size' for every row of `df`
    whose 'year' equals `year`.
    """
    prefix = GISAID_PATH + name + '_'
    fasta_path = prefix + str(int(year[2:4]) - 1) + '.fasta'
    n_records = sum(1 for _ in SeqIO.parse(fasta_path, 'fasta'))
    # NOTE(review): 1000 is presumably the maximum number of sequences used
    # to train a qnet -- confirm against the training code.
    df.loc[df['year'] == year, ['qnet_sample_size']] = min(1000, n_records)

## 2020-2021

### Single Cluster

In [242]:
YEAR = '2020_2021'

# One list per column of the summary table shown at the end of the cell.
who_seqs, dom_seqs, pred_seqs = [], [], []
who_errs, qnet_errs, qnet_sample_size = [], [], []

for FILE in FILES:
    df = table_dict[FILE]
    add_new_row(df, YEAR)
    add_dominant_sequence(df, FILE, YEAR)
    # Prediction and sample-size steps are disabled for this season, so the
    # qnet columns read below must already be present in the loaded table.
    # add_predicted_sequence(df, FILE, YEAR)
    add_who_qnet_errors(df, YEAR)
    # add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs.append(year_row['qdistance_recommendation_accession_name'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs.append(year_row['ldistance_Qnet_recommendation'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({
    'strain': FILES,
    'who': who_seqs,
    'dominant': dom_seqs,
    'qnet': pred_seqs,
    'who err': who_errs,
    'qnet err': qnet_errs,
    'qnet sample': qnet_sample_size,
})

Unnamed: 0,strain,who,dominant,qnet,who err,qnet err,qnet sample
0,north_h1n1_ha,A/Hawaii/70/2019,A/Togo/905/2020,A/Italy/8949/2019,4,8,-1
1,north_h1n1_na,A/Hawaii/70/2019,A/Ghana/119/2020,A/Texas/112/2019,0,5,-1
2,north_h3n2_ha,A/Hong Kong/2671/2019,A/India/Pun-NIV300460/2021_Apr,A/California/NHRC-OID_FDX100215/2019,16,14,-1
3,north_h3n2_na,A/Hong Kong/2671/2019,A/Kenya/122/2021,A/Maryland/02/2019,3,13,-1
4,south_h1n1_ha,A/Brisbane/02/2018,A/Cote_d'Ivoire/951/2020,A/Italy/8451/2019,8,6,-1
5,south_h1n1_na,A/Brisbane/02/2018,A/Srinagar/AG_659/2020,A/Texas/7939/2019,5,4,-1
6,south_h3n2_ha,A/South Australia/34/2019,A/Timor-Leste/2/2020,A/Kentucky/27/2019,9,11,-1
7,south_h3n2_na,A/South Australia/34/2019,A/Bangladesh/3009/2020,A/Washington/9757/2019,1,13,-1


### Multi-Cluster

In [243]:
# Set the season explicitly so this cell does not depend on whichever cell
# last assigned YEAR (the other multi-cluster cells do the same).
YEAR = '2020_2021'

# One list per column of the summary table shown at the end of the cell.
who_seqs = []
dom_seqs = []
pred_seqs_0 = []
pred_seqs_1 = []
pred_seqs_2 = []
who_errs = []
qnet_errs_0 = []
qnet_errs_1 = []
qnet_errs_2 = []
qnet_sample_size = []

for FILE in FILES_3CLUSTER:
    df = table_dict_3cluster[FILE]
    add_new_row(df, YEAR, multi_cluster=True)
    # The keys of FILES_3CLUSTER are already the strain names used in the
    # dominant-sequence CSV, so FILE is passed as is.
    add_dominant_sequence(df, FILE, YEAR)
    # Prediction and sample-size steps are disabled for this season, so the
    # qnet columns read below must already be present in the loaded table.
    # add_predicted_sequence(df, FILE, YEAR, multi_cluster=True)
    add_who_qnet_errors(df, YEAR, multi_cluster=True)
    # add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs_0.append(year_row['qdistance_recommendation_accession_name_0'].values[0])
    pred_seqs_1.append(year_row['qdistance_recommendation_accession_name_1'].values[0])
    pred_seqs_2.append(year_row['qdistance_recommendation_accession_name_2'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs_0.append(year_row['ldistance_Qnet_recommendation_0'].values[0])
    qnet_errs_1.append(year_row['ldistance_Qnet_recommendation_1'].values[0])
    qnet_errs_2.append(year_row['ldistance_Qnet_recommendation_2'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({'strain':FILES_3CLUSTER,
              'who':who_seqs,
              'dominant':dom_seqs,
              'qnet 0':pred_seqs_0,
              'qnet 1':pred_seqs_1,
              'qnet 2':pred_seqs_2,
              'who err':who_errs,
              'qnet err 0':qnet_errs_0,
              'qnet err 1':qnet_errs_1,
              'qnet err 2':qnet_errs_2,
              'qnet sample':qnet_sample_size})

Unnamed: 0,strain,who,dominant,qnet 0,qnet 1,qnet 2,who err,qnet err 0,qnet err 1,qnet err 2,qnet sample
0,north_h1n1_na,A/Hawaii/70/2019,A/Ghana/119/2020,A/California/NHRC-OID_BOX-ILI-0012/2019,A/Indiana/30/2019,A/Germany/9488/2019,0,3,8,4,-1
1,north_h3n2_na,A/Hong Kong/2671/2019,A/Kenya/122/2021,A/England/9738/2019,A/Washington/9757/2019,A/Minnesota/06/2019,3,1,13,9,-1
2,south_h1n1_na,A/Brisbane/02/2018,A/Srinagar/AG_659/2020,A/California/NHRC-OID_BOX-ILI-0012/2019,A/Indiana/30/2019,A/Germany/9488/2019,5,2,7,3,-1
3,south_h3n2_na,A/South Australia/34/2019,A/Bangladesh/3009/2020,A/England/9738/2019,A/Washington/9757/2019,A/Minnesota/06/2019,1,1,13,9,-1


## 2021-2022

### Single Cluster

In [244]:
YEAR = '2021_2022'

# Create the placeholder row for YEAR in every table first (a no-op when the
# row already exists). The masked .loc assignments below only update rows that
# are already present, so on a table without this season they would silently
# do nothing and the WHO columns would stay at the -1 placeholder.
for FILE in FILES:
    add_new_row(table_dict[FILE], YEAR)

# Manually add WHO recommendations
table_dict['north_h1n1_ha'].loc[table_dict['north_h1n1_ha']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict['north_h1n1_ha'].loc[table_dict['north_h1n1_ha']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MKAILVVMLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAPLHLGKCNIAGWILGNPECESLSTARSWSYIVETSNSDNGTCYPGDFINYEELREQLSSVSSFERFEIFPKTSSWPNHDSDNGVTAACPHAGAKSFYKNLIWLVKKGKSYPKINQTYINDKGKEVLVLWGIHHPPTIADQQSLYQNADAYVFVGTSRYSKKFKPEIATRPKVRDREGRMNYYWTLVEPGDKITFEATGNLVAPRYAFTMERDAGSGIIISDTPVHDCNTTCQTPEGAINTSLPFQNVHPITIGKCPKYVKSTKLRLATGLRNVPSIQSRGLFGAIAGFIEGGWTGMVDGWYGYHHQNEQGSGYAADLKSTQNAIDKITNKVNSVIEKMNTQFTAVGKEFNHLEKRIENLNKKVDDGFLDIWTYNAELLVLLENERTLDYHDSNVKNLYEKVRNQLKNNAKEIGNGCFEFYHKCDNTCMESVKNGTYDYPKYSEEAKLNREKIDGVKLDSTRIYQILAIYSTVASSLVLVVSLGAISFWMCSNGSLQCRICI'
table_dict['north_h1n1_na'].loc[table_dict['north_h1n1_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict['north_h1n1_na'].loc[table_dict['north_h1n1_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQIETCNKSVITYENNTWVNQTFVNISNTNSAARQSVASVKLAGNSSLCPVSGWAIYSKDNSVRIGSKGDVFVIREPFISCSPLECRTFFLTQGALLNDKHSNGTIKDRSPYRTLMSCPIGEVPSPYNSRFESVAWSASACHDGTNWLTIGISGPDSGAVAVLKYNGIITDTIKSWRNKILRTQESECACVNGSCFTIMTDGPSDGQASYKIFRIEKGKIIKSVEMKAPNYHYEECSCYPDSSEITCVCRDNWHGSNRPWVSFNQNLEYQMGYICSGVFGDNPRPNDKTGSCGPVSSNGANGVKGFSFKYGNGVWIGRTKSISSRKGFEMIWDPNGWTGTDNKFSKKQDIVGINEWSGYSGSFVQHPELTGLNCIRPCFWVELIRGRPEENTIWTSGSSISFCGVDSDIVGWSWPDGAELPFTIDK'
table_dict['north_h3n2_ha'].loc[table_dict['north_h3n2_ha']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Cambodia/e0826360/2020'
table_dict['north_h3n2_ha'].loc[table_dict['north_h3n2_ha']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MKTIIALSYILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVKQNGTSSACIRGSSSSFFSRLNWLTHLNYKYPALNVTMPNNEQFDKLYIWGVHHPRTDKDQIFLFAQPSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFKHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAMSCFLLCIALLGFIMWACQKGNIRCNICI'
table_dict['north_h3n2_na'].loc[table_dict['north_h3n2_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Cambodia/e0826360/2020'
table_dict['north_h3n2_na'].loc[table_dict['north_h3n2_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPPNNQVMLCEPTIIERNMTEIVYLTNTTIEKEICPKPAEYRNWSKPQCGITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDLDKCYQFALGQGTTLNNVHSNNTVRDRTPYRTLLMNELGVPFHLGTKQVCIAWSSSSCHDGKAWLHVCITGDDKNATASFIYNGRLVDSVVSWSNDILRTQESECVCINGTCTVVMTDGNATGKADTKILFIEEGKIVHTSKLSGSAQHVEECSCYPRYPGVRCVCRDNWKGSNRPIIDINIKDHSIVSRYVCSGLVGDTPRKSDSSSSSHCLNPNNEKGDHGVKGWAFDDGNDVWMGRTINETSRLGYETFKVVEGWSNPKSKLQINRQVIVDRGDRSGYSGIFSVEGKSCINRCFYVELIRGRKEETEVLWTSNSIVVFCGTSGTYGTGSWPDGANLSLMHI'
table_dict['south_h1n1_ha'].loc[table_dict['south_h1n1_ha']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict['south_h1n1_ha'].loc[table_dict['south_h1n1_ha']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MKAILVVMLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAPLHLGKCNIAGWILGNPECESLSTARSWSYIVETSNSDNGTCYPGDFINYEELREQLSSVSSFERFEIFPKTSSWPNHDSDNGVTAACPHAGAKSFYKNLIWLVKKGKSYPKINQTYINDKGKEVLVLWGIHHPPTIADQQSLYQNADAYVFVGTSRYSKKFKPEIATRPKVRDREGRMNYYWTLVEPGDKITFEATGNLVAPRYAFTMERDAGSGIIISDTPVHDCNTTCQTPEGAINTSLPFQNVHPITIGKCPKYVKSTKLRLATGLRNVPSIQSRGLFGAIAGFIEGGWTGMVDGWYGYHHQNEQGSGYAADLKSTQNAIDKITNKVNSVIEKMNTQFTAVGKEFNHLEKRIENLNKKVDDGFLDIWTYNAELLVLLENERTLDYHDSNVKNLYEKVRNQLKNNAKEIGNGCFEFYHKCDNTCMESVKNGTYDYPKYSEEAKLNREKIDGVKLDSTRIYQILAIYSTVASSLVLVVSLGAISFWMCSNGSLQCRICI'
table_dict['south_h1n1_na'].loc[table_dict['south_h1n1_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict['south_h1n1_na'].loc[table_dict['south_h1n1_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQIETCNKSVITYENNTWVNQTFVNISNTNSAARQSVASVKLAGNSSLCPVSGWAIYSKDNSVRIGSKGDVFVIREPFISCSPLECRTFFLTQGALLNDKHSNGTIKDRSPYRTLMSCPIGEVPSPYNSRFESVAWSASACHDGTNWLTIGISGPDSGAVAVLKYNGIITDTIKSWRNKILRTQESECACVNGSCFTIMTDGPSDGQASYKIFRIEKGKIIKSVEMKAPNYHYEECSCYPDSSEITCVCRDNWHGSNRPWVSFNQNLEYQMGYICSGVFGDNPRPNDKTGSCGPVSSNGANGVKGFSFKYGNGVWIGRTKSISSRKGFEMIWDPNGWTGTDNKFSKKQDIVGINEWSGYSGSFVQHPELTGLNCIRPCFWVELIRGRPEENTIWTSGSSISFCGVDSDIVGWSWPDGAELPFTIDK'
table_dict['south_h3n2_ha'].loc[table_dict['south_h3n2_ha']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Hong Kong/2671/2019'
table_dict['south_h3n2_ha'].loc[table_dict['south_h3n2_ha']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MKTIIALSYILCLVFTQKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGGNCTLIDALLGDPQCDGFQNKKWDLFVERSRAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWAGVTQNGKSFSCIRGSSSSFFSRLNWLTHLNYIYPALNVTMPNKEQFDKLYIWGVHHPVTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGFRPRIRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI'
table_dict['south_h3n2_na'].loc[table_dict['south_h3n2_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Hong Kong/2671/2019'
table_dict['south_h3n2_na'].loc[table_dict['south_h3n2_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPPNNQVMLCEPTIIERNITEIVYLTNTTIEKEICPKPAEYRNWSKPQCGITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDLDKCYQFALGQGTTLNNVHSNNTVRDRTPYRTLLMNELGVPFHLGTKQVCIAWSSSSCHDGKAWLHVCITGDDKNATASFIYNGRLVDSVVSWSNDILRTQESECVCINGTCTVVMTDGNATGKADTKILFIEEGKIVHTSKLSGSAQHVEECSCYPRYPGVRCVCRDNWKGSNRPIIDINIKDHSIVSSYVCSGLVGDTPRKSDSSSSSHCLNPNNEEGGHGVKGWAFDDGNDVWMGRTINETSRLGYETFKVVEGWSNPKSKLQINRQVIVDRGDRSGYSGIFSVEGKSCINRCFYVELIRGRKEETEVLWTSNSIVVFCGTSGTYGTGSWPDGADLNLMHT'

In [245]:
# One list per column of the summary table shown at the end of the cell.
who_seqs, dom_seqs, pred_seqs = [], [], []
who_errs, qnet_errs, qnet_sample_size = [], [], []

for FILE in FILES:
    df = table_dict[FILE]
    add_new_row(df, YEAR)
    add_dominant_sequence(df, FILE, YEAR)
    add_predicted_sequence(df, FILE, YEAR)
    # The errors are computed last because they read the sequences stored by
    # the two calls above.
    add_who_qnet_errors(df, YEAR)
    add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs.append(year_row['qdistance_recommendation_accession_name'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs.append(year_row['ldistance_Qnet_recommendation'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({
    'strain': FILES,
    'who': who_seqs,
    'dominant': dom_seqs,
    'qnet': pred_seqs,
    'who err': who_errs,
    'qnet err': qnet_errs,
    'qnet sample': qnet_sample_size,
})

Unnamed: 0,strain,who,dominant,qnet,who err,qnet err,qnet sample
0,north_h1n1_ha,A/Victoria/2570/2019,A/Ireland/20935/2022,A/Togo/45/2021,9,3,240
1,north_h1n1_na,A/Victoria/2570/2019,A/Cote_d'Ivoire/3729/2021,A/Togo/0071/2021,1,5,236
2,north_h3n2_ha,A/Cambodia/e0826360/2020,A/Human/New_York/PV60641/2022,A/India/Pun-NIV291000/2021_Jan,14,5,446
3,north_h3n2_na,A/Cambodia/e0826360/2020,A/Stockholm/10/2022,A/Darwin/9/2021,2,2,421
4,south_h1n1_ha,A/Victoria/2570/2019,A/Abidjan/457/2021,A/Togo/35/2021,9,4,31
5,south_h1n1_na,A/Victoria/2570/2019,A/Cote_D'Ivoire/1496/2021,A/Togo/0155/2021,1,7,29
6,south_h3n2_ha,A/Hong Kong/2671/2019,A/Darwin/9a/2021,A/India/PUN-NIV301718/2021,19,1,224
7,south_h3n2_na,A/Hong Kong/2671/2019,A/India/PUN-NIV301718/2021,A/Darwin/11/2021,6,1,218


### Multi-Cluster

In [246]:
YEAR = '2021_2022'

# Manually add WHO recommendations
# The placeholder row for YEAR is created first: the masked .loc assignments
# that follow only update rows that already exist in each table.
for FILE in FILES_3CLUSTER:
    df = table_dict_3cluster[FILE]
    add_new_row(df, YEAR, multi_cluster=True)
    
table_dict_3cluster['north_h1n1_na'].loc[table_dict_3cluster['north_h1n1_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict_3cluster['north_h1n1_na'].loc[table_dict_3cluster['north_h1n1_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQIETCNKSVITYENNTWVNQTFVNISNTNSAARQSVASVKLAGNSSLCPVSGWAIYSKDNSVRIGSKGDVFVIREPFISCSPLECRTFFLTQGALLNDKHSNGTIKDRSPYRTLMSCPIGEVPSPYNSRFESVAWSASACHDGTNWLTIGISGPDSGAVAVLKYNGIITDTIKSWRNKILRTQESECACVNGSCFTIMTDGPSDGQASYKIFRIEKGKIIKSVEMKAPNYHYEECSCYPDSSEITCVCRDNWHGSNRPWVSFNQNLEYQMGYICSGVFGDNPRPNDKTGSCGPVSSNGANGVKGFSFKYGNGVWIGRTKSISSRKGFEMIWDPNGWTGTDNKFSKKQDIVGINEWSGYSGSFVQHPELTGLNCIRPCFWVELIRGRPEENTIWTSGSSISFCGVDSDIVGWSWPDGAELPFTIDK'
table_dict_3cluster['north_h3n2_na'].loc[table_dict_3cluster['north_h3n2_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Cambodia/e0826360/2020'
table_dict_3cluster['north_h3n2_na'].loc[table_dict_3cluster['north_h3n2_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPPNNQVMLCEPTIIERNMTEIVYLTNTTIEKEICPKPAEYRNWSKPQCGITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDLDKCYQFALGQGTTLNNVHSNNTVRDRTPYRTLLMNELGVPFHLGTKQVCIAWSSSSCHDGKAWLHVCITGDDKNATASFIYNGRLVDSVVSWSNDILRTQESECVCINGTCTVVMTDGNATGKADTKILFIEEGKIVHTSKLSGSAQHVEECSCYPRYPGVRCVCRDNWKGSNRPIIDINIKDHSIVSRYVCSGLVGDTPRKSDSSSSSHCLNPNNEKGDHGVKGWAFDDGNDVWMGRTINETSRLGYETFKVVEGWSNPKSKLQINRQVIVDRGDRSGYSGIFSVEGKSCINRCFYVELIRGRKEETEVLWTSNSIVVFCGTSGTYGTGSWPDGANLSLMHI'
table_dict_3cluster['south_h1n1_na'].loc[table_dict_3cluster['south_h1n1_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Victoria/2570/2019'
table_dict_3cluster['south_h1n1_na'].loc[table_dict_3cluster['south_h1n1_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQIETCNKSVITYENNTWVNQTFVNISNTNSAARQSVASVKLAGNSSLCPVSGWAIYSKDNSVRIGSKGDVFVIREPFISCSPLECRTFFLTQGALLNDKHSNGTIKDRSPYRTLMSCPIGEVPSPYNSRFESVAWSASACHDGTNWLTIGISGPDSGAVAVLKYNGIITDTIKSWRNKILRTQESECACVNGSCFTIMTDGPSDGQASYKIFRIEKGKIIKSVEMKAPNYHYEECSCYPDSSEITCVCRDNWHGSNRPWVSFNQNLEYQMGYICSGVFGDNPRPNDKTGSCGPVSSNGANGVKGFSFKYGNGVWIGRTKSISSRKGFEMIWDPNGWTGTDNKFSKKQDIVGINEWSGYSGSFVQHPELTGLNCIRPCFWVELIRGRPEENTIWTSGSSISFCGVDSDIVGWSWPDGAELPFTIDK'
table_dict_3cluster['south_h3n2_na'].loc[table_dict_3cluster['south_h3n2_na']['year'] == YEAR, ['WHO_recommendation_name']] = 'A/Hong Kong/2671/2019'
table_dict_3cluster['south_h3n2_na'].loc[table_dict_3cluster['south_h3n2_na']['year'] == YEAR, ['WHO_recommendation_sequence']] = 'MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPPNNQVMLCEPTIIERNITEIVYLTNTTIEKEICPKPAEYRNWSKPQCGITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDLDKCYQFALGQGTTLNNVHSNNTVRDRTPYRTLLMNELGVPFHLGTKQVCIAWSSSSCHDGKAWLHVCITGDDKNATASFIYNGRLVDSVVSWSNDILRTQESECVCINGTCTVVMTDGNATGKADTKILFIEEGKIVHTSKLSGSAQHVEECSCYPRYPGVRCVCRDNWKGSNRPIIDINIKDHSIVSSYVCSGLVGDTPRKSDSSSSSHCLNPNNEEGGHGVKGWAFDDGNDVWMGRTINETSRLGYETFKVVEGWSNPKSKLQINRQVIVDRGDRSGYSGIFSVEGKSCINRCFYVELIRGRKEETEVLWTSNSIVVFCGTSGTYGTGSWPDGADLNLMHT'

In [247]:
# One list per column of the summary table shown at the end of the cell.
who_seqs, dom_seqs = [], []
pred_seqs_0, pred_seqs_1, pred_seqs_2 = [], [], []
who_errs = []
qnet_errs_0, qnet_errs_1, qnet_errs_2 = [], [], []
qnet_sample_size = []

for FILE in FILES_3CLUSTER:
    df = table_dict_3cluster[FILE]
    add_new_row(df, YEAR, multi_cluster=True)
    add_dominant_sequence(df, FILE, YEAR)
    add_predicted_sequence(df, FILE, YEAR, multi_cluster=True)
    # The errors are computed last because they read the sequences stored by
    # the two calls above.
    add_who_qnet_errors(df, YEAR, multi_cluster=True)
    add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs_0.append(year_row['qdistance_recommendation_accession_name_0'].values[0])
    pred_seqs_1.append(year_row['qdistance_recommendation_accession_name_1'].values[0])
    pred_seqs_2.append(year_row['qdistance_recommendation_accession_name_2'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs_0.append(year_row['ldistance_Qnet_recommendation_0'].values[0])
    qnet_errs_1.append(year_row['ldistance_Qnet_recommendation_1'].values[0])
    qnet_errs_2.append(year_row['ldistance_Qnet_recommendation_2'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({
    'strain': FILES_3CLUSTER,
    'who': who_seqs,
    'dominant': dom_seqs,
    'qnet 0': pred_seqs_0,
    'qnet 1': pred_seqs_1,
    'qnet 2': pred_seqs_2,
    'who err': who_errs,
    'qnet err 0': qnet_errs_0,
    'qnet err 1': qnet_errs_1,
    'qnet err 2': qnet_errs_2,
    'qnet sample': qnet_sample_size,
})

Unnamed: 0,strain,who,dominant,qnet 0,qnet 1,qnet 2,who err,qnet err 0,qnet err 1,qnet err 2,qnet sample
0,north_h1n1_na,A/Victoria/2570/2019,A/Cote_d'Ivoire/3729/2021,A/Togo/0071/2021,A/Yunnan-Mengzi/1462/2020,A/North_Carolina/15/2020,1,5,51,98,236
1,north_h3n2_na,A/Cambodia/e0826360/2020,A/Stockholm/10/2022,A/Laos/527/2021,A/Michigan/UOM10045655748/2020,A/Wisconsin/01/2021,2,3,7,58,421
2,south_h1n1_na,A/Victoria/2570/2019,A/Cote_D'Ivoire/1496/2021,A/Togo/0155/2021,A/Shandong/00204/2021,A/North_Carolina/15/2020,1,7,58,98,29
3,south_h3n2_na,A/Hong Kong/2671/2019,A/India/PUN-NIV301718/2021,A/Darwin/11/2021,A/Hawaii/28/2020,A/South_Australia/1/2021,6,1,49,57,218


## 2022-2023

### Single Cluster

In [248]:
YEAR = '2022_2023'

# Manually add WHO recommendations
# Both WHO columns are set to the -1 placeholder for this season in every
# single-cluster table (FILES lists the same eight tables, in the same order).
for FILE in FILES:
    table_dict[FILE].loc[table_dict[FILE]['year'] == YEAR, ['WHO_recommendation_name']] = -1
    table_dict[FILE].loc[table_dict[FILE]['year'] == YEAR, ['WHO_recommendation_sequence']] = -1

In [249]:
# One list per column of the summary table shown at the end of the cell.
who_seqs, dom_seqs, pred_seqs = [], [], []
who_errs, qnet_errs, qnet_sample_size = [], [], []

for FILE in FILES:
    df = table_dict[FILE]
    add_new_row(df, YEAR)
    # The dominant-sequence and error steps are disabled for this season, so
    # those columns keep whatever the table already holds (the -1 placeholder
    # for a freshly added row).
    # add_dominant_sequence(df, FILE, YEAR)
    add_predicted_sequence(df, FILE, YEAR)
    # add_who_qnet_errors(df, YEAR)
    add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs.append(year_row['qdistance_recommendation_accession_name'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs.append(year_row['ldistance_Qnet_recommendation'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({
    'strain': FILES,
    'who': who_seqs,
    'dominant': dom_seqs,
    'qnet': pred_seqs,
    'who err': who_errs,
    'qnet err': qnet_errs,
    'qnet sample': qnet_sample_size,
})

Unnamed: 0,strain,who,dominant,qnet,who err,qnet err,qnet sample
0,north_h1n1_ha,-1,-1,A/Netherlands/00068/2022,-1,-1,976
1,north_h1n1_na,-1,-1,A/Lyon/820/2021,-1,-1,961
2,north_h3n2_ha,-1,-1,A/Denmark/370/2022,-1,-1,1000
3,north_h3n2_na,-1,-1,A/Michigan/UOM10042819294/2021,-1,-1,1000
4,south_h1n1_ha,-1,-1,A/Cote_D'Ivoire/1270/2021,-1,-1,281
5,south_h1n1_na,-1,-1,A/Dakar/35/2021,-1,-1,264
6,south_h3n2_ha,-1,-1,A/Saint-Martin/00754/2022,-1,-1,641
7,south_h3n2_na,-1,-1,A/Texas/12723/2022,-1,-1,628


### Multi-Cluster

In [250]:
YEAR = '2022_2023'

# Manually add WHO recommendations
# For each three-cluster table: make sure the row for this season exists,
# then set both WHO columns to the -1 placeholder.
for FILE in FILES_3CLUSTER:
    df = table_dict_3cluster[FILE]
    add_new_row(df, YEAR, multi_cluster=True)
    df.loc[df['year'] == YEAR, ['WHO_recommendation_name']] = -1
    df.loc[df['year'] == YEAR, ['WHO_recommendation_sequence']] = -1

In [251]:
# One list per column of the summary table shown at the end of the cell.
who_seqs, dom_seqs = [], []
pred_seqs_0, pred_seqs_1, pred_seqs_2 = [], [], []
who_errs = []
qnet_errs_0, qnet_errs_1, qnet_errs_2 = [], [], []
qnet_sample_size = []

for FILE in FILES_3CLUSTER:
    df = table_dict_3cluster[FILE]
    add_new_row(df, YEAR, multi_cluster=True)
    # The dominant-sequence and error steps are disabled for this season, so
    # those columns keep whatever the table already holds (the -1 placeholder
    # for a freshly added row).
    # add_dominant_sequence(df, FILE, YEAR)
    add_predicted_sequence(df, FILE, YEAR, multi_cluster=True)
    # add_who_qnet_errors(df, YEAR, multi_cluster=True)
    add_qnet_sample_size(df, FILE, YEAR)

    # items to display
    year_row = df.loc[df['year'] == YEAR]
    who_seqs.append(year_row['WHO_recommendation_name'].values[0])
    dom_seqs.append(year_row['dominant_strain_accession_name'].values[0])
    pred_seqs_0.append(year_row['qdistance_recommendation_accession_name_0'].values[0])
    pred_seqs_1.append(year_row['qdistance_recommendation_accession_name_1'].values[0])
    pred_seqs_2.append(year_row['qdistance_recommendation_accession_name_2'].values[0])
    who_errs.append(year_row['ldistance_WHO'].values[0])
    qnet_errs_0.append(year_row['ldistance_Qnet_recommendation_0'].values[0])
    qnet_errs_1.append(year_row['ldistance_Qnet_recommendation_1'].values[0])
    qnet_errs_2.append(year_row['ldistance_Qnet_recommendation_2'].values[0])
    qnet_sample_size.append(year_row['qnet_sample_size'].values[0])

pd.DataFrame({
    'strain': FILES_3CLUSTER,
    'who': who_seqs,
    'dominant': dom_seqs,
    'qnet 0': pred_seqs_0,
    'qnet 1': pred_seqs_1,
    'qnet 2': pred_seqs_2,
    'who err': who_errs,
    'qnet err 0': qnet_errs_0,
    'qnet err 1': qnet_errs_1,
    'qnet err 2': qnet_errs_2,
    'qnet sample': qnet_sample_size,
})

Unnamed: 0,strain,who,dominant,qnet 0,qnet 1,qnet 2,who err,qnet err 0,qnet err 1,qnet err 2,qnet sample
0,north_h1n1_na,-1,-1,A/Netherlands/10646/2022,A/Sydney/234/2022,A/Wisconsin/03/2021,-1,-1,-1,-1,961
1,north_h3n2_na,-1,-1,A/Maine/02/2022,A/Michigan/UOM10042819294/2021,A/Netherlands/10082/2022,-1,-1,-1,-1,1000
2,south_h1n1_na,-1,-1,A/Switzerland/86136/2022,A/Wisconsin/04/2021,A/Wisconsin/05/2021,-1,-1,-1,-1,264
3,south_h3n2_na,-1,-1,A/Congo/313/2021,A/Texas/12723/2022,A/Netherlands/00037/2022,-1,-1,-1,-1,628


## Save Tables

In [252]:
# Write every result table back to the CSV file it was loaded from.
# The output directory only needs to be ensured once, not once per table.
os.makedirs('tables', exist_ok=True)

for FILE in FILES:
    table_dict[FILE].to_csv('tables/' + FILE + '.csv', index=False)

for FILE in FILES_3CLUSTER:
    table_dict_3cluster[FILE].to_csv('tables/' + FILE + '_3cluster.csv', index=False)

## Total Data Retrieved (SI Table 3)

In [253]:
# Count the sequences retrieved from each database, per strain and segment.
STRAINS = ['h1n1_ha', 'h1n1_na', 'h3n2_ha', 'h3n2_na']
# File-name suffixes of the GISAID FASTA files that are counted.
GISAID_YEARS = ['_20', '_21']
ncbi_counts = []
gisaid_counts = []

for STRAIN in STRAINS:
    # ncbi: one FASTA file per strain
    NCBI_DIR = NCBI_PATH + STRAIN + '.fasta'
    ncbi_num = sum(1 for _ in SeqIO.parse(NCBI_DIR, 'fasta'))
    ncbi_counts.append(ncbi_num)

    # gisaid: one FASTA file per table key (north/south) and year suffix
    gisaid_num = 0
    for FILE in FILES:
        if STRAIN in FILE:
            # The loop variable is deliberately not called YEAR, so the
            # season label stored in YEAR by the cells above is not
            # overwritten with a file suffix.
            for GISAID_YEAR in GISAID_YEARS:
                GISAID_DIR = GISAID_PATH + FILE + GISAID_YEAR + '.fasta'
                gisaid_num += sum(1 for _ in SeqIO.parse(GISAID_DIR, 'fasta'))
    gisaid_counts.append(gisaid_num)

total_seqs = pd.DataFrame({'database':len(STRAINS)*['NCBI'] + len(STRAINS)*['GISAID'],
                           'strain':STRAINS + STRAINS,
                           'no. seqs':ncbi_counts + gisaid_counts})
# Append a grand-total row below the per-strain counts.
total_seqs.loc[total_seqs.shape[0]] = ['Total','',total_seqs['no. seqs'].sum()]
total_seqs

Unnamed: 0,database,strain,no. seqs
0,NCBI,h1n1_ha,17894
1,NCBI,h1n1_na,16637
2,NCBI,h3n2_ha,18265
3,NCBI,h3n2_na,14699
4,GISAID,h1n1_ha,1528
5,GISAID,h1n1_na,1490
6,GISAID,h3n2_ha,13975
7,GISAID,h3n2_na,13811
8,Total,,98299
