# Enet Predictions vs. WHO Predictions
Compare Enet predictions and WHO predictions.
- For each season, take the average Hamming distance between the predictions and all strains
- Truncate sequence to HA 565, NA 468
- WHO predictions from [WHO vaccine recommendations](https://www.who.int/teams/global-influenza-programme/vaccines/who-recommendations/recommendations-for-influenza-vaccine-composition-archive)

In [3]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from distance import hamming as distance
pd.set_option('display.max_columns', None)
from domseq import DomSeq
from zedstat.textable import textable


# Input and output locations, relative to the directory the notebook runs from
WHO_DIR = 'data/who/'
SEASONAL_DIR = 'results/enet_predictions/seasonal_predictions/'
PRED_DIR = 'results/enet_predictions/'
OUT_DIR = 'results/enet_who_comparison/'
TAB_DIR = 'tables/plotdata/'
# Both directories are written to by later cells, so create both up front
# (only OUT_DIR used to be created, leaving writes into TAB_DIR to fail on a
# fresh checkout).
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(TAB_DIR, exist_ok=True)

# File-name stems, one per hemisphere/subtype combination
FILES = ['north_h1n1', 'north_h3n2', 'south_h1n1', 'south_h3n2']

# Northern season labels span two calendar years: '03_04', '04_05', ..., '23_24'
NORTH_YEARS = [f'{year:02d}_{year + 1:02d}' for year in range(3, 24)]

# Southern season labels are a single zero-padded year: '03', '04', ..., '23'
SOUTH_YEARS = [f'{year:02d}' for year in range(3, 24)]

NA_TRUNC = 468 # 2 less than official length of 470
HA_TRUNC = 565 # 2 less than official length of 567

## Compute Enet and WHO Errors
For each season, take the average Hamming distance between the prediction and all strains
- `season`: 03-04 through 23-24 for north, 03 through 23 for south
- Do this for both two-cluster (multicluster) and single-cluster predictions
    - Two-cluster: `results/enet_predictions/<hemisphere>_<subtype>_predictions.csv`
    - Single-cluster: `results/enet_predictions/<hemisphere>_<subtype>_predictions_single_cluster.csv`


## Two-Cluster Predictions vs. WHO
- For each sequence in a season, take the minimum Hamming distance between that sequence and our predictions from the two largest clusters
- **Note: in the 2009-10 H1N1 Northern flu season, we outperform WHO by 423.79 edits**
    - Our predictions are A/Hawaii/02/2008,2008-01-17 (ACF10336.1, cluster size 563) and A/Hong Kong/H090-751-V3 (EPI326467, cluster size 23)
    - The latter is 423.79 edits closer to the population than WHO
    - The sequence was collected on 2009-02-08 (technically before the WHO recommendation date) but was only submitted on 2011-07-08, so we exclude it from our report

In [2]:
def _mean_min_hamming(sequences, candidates, trunc):
    """Return the mean Hamming distance from each usable sequence to its closest candidate.

    Args:
        sequences: iterable of population sequences (one column of a season's data).
        candidates: recommendation sequences, each already cut to ``trunc`` characters.
        trunc: number of leading positions that are compared.

    Returns:
        The average, over usable sequences, of the smallest distance to any
        candidate; NaN when no sequence is usable (this used to raise
        ZeroDivisionError).
    """
    total = 0
    n_used = 0
    for seq in sequences:
        # Empty CSV cells are read as NaN (a float) and sequences shorter than
        # ``trunc`` cannot be compared over the full length, so neither counts.
        if not isinstance(seq, str) or len(seq) < trunc:
            continue
        head = seq[:trunc]
        total += min(distance(candidate, head) for candidate in candidates)
        n_used += 1
    return total / n_used if n_used else np.nan


for FILE in FILES:
    # Prepare prediction DataFrame
    df = pd.read_csv(PRED_DIR + FILE + '_predictions.csv', converters={'season': str})
    # WHO recommendation name and both sequences live in the same file: read it once.
    # The columns are attached by row position/index, exactly as before.
    who_df = pd.read_csv(WHO_DIR + FILE + '.csv')
    df['name_who'] = who_df['who_recommendation_name']
    df['ha_seq_who'] = who_df['ha_sequence']
    df['na_seq_who'] = who_df['na_sequence']

    # Compute errors (one entry per row of df, in row order)
    enet_errors_ha = []
    enet_errors_na = []
    who_errors_ha = []
    who_errors_na = []

    for season in df['season'].values:
        season_str = str(season)
        if len(season_str) == 1:
            season_str = '0' + season_str

        # Read all sequences
        DATA_DIR = 'data/merged/' + FILE + '/' + FILE + '_' + season_str + '.csv'
        if not os.path.isfile(DATA_DIR):
            # No population data for this season: keep the -1 placeholder that
            # the output CSVs have always used.
            enet_errors_ha.append(-1)
            enet_errors_na.append(-1)
            who_errors_ha.append(-1)
            who_errors_na.append(-1)
            continue
        seq_df = pd.read_csv(DATA_DIR)

        # Access Enet and WHO recommendations (first row matching the season),
        # looked up once instead of once per column
        row = df[df['season'] == season].iloc[0]
        enet_ha = [row['ha_seq_0'][:HA_TRUNC], row['ha_seq_1'][:HA_TRUNC]]
        enet_na = [row['na_seq_0'][:NA_TRUNC], row['na_seq_1'][:NA_TRUNC]]
        who_ha = [row['ha_seq_who'][:HA_TRUNC]]
        who_na = [row['na_seq_who'][:NA_TRUNC]]

        # Enet error uses the closer of the two cluster predictions per sequence;
        # WHO error uses the single WHO recommendation
        enet_errors_ha.append(_mean_min_hamming(seq_df['sequence'], enet_ha, HA_TRUNC))
        enet_errors_na.append(_mean_min_hamming(seq_df['sequence_na'], enet_na, NA_TRUNC))
        who_errors_ha.append(_mean_min_hamming(seq_df['sequence'], who_ha, HA_TRUNC))
        who_errors_na.append(_mean_min_hamming(seq_df['sequence_na'], who_na, NA_TRUNC))

    # Add to dataframe
    df['ha_who_error'] = who_errors_ha
    df['na_who_error'] = who_errors_na
    df['ha_enet_error'] = enet_errors_ha
    df['na_enet_error'] = enet_errors_na
    df.to_csv(OUT_DIR + FILE + '.csv', index=False)
    

# Error differences between Enet and WHO
# Positive means we are better, negative means WHO better
def _who_minus_enet(results, gene):
    """Return WHO error minus Enet error for ``gene`` ('ha' or 'na'), NaN where there is no data.

    The per-season result files use -1 as a placeholder when a season's data
    file was missing. Subtracting two placeholders gives 0, which would be
    counted as a real "no improvement" season in the averages below, so those
    rows are masked out instead.
    """
    who = results[gene + '_who_error']
    enet = results[gene + '_enet_error']
    return (who - enet).mask((who < 0) | (enet < 0))


north_errors_df = pd.DataFrame({'north_season':NORTH_YEARS})
for FILE in FILES[:2]:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    north_errors_df[FILE + '_ha'] = _who_minus_enet(df, 'ha')
    north_errors_df[FILE + '_na'] = _who_minus_enet(df, 'na')

south_errors_df = pd.DataFrame({'south_season':SOUTH_YEARS})
for FILE in FILES[2:]:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    south_errors_df[FILE + '_ha'] = _who_minus_enet(df, 'ha')
    south_errors_df[FILE + '_na'] = _who_minus_enet(df, 'na')

# Side-by-side table: north and south rows are paired by row position
errors_df = north_errors_df.join(south_errors_df, how='outer')
errors_df.to_csv(OUT_DIR + 'errors_difference.csv', index=False)
print('Average Improvement')
# Series.mean() skips NaN, so seasons without data do not dilute the average
print(f'\tNorth H1N1 HA: {errors_df["north_h1n1_ha"].mean():.3f}')
print(f'\tNorth H3N2 HA: {errors_df["north_h3n2_ha"].mean():.3f}')
print(f'\tSouth H1N1 HA: {errors_df["south_h1n1_ha"].mean():.3f}')
print(f'\tSouth H3N2 HA: {errors_df["south_h3n2_ha"].mean():.3f}')
errors_df[['north_season', 'north_h1n1_ha', 'north_h3n2_ha', 
           'south_season', 'south_h1n1_ha', 'south_h3n2_ha']].round(2)

Average Improvement
	North H1N1 HA: 24.430
	North H3N2 HA: 4.414
	South H1N1 HA: 5.038
	South H3N2 HA: 3.272


Unnamed: 0,north_season,north_h1n1_ha,north_h3n2_ha,south_season,south_h1n1_ha,south_h3n2_ha
0,03_04,3.0,22.72,3,4.31,19.36
1,04_05,-2.29,4.75,4,2.5,2.03
2,05_06,4.18,7.9,5,2.94,2.63
3,06_07,4.75,5.22,6,3.3,10.7
4,07_08,-0.62,6.85,7,6.47,3.74
5,08_09,1.14,2.62,8,6.42,-0.88
6,09_10,423.79,1.97,9,0.87,2.0
7,10_11,3.09,2.41,10,7.35,0.03
8,11_12,5.18,2.85,11,3.75,4.52
9,12_13,4.31,-1.01,12,5.74,5.16


In [4]:
# Reformat for plotdata in paper: same per-file results, with the two HA error
# columns renamed, written next to the other plot tables
for FILE in FILES:
    plot_df = pd.read_csv(OUT_DIR + FILE + '.csv').rename(columns={
        'ha_who_error': 'ldistance_WHO',
        'ha_enet_error': 'ldistance_Qnet_recommendation',
    })
    plot_df.to_csv(TAB_DIR + FILE + '_two.csv', index=False)

### 2009-10 H1N1 Northern flu season

Hamming distance between A/Hong Kong/H090-751-V3 (EPI326467) and [pdm09](https://www.ncbi.nlm.nih.gov/nuccore/NC_026433.1).

In [86]:
# Reference pdm09 HA protein sequence.
# NOTE(review): the literal contains 'XX' (...KVRXXEGR...); unless the compared
# sequence also has 'X' at those two positions they count toward the distance
# below. Confirm this is intended.
pdm09_ha_seq = 'MKAILVVLLYTFATANADTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAPLHLGKCNIAGWILGNPECESLSTASSWSYIVETPSSDNGTCYPGDFIDYEELREQLSSVSSFERFEIFPKTSSWPNHDSNKGVTAACPHAGAKSFYKNLIWLVKKGNSYPKLSKSYINDKGKEVLVLWGIHHPSTSADQQSLYQNADAYVFVGSSRYSKKFKPEIAIRPKVRXXEGRMNYYWTLVEPGDKITFEATGNLVVPRYAFAMERNAGSGIIISDTPVHDCNTTCQTPKGAINTSLPFQNIHPITIGKCPKYVKSTKLRLATGLRNIPSIQSRGLFGAIAGFIEGGWTGMVDGWYGYHHQNEQGSGYAADLKSTQNAIDEITNKVNSVIEKMNTQFTAVGKEFNHLEKRIENLNKKVDDGFLDIWTYNAELLVLLENERTLDYHDSNVKNLYEKVRSQLKNNAKEIGNGCFEFYHKCDNTCMESVKNGTYDYPKYSEEAKLNREEIDGVKLESTRIYQILAIYSTVASSLVLVVSLGAISFWMCSNGSLQCRICI'
north_h1n1 = pd.read_csv(OUT_DIR + 'north_h1n1.csv')
# Second-cluster prediction row(s) naming the Hong Kong strain; take the first match
hongkong = north_h1n1[north_h1n1['name_1']=='A/Hong_Kong/H090-751-V3']
hongkong_ha_seq = hongkong['ha_seq_1'].values[0]
# Cut BOTH sequences to the shared HA_TRUNC length: the Hamming distance needs
# equal-length inputs, and the bare 565 used here before duplicated HA_TRUNC.
distance(pdm09_ha_seq[:HA_TRUNC], hongkong_ha_seq[:HA_TRUNC])

5

## Single-Cluster Predictions vs. WHO

In [3]:
def _mean_hamming(sequences, reference, trunc):
    """Return the mean Hamming distance between ``reference`` and each usable sequence.

    Args:
        sequences: iterable of population sequences (one column of a season's data).
        reference: recommendation sequence, already cut to ``trunc`` characters.
        trunc: number of leading positions that are compared.

    Returns:
        The average distance over usable sequences; NaN when no sequence is
        usable (this used to raise ZeroDivisionError).
    """
    total = 0
    n_used = 0
    for seq in sequences:
        # Empty CSV cells are read as NaN (a float) and sequences shorter than
        # ``trunc`` cannot be compared over the full length, so neither counts.
        if not isinstance(seq, str) or len(seq) < trunc:
            continue
        total += distance(reference, seq[:trunc])
        n_used += 1
    return total / n_used if n_used else np.nan


for FILE in FILES:
    # Prepare prediction DataFrame
    df = pd.read_csv(PRED_DIR + FILE + '_predictions_single_cluster.csv', converters={'season': str})
    # WHO recommendation name and both sequences live in the same file: read it once.
    # The columns are attached by row position/index, exactly as before.
    who_df = pd.read_csv(WHO_DIR + FILE + '.csv')
    df['name_who'] = who_df['who_recommendation_name']
    df['ha_seq_who'] = who_df['ha_sequence']
    df['na_seq_who'] = who_df['na_sequence']

    # Compute errors (one entry per row of df, in row order)
    enet_errors_ha = []
    enet_errors_na = []
    who_errors_ha = []
    who_errors_na = []

    for season in df['season'].values:
        season_str = str(season)
        if len(season_str) == 1:
            season_str = '0' + season_str

        # Read all sequences
        DATA_DIR = 'data/merged/' + FILE + '/' + FILE + '_' + season_str + '.csv'
        if not os.path.isfile(DATA_DIR):
            # No population data for this season: keep the -1 placeholder that
            # the output CSVs have always used.
            enet_errors_ha.append(-1)
            enet_errors_na.append(-1)
            who_errors_ha.append(-1)
            who_errors_na.append(-1)
            continue
        seq_df = pd.read_csv(DATA_DIR)

        # Access Enet and WHO recommendations (first row matching the season),
        # looked up once instead of once per column
        row = df[df['season'] == season].iloc[0]
        enet_ha_seq = row['ha_seq'][:HA_TRUNC]
        enet_na_seq = row['na_seq'][:NA_TRUNC]
        who_ha_seq = row['ha_seq_who'][:HA_TRUNC]
        who_na_seq = row['na_seq_who'][:NA_TRUNC]

        # Average distance from each recommendation to the season's sequences
        enet_errors_ha.append(_mean_hamming(seq_df['sequence'], enet_ha_seq, HA_TRUNC))
        enet_errors_na.append(_mean_hamming(seq_df['sequence_na'], enet_na_seq, NA_TRUNC))
        who_errors_ha.append(_mean_hamming(seq_df['sequence'], who_ha_seq, HA_TRUNC))
        who_errors_na.append(_mean_hamming(seq_df['sequence_na'], who_na_seq, NA_TRUNC))

    # Add to dataframe
    df['ha_who_error'] = who_errors_ha
    df['na_who_error'] = who_errors_na
    df['ha_enet_error'] = enet_errors_ha
    df['na_enet_error'] = enet_errors_na
    df.to_csv(OUT_DIR + FILE + '_single_cluster.csv', index=False)
    

# Error differences between Enet and WHO
# Positive means we are better, negative means WHO better
def _who_minus_enet_single(results, gene):
    """Return WHO error minus Enet error for ``gene`` ('ha' or 'na'), NaN where there is no data.

    The per-season result files use -1 as a placeholder when a season's data
    file was missing. Subtracting two placeholders gives 0, which would be
    counted as a real "no improvement" season in the averages below, so those
    rows are masked out instead.
    """
    who = results[gene + '_who_error']
    enet = results[gene + '_enet_error']
    return (who - enet).mask((who < 0) | (enet < 0))


north_errors_df = pd.DataFrame({'north_season':NORTH_YEARS})
for FILE in FILES[:2]:
    df = pd.read_csv(OUT_DIR + FILE + '_single_cluster.csv')
    north_errors_df[FILE + '_ha'] = _who_minus_enet_single(df, 'ha')
    north_errors_df[FILE + '_na'] = _who_minus_enet_single(df, 'na')

south_errors_df = pd.DataFrame({'south_season':SOUTH_YEARS})
for FILE in FILES[2:]:
    df = pd.read_csv(OUT_DIR + FILE + '_single_cluster.csv')
    south_errors_df[FILE + '_ha'] = _who_minus_enet_single(df, 'ha')
    south_errors_df[FILE + '_na'] = _who_minus_enet_single(df, 'na')

# Side-by-side table: north and south rows are paired by row position
errors_df = north_errors_df.join(south_errors_df, how='outer')
errors_df.to_csv(OUT_DIR + 'errors_difference_single_cluster.csv', index=False)
print('Average Improvement')
# Series.mean() skips NaN, so seasons without data do not dilute the average
print(f'\tNorth H1N1 HA: {errors_df["north_h1n1_ha"].mean():.3f}')
print(f'\tNorth H3N2 HA: {errors_df["north_h3n2_ha"].mean():.3f}')
print(f'\tSouth H1N1 HA: {errors_df["south_h1n1_ha"].mean():.3f}')
print(f'\tSouth H3N2 HA: {errors_df["south_h3n2_ha"].mean():.3f}')
errors_df[['north_season', 'north_h1n1_ha', 'north_h3n2_ha', 
           'south_season', 'south_h1n1_ha', 'south_h3n2_ha']].round(2)

Average Improvement
	North H1N1 HA: 3.380
	North H3N2 HA: 3.155
	South H1N1 HA: 3.784
	South H3N2 HA: 1.911


Unnamed: 0,north_season,north_h1n1_ha,north_h3n2_ha,south_season,south_h1n1_ha,south_h3n2_ha
0,03_04,-3.0,22.67,3,4.31,8.04
1,04_05,-2.29,2.94,4,0.5,2.05
2,05_06,4.18,7.82,5,2.94,2.63
3,06_07,4.05,2.36,6,2.35,8.96
4,07_08,-3.02,7.84,7,6.65,3.66
5,08_09,0.4,2.59,8,-1.63,-0.88
6,09_10,0.0,1.96,9,-0.07,2.01
7,10_11,2.79,3.53,10,3.71,0.83
8,11_12,5.17,2.72,11,3.75,3.43
9,12_13,7.28,-2.2,12,5.72,3.26


In [5]:
# Reformat for plotdata in paper: same per-file single-cluster results, with the
# two HA error columns renamed, written next to the other plot tables
for FILE in FILES:
    plot_df = pd.read_csv(OUT_DIR + FILE + '_single_cluster.csv').rename(columns={
        'ha_who_error': 'ldistance_WHO',
        'ha_enet_error': 'ldistance_Qnet_recommendation',
    })
    plot_df.to_csv(TAB_DIR + FILE + '_single.csv', index=False)

## Comparison to Huddleston et al. 2020

https://elifesciences.org/articles/60067#fig11s1

Interesting points:
- They take the average of the 12 points (validation and test, not just test)
- They use a combination of north and south timepoints (mostly South)
- For WHO sequences used across multiple years, they only evaluate the earliest one
    - Ex. A/California/7/2004 is used for the vaccine for north 2005-06 and south 2006, so the timepoint they chose was north 2005-06 since it was earlier
    - Ex. A/Perth/16/2009 is used for south 2010, north 2010-11, south 2011, north 2011-12, south 2012, so the timepoint they chose was south 2010 since it is the earliest
- We do best when using multicluster predictions, then making a single prediction using the cluster counts (rather than areas) of the two largest clusters

The averages below match the paper.

In [2]:
huddleston = pd.read_csv('data/huddleston.csv')
# Print the mean relative distance to the future for four consecutive 12-row
# slices of the table (rows 12-23, 24-35, 36-47, 48-59), each under its label
for label, first_row in (('Best', 12), ('Naive', 24), ('LBI', 36), ('HI', 48)):
    slice_mean = huddleston[first_row:first_row + 12]['relative_distance_to_future'].mean()
    print(f'{label}: {slice_mean:.3f}')
# Show the first 12 rows
huddleston[:12]

Best: -3.825
Naive: -2.193
LBI: -1.927
HI: -2.333


Unnamed: 0,strain_type,strain,abbreviation,timepoint,distance_to_future,relative_distance_to_future
0,vaccine,A/Fujian/411/2002,FU02,2003-10-01,9.44,
1,vaccine,A/Wellington/1/2004,WE04,2004-10-01,6.08,
2,vaccine,A/California/7/2004,CA04,2005-04-01,7.5,
3,vaccine,A/Wisconsin/67/2005,WI05,2006-04-01,10.3,
4,vaccine,A/Brisbane/10/2007,BR07,2007-10-01,5.78,
5,vaccine,A/Perth/16/2009,PE09,2009-10-01,7.95,
6,vaccine,A/Victoria/361/2011,VI11,2012-04-01,6.27,
7,vaccine,A/Texas/50/2012,TX12,2013-10-01,8.65,
8,vaccine,A/Switzerland/9715293/2013,SW13,2014-10-01,10.82,
9,vaccine,A/HongKong/4801/2014,HK14,2015-10-01,6.09,


In [54]:
# Pick rows of the single-cluster H3N2 results by position.
# NOTE(review): presumably these are the seasons matching the Huddleston et al.
# timepoints; verify the positions against the CSVs if the season list changes.
result_columns = ['season','name','name_who','ha_who_error','ha_enet_error']
south = pd.read_csv(OUT_DIR + 'south_h3n2_single_cluster.csv', converters={'season': str})[result_columns].iloc[[1,2,5,7,11,12,13,15,16]]
north = pd.read_csv(OUT_DIR + 'north_h3n2_single_cluster.csv', converters={'season': str})[result_columns].iloc[[2,3,10]]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
enet = pd.concat([south, north])

# Interleave the north rows (positions 10-12) among the south rows (positions 1-9)
new_order = np.array([1, 2, 10, 11, 3, 4, 12, 5, 6, 7, 8, 9])
enet = enet.iloc[new_order - 1].reset_index(drop=True)
enet['enet_improvement'] = enet['ha_who_error'] - enet['ha_enet_error']

# Make table for paper
# All Huddleston columns are attached by position (rows 0-11 for the timepoints,
# rows 36-47 and 48-59 for the two models being compared)
enet['Timepoint'] = huddleston[:12]['timepoint'].values
enet['Mutational Load + LBI Distance to Future'] = huddleston[36:48]['distance_to_future'].values
enet['Mutational Load + LBI Improvement'] = -huddleston[36:48]['relative_distance_to_future'].values
enet['HI + Mutational Load Distance to Future'] = huddleston[48:60]['distance_to_future'].values
enet['HI + Mutational Load Improvement'] = -huddleston[48:60]['relative_distance_to_future'].values
enet = enet.rename(columns={'name_who':'WHO Recommendation', 
                            'name':'Enet Recommendation',
                            'ha_enet_error':'Emergenet Distance to Future',
                            'enet_improvement':'Emergenet Improvement'})
enet = enet[['Timepoint', 'WHO Recommendation',
             'Emergenet Distance to Future', 'Emergenet Improvement',
             'Mutational Load + LBI Distance to Future', 'Mutational Load + LBI Improvement', 
             'HI + Mutational Load Distance to Future', 'HI + Mutational Load Improvement']].round(2)
# Append a final row holding the column means. numeric_only=True is required
# because 'Timepoint' and 'WHO Recommendation' hold strings; those two cells are
# left empty (NaN) in the mean row, as before.
mean_row = enet.mean(numeric_only=True).to_frame().T
enet = pd.concat([enet, mean_row], ignore_index=True).round(2)
enet.to_csv(OUT_DIR+'huddleston_comparison.csv', index=False)
enet = enet.set_index('Timepoint')
textable(enet, tabname = 'tables/huddleston_comparison.tex', FORMAT='%1.2f')
enet

Unnamed: 0_level_0,WHO Recommendation,Emergenet Distance to Future,Emergenet Improvement,Mutational Load + LBI Distance to Future,Mutational Load + LBI Improvement,HI + Mutational Load Distance to Future,HI + Mutational Load Improvement
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-10-01,A/Fujian/411/2002,7.28,2.05,8.44,1.01,6.5,2.95
2004-10-01,A/Wellington/1/2004,4.06,2.63,4.38,1.7,4.38,1.7
2005-04-01,A/California/7/2004,5.18,7.82,4.6,2.89,4.6,2.89
2006-04-01,A/Wisconsin/67/2005,8.54,2.36,5.36,4.94,5.37,4.94
2007-10-01,A/Brisbane/10/2007,5.78,-0.88,3.78,2.0,3.78,2.0
2009-10-01,A/Perth/16/2009,8.03,0.83,6.95,1.0,7.93,0.02
2012-04-01,A/Victoria/361/2011,4.0,4.79,3.72,2.54,7.02,-0.75
2013-10-01,A/Texas/50/2012,6.66,2.1,6.54,2.11,6.89,1.76
2014-10-01,A/Switzerland/9715293/2013,9.85,0.87,3.88,6.94,3.88,6.94
2015-10-01,A/Hong Kong/4801/2014,7.64,-0.74,7.33,-1.24,6.09,0.0
