# Enet Predictions vs. WHO Predictions
- Compare Enet predictions and WHO predictions
- For each season, take the average Levenshtein distance between the predictions and dominant strains
- Truncate sequence to HA 565, NA 468
- WHO predictions from [WHO vaccine recommendations](https://www.who.int/teams/global-influenza-programme/vaccines/who-recommendations/recommendations-for-influenza-vaccine-composition-archive)

In [2]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from Levenshtein import distance
pd.set_option('display.max_columns', None)

In [3]:
WHO_DIR = 'raw_data/who/'
PRED_DIR = 'results/enet_predictions/'
DOM_DIR = 'dominant_sequences/results/'
OUT_DIR = 'results/enet_who_comparison/'

FILES = ['north_h1n1', 'north_h3n2', 'south_h1n1', 'south_h3n2']

NORTH_YEARS = []
for i in np.arange(3, 24):
    YEAR = ''
    if i < 10:
        YEAR += '0' + str(i)
    else:
        YEAR += (str(i))
    if i + 1 < 10:
        YEAR += '_0' + str(i + 1)
    else:
        YEAR += '_' + str(i + 1)
    NORTH_YEARS.append(YEAR)
        
SOUTH_YEARS = []
for i in np.arange(3, 24):
    if i < 10:
        SOUTH_YEARS.append('0' + str(i))
    else:
        SOUTH_YEARS.append(str(i))

NA_TRUNC = 468 # 2 less than official length of 470
HA_TRUNC = 565 # 2 less than official length of 567

## Construct DataFrame
- `season`: 03-04 through 23-24 for north, 03 through 23 for south

In [4]:
for FILE in FILES:    
    df = pd.read_csv(PRED_DIR + FILE + '_predictions.csv')
    # who recommendation name, sequence
    who_ha_df = pd.read_csv(WHO_DIR + FILE + '.csv')
    who_na_df = pd.read_csv(WHO_DIR + FILE + '.csv')
    df['name_who'] = who_ha_df['who_recommendation_name']
    df['ha_seq_who'] = who_ha_df['ha_sequence']
    df['na_seq_who'] = who_na_df['na_sequence']
    df.to_csv(OUT_DIR + FILE + '.csv', index=False)

## Compute Enet and WHO Errors
- For each season, take the best weighted average Levenshtein distance between the two predictions and the dominant strains
    - Our predictions are the Emergenet predictions from the two largest clusters
    - Dominant strains in `./dominant_sequences/results/`, recall that we clustered the population space under the Levenshtein distance, and found the Levenshtein centroid for each cluster (see `./dominant_sequences/dominant_sequences.ipynb`)
    - For each season:
        - For each dominant sequence, take the smallest Levenshtein distance between it and our two predictions
        - Multiply this by the cluster size the dominant sequence is from, and add it to the total
        - Divide the total by the total population size (sum of all cluster sizes)

In [5]:
for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    seasons = df['season'].values
    
    enet_errors_ha = []
    enet_errors_na = []
    who_errors_ha = []
    who_errors_na = []
    
    for season in seasons:
        season_str = str(season)
        if len(season_str) == 1:
            season_str = str('0' + season_str)
            
        # read dominant sequences
        DATA_DIR_HA = DOM_DIR + FILE + '_ha/' + FILE + '_ha_' + season_str + '/dom_seqs.csv'
        DATA_DIR_NA = DOM_DIR + FILE + '_na/' + FILE + '_na_' + season_str + '/dom_seqs.csv'
        if not os.path.isfile(DATA_DIR_HA):
            enet_errors_ha.append(-1)
            enet_errors_na.append(-1)
            who_errors_ha.append(-1)
            who_errors_na.append(-1)
            continue
        dom_df_ha = pd.read_csv(DATA_DIR_HA)
        dom_df_na = pd.read_csv(DATA_DIR_NA)
        
        # access enet and who recommendations
        enet_ha_seq_0 = df[df['season'] == season]['ha_seq_0'].values[0]
        enet_na_seq_0 = df[df['season'] == season]['na_seq_0'].values[0]
        enet_ha_seq_1 = df[df['season'] == season]['ha_seq_1'].values[0]
        enet_na_seq_1 = df[df['season'] == season]['na_seq_1'].values[0]

        if enet_ha_seq_0 == '-1' or enet_ha_seq_1 == '-1':
            enet_errors_ha.append(-1)
            enet_errors_na.append(-1)
            who_errors_ha.append(-1)
            who_errors_na.append(-1)
            continue

        who_ha_seq = df[df['season'] == season]['ha_seq_who'].values[0][:HA_TRUNC]
        who_na_seq = df[df['season'] == season]['na_seq_who'].values[0][:NA_TRUNC]
        
        # find average enet and who errors
        total_enet_error_ha = 0
        total_enet_error_na = 0
        total_who_error_ha = 0
        total_who_error_na = 0
        
        for i in range(len(dom_df_ha)):
            domseq = dom_df_ha['sequence'].values[i]
            cluster_size = dom_df_ha['cluster_size'].values[i]
            total_enet_error_ha += min(distance(enet_ha_seq_0, domseq), distance(enet_ha_seq_1, domseq)) * cluster_size
            total_who_error_ha += distance(who_ha_seq, domseq) * cluster_size
        for i in range(len(dom_df_na)):
            domseq = dom_df_na['sequence'].values[i]
            cluster_size = dom_df_na['cluster_size'].values[i]
            total_enet_error_na += min(distance(enet_na_seq_0, domseq), distance(enet_na_seq_1, domseq)) * cluster_size
            total_who_error_na += distance(who_na_seq, domseq) * cluster_size
        enet_errors_ha.append(total_enet_error_ha/sum(dom_df_ha['cluster_size']))
        enet_errors_na.append(total_enet_error_na/sum(dom_df_na['cluster_size']))
        who_errors_ha.append(total_who_error_ha/sum(dom_df_ha['cluster_size']))
        who_errors_na.append(total_who_error_na/sum(dom_df_na['cluster_size']))

    # add to dataframe
    df['ha_who_error'] = who_errors_ha
    df['na_who_error'] = who_errors_na
    df['ha_enet_error'] = enet_errors_ha
    df['na_enet_error'] = enet_errors_na
    df.to_csv(OUT_DIR + FILE + '.csv', index=False)

In [15]:
# reformat for plotdata in paper
for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    df = df.rename(columns={'ha_who_error':'ldistance_WHO', 'ha_enet_error':'ldistance_Qnet_recommendation'})
    df.to_csv(OUT_DIR+ 'plotdata/' + FILE + '_ha.csv', index=False)
    
for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    df = df.rename(columns={'na_who_error':'ldistance_WHO', 'na_enet_error':'ldistance_Qnet_recommendation'})
    df.to_csv(OUT_DIR + 'plotdata/' + FILE + '_na.csv', index=False)

In [6]:
north_errors_df = pd.DataFrame({'north_season':NORTH_YEARS})
for FILE in FILES[:2]:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    north_errors_df[FILE + '_ha'] = df['ha_who_error'] - df['ha_enet_error']
    north_errors_df[FILE + '_na'] = df['na_who_error'] - df['na_enet_error']
    
south_errors_df = pd.DataFrame({'south_season':SOUTH_YEARS})
for FILE in FILES[2:]:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    south_errors_df[FILE + '_ha'] = df['ha_who_error'] - df['ha_enet_error']
    south_errors_df[FILE + '_na'] = df['na_who_error'] - df['na_enet_error']
    
errors_df = north_errors_df.join(south_errors_df, how='outer')
errors_df.to_csv(OUT_DIR + 'errors_difference.csv', index=False)
errors_df.round(2)

Unnamed: 0,north_season,north_h1n1_ha,north_h1n1_na,north_h3n2_ha,north_h3n2_na,south_season,south_h1n1_ha,south_h1n1_na,south_h3n2_ha,south_h3n2_na
0,03_04,4.37,-1.0,19.63,10.24,3,4.61,-0.88,17.23,10.5
1,04_05,1.62,0.0,5.28,2.5,4,6.83,2.7,2.08,1.12
2,05_06,2.0,0.0,7.79,-1.72,5,2.0,-2.08,0.83,1.17
3,06_07,3.49,1.99,1.99,1.0,6,5.8,2.44,10.57,2.99
4,07_08,6.24,8.84,3.77,-1.0,7,4.32,0.61,5.63,0.58
5,08_09,1.02,-0.95,1.0,0.06,8,-0.43,14.66,1.03,0.03
6,09_10,1.06,1.06,1.0,1.57,9,1.0,-0.17,1.0,1.98
7,10_11,2.3,2.24,0.04,0.0,10,4.05,2.86,0.0,0.0
8,11_12,3.99,1.0,5.99,1.84,11,2.99,1.99,2.79,0.0
9,12_13,6.98,3.31,1.4,0.48,12,5.64,1.99,3.05,2.71


## What if we used a random strain from that season instead of our predicted strain?
- Select two random strains from each season to be our "predictions"
- Perform the same analysis as before with these random strains
- Repeat n times and take the 95th percentile of errors among these n repetitions

In [18]:
for FILE in FILES:    
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    df.to_csv(OUT_DIR + FILE + '_random.csv', index=False)
    
NORTH_YEARS_RANDOM = []
for i in np.arange(2, 23):
    YEAR = ''
    if i < 10:
        YEAR += '0' + str(i)
    else:
        YEAR += (str(i))
    if i + 1 < 10:
        YEAR += '_0' + str(i + 1)
    else:
        YEAR += '_' + str(i + 1)
    NORTH_YEARS_RANDOM.append(YEAR)
        
SOUTH_YEARS_RANDOM = []
for i in np.arange(2, 23):
    if i < 10:
        SOUTH_YEARS_RANDOM.append('0' + str(i))
    else:
        SOUTH_YEARS_RANDOM.append(str(i))

In [19]:
for FILE in FILES:
    YEARS = NORTH_YEARS_RANDOM
    if FILE[:5] == 'south':
        YEARS = SOUTH_YEARS_RANDOM
    
    random_errors_ha = [[] for x in range(21)]
    random_errors_na = [[] for x in range(21)]
    
    n = 20
    for state in range(n):
        random_seqs = []
        for YEAR in YEARS:
            population_df = pd.read_csv('raw_data/merged/' + FILE + '/' + FILE + '_' + YEAR + '.csv')
            population_df = population_df.drop_duplicates(subset=['sequence'])
            random_seqs.append(population_df.sample(2, random_state=state))

        df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
        seasons = df['season'].values  
        for i in range(len(seasons)):
            season_str = str(seasons[i])
            if len(season_str) == 1:
                season_str = str('0' + season_str)
            # read dominant sequences
            DATA_DIR_HA = DOM_DIR + FILE + '_ha/' + FILE + '_ha_' + season_str + '/dom_seqs.csv'
            DATA_DIR_NA = DOM_DIR + FILE + '_na/' + FILE + '_na_' + season_str + '/dom_seqs.csv'
            if not os.path.isfile(DATA_DIR_HA):
                random_errors_ha[i] = -1
                random_errors_na[i] = -1
                continue
            dom_df_ha = pd.read_csv(DATA_DIR_HA)
            dom_df_na = pd.read_csv(DATA_DIR_NA)

            # random seq
            random_seq_ha_0 = random_seqs[i]['sequence'].values[0]
            random_seq_na_0 = random_seqs[i]['sequence_na'].values[0]
            random_seq_ha_1 = random_seqs[i]['sequence'].values[1]
            random_seq_na_1 = random_seqs[i]['sequence_na'].values[1]

            # find average random errors
            total_random_error_ha = 0
            total_random_error_na = 0
            for j in range(len(dom_df_ha)):
                domseq = dom_df_ha['sequence'].values[j]
                cluster_size = dom_df_ha['cluster_size'].values[j]
                total_random_error_ha += min(distance(random_seq_ha_0, domseq), distance(random_seq_ha_1, domseq)) * cluster_size
            for j in range(len(dom_df_na)):
                domseq = dom_df_na['sequence'].values[j]
                cluster_size = dom_df_na['cluster_size'].values[j]
                total_random_error_na += min(distance(random_seq_na_0, domseq), distance(random_seq_na_1, domseq)) * cluster_size
            random_errors_ha[i].append(total_random_error_ha/sum(dom_df_ha['cluster_size']))
            random_errors_na[i].append(total_random_error_na/sum(dom_df_na['cluster_size']))

    # mean of the error
    df['ha_random_error'] = [np.mean(random_errors_ha[i]) for i in range(21)]
    df['na_random_error'] = [np.mean(random_errors_na[i]) for i in range(21)]
    # variance of the error
    df['ha_random_error_var'] = [np.var(random_errors_ha[i]) for i in range(21)]
    df['na_random_error_var'] = [np.var(random_errors_na[i]) for i in range(21)]
    # upper bound of the 95% confidence interval of the error
    df['ha_random_error_95'] = [np.mean(random_errors_ha[i]) + 1.96*np.std(random_errors_ha[i])/np.sqrt(n) for i in range(21)]
    df['na_random_error_95'] = [np.mean(random_errors_na[i]) + 1.96*np.std(random_errors_na[i])/np.sqrt(n) for i in range(21)]
    df.to_csv(OUT_DIR + FILE + '_random.csv', index=False)

In [20]:
north_errors_df = pd.DataFrame({'north_season':NORTH_YEARS})
for FILE in FILES[:2]:
    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    north_errors_df[FILE + '_ha'] = df['ha_random_error_95'] - df['ha_enet_error']
    north_errors_df[FILE + '_na'] = df['na_random_error_95'] - df['na_enet_error']
    
south_errors_df = pd.DataFrame({'south_season':SOUTH_YEARS})
for FILE in FILES[2:]:
    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    south_errors_df[FILE + '_ha'] = df['ha_random_error_95'] - df['ha_enet_error']
    south_errors_df[FILE + '_na'] = df['na_random_error_95'] - df['ha_enet_error']
    
errors_df = north_errors_df.join(south_errors_df, how='outer')
errors_df.to_csv(OUT_DIR + 'errors_difference_random.csv', index=False)
errors_df.round(2)

Unnamed: 0,north_season,north_h1n1_ha,north_h1n1_na,north_h3n2_ha,north_h3n2_na,south_season,south_h1n1_ha,south_h1n1_na,south_h3n2_ha,south_h3n2_na
0,03_04,1.45,-0.27,9.97,3.75,3,4.31,3.94,8.99,-0.37
1,04_05,-1.88,0.0,4.03,8.06,4,3.81,4.35,0.8,2.95
2,05_06,-1.05,-0.92,1.73,-3.47,5,-0.35,0.44,3.81,5.98
3,06_07,0.46,0.76,0.61,0.86,6,2.68,1.65,2.71,0.78
4,07_08,5.16,5.64,0.21,-2.68,7,2.17,1.57,3.21,4.42
5,08_09,2.45,0.47,2.29,1.63,8,1.89,4.21,4.05,4.35
6,09_10,5.66,5.18,1.02,0.35,9,1.5,-24.03,2.39,-1.51
7,10_11,0.19,0.57,0.84,1.21,10,36.54,26.85,0.89,-0.68
8,11_12,1.68,-1.02,5.35,0.43,11,0.7,-1.75,1.71,-1.25
9,12_13,2.96,1.97,2.65,-0.4,12,2.08,-2.59,2.53,-3.66


In [26]:
print('LAST 2 DECADES')
for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    df['diff_ha'] = df['ha_random_error_95'] - df['ha_enet_error']
    df['diff_na'] = df['na_random_error_95'] - df['na_enet_error']
    print(FILE)
    print('    Avg. Improvement HA', round(sum(df['diff_ha'][:20])/20,2))
    print('    % Improvement HA:', round(100*(sum(df['ha_random_error_95'][:20])/sum(df['ha_enet_error'][:20])-1),2))
    print('    Avg. Improvement NA', round(sum(df['diff_na'][:20])/20,2))
    print('    % Improvement NA:', round(100*(sum(df['na_random_error_95'][:20])/sum(df['na_enet_error'][:20])-1),2))

LAST 2 DECADES
north_h1n1
    Avg. Improvement HA 2.28
    % Improvement HA: 23.37
    Avg. Improvement NA 1.15
    % Improvement NA: 15.08
north_h3n2
    Avg. Improvement HA 3.29
    % Improvement HA: 59.56
    Avg. Improvement NA 1.03
    % Improvement NA: 29.03
south_h1n1
    Avg. Improvement HA 4.12
    % Improvement HA: 47.36
    Avg. Improvement NA 2.36
    % Improvement NA: 31.54
south_h3n2
    Avg. Improvement HA 3.63
    % Improvement HA: 75.14
    Avg. Improvement NA 1.6
    % Improvement NA: 44.8


In [27]:
print('LAST DECADE')
for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    df['diff_ha'] = df['ha_random_error_95'] - df['ha_enet_error']
    df['diff_na'] = df['na_random_error_95'] - df['na_enet_error']
    print(FILE)
    print('    Avg. Improvement HA', round(sum(df['diff_ha'][10:20])/10,2))
    print('    % Improvement HA:', round(100*(sum(df['ha_random_error_95'][10:20])/sum(df['ha_enet_error'][10:20])-1),2))
    print('    Avg. Improvement NA', round(sum(df['diff_na'][10:20])/10,2))
    print('    % Improvement NA:', round(100*(sum(df['na_random_error_95'][10:20])/sum(df['na_enet_error'][10:20])-1),2))

LAST DECADE
north_h1n1
    Avg. Improvement HA 2.85
    % Improvement HA: 68.89
    Avg. Improvement NA 1.07
    % Improvement NA: 29.26
north_h3n2
    Avg. Improvement HA 3.71
    % Improvement HA: 58.01
    Avg. Improvement NA 1.09
    % Improvement NA: 32.05
south_h1n1
    Avg. Improvement HA 2.7
    % Improvement HA: 67.23
    Avg. Improvement NA 1.13
    % Improvement NA: 30.4
south_h3n2
    Avg. Improvement HA 4.14
    % Improvement HA: 80.27
    Avg. Improvement NA 1.25
    % Improvement NA: 36.0
