# Enet Predictions vs. WHO Predictions
- Compare Enet predictions and WHO predictions
- For each season, take the average Levenshtein distance between the prediction and the top 10 dominant strains
- Truncate every sequence before comparison: HA to its first 565 positions, NA to its first 468
- WHO predictions from [WHO vaccine recommendations](https://www.who.int/teams/global-influenza-programme/vaccines/who-recommendations/recommendations-for-influenza-vaccine-composition-archive)

In [1]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from Levenshtein import distance
pd.set_option('display.max_columns', None)

In [2]:
# Input and output locations, relative to the working directory.
WHO_DIR = 'raw_data/who/'
PRED_DIR = 'results/enet_predictions/'
DOM_DIR = 'results/dominant_sequences/'
OUT_DIR = 'results/enet_who_comparison/'

# File-name stems, one per hemisphere/subtype combination.
FILES = ['north_h1n1', 'north_h3n2', 'south_h1n1', 'south_h3n2']

# Northern seasons are labelled by two consecutive zero-padded years:
# '02_03', '03_04', ..., '22_23'.
NORTH_YEARS = [
    str(year).zfill(2) + '_' + str(year + 1).zfill(2)
    for year in range(2, 23)
]

# Southern seasons are labelled by one zero-padded year: '03', ..., '23'.
SOUTH_YEARS = [str(year).zfill(2) for year in range(3, 24)]

# Sequences are cut to these lengths before any distance is computed.
NA_TRUNC = 468 # 2 less than official length of 470
HA_TRUNC = 565 # 2 less than official length of 567

## Construct DataFrame
- `season`: `02_03` through `22_23` for north, `03` through `23` for south

In [3]:
# For every hemisphere/subtype, write a table holding the Enet predictions,
# the WHO recommendation, and error columns initialised to the sentinel -1.
for FILE in FILES:
    table = pd.read_csv(PRED_DIR + FILE + '_predictions.csv')
    who_ha = pd.read_csv(WHO_DIR + FILE + '_ha.csv')
    who_na = pd.read_csv(WHO_DIR + FILE + '_na.csv')

    # WHO recommendation name and sequences; pandas aligns them on row index.
    table['name_who'] = who_ha['who_recommendation_name']
    table['ha_seq_who'] = who_ha['who_recommendation_sequence']
    table['na_seq_who'] = who_na['who_recommendation_sequence']

    # Placeholder error columns: HA/NA per Enet cluster, then the WHO pair.
    placeholders = []
    for cluster in range(3):
        placeholders.append('ha_enet_error_' + str(cluster))
        placeholders.append('na_enet_error_' + str(cluster))
    placeholders.extend(['ha_who_error', 'na_who_error'])
    for column in placeholders:
        table[column] = -np.ones(len(table))

    table.to_csv(OUT_DIR + FILE + '.csv', index=False)

## Compute Enet and WHO Errors
- For each season, take the average Levenshtein distance between the prediction and the top 10 dominant strains
    - Dominant strains in `results/dominant_sequences`

In [4]:
def _mean_distance(candidate, references, trunc):
    """Return the mean Levenshtein distance from `candidate` to `references`.

    Every sequence is cut to its first `trunc` characters before comparison.
    The total is divided by the number of references actually supplied, so a
    season with fewer dominant strains than requested is not under-weighted.
    Returns the sentinel -1 when `references` is empty.
    """
    if len(references) == 0:
        return -1
    candidate = candidate[:trunc]
    total = sum(distance(candidate, ref[:trunc]) for ref in references)
    return total / len(references)


# Number of dominant strains each prediction is scored against.
N_TOP_DOMINANT = 10
# Enet predictions scored per season (columns ha_seq_0..2 and na_seq_0..2).
N_CLUSTERS = 3

for FILE in FILES:
    df = pd.read_csv(OUT_DIR + FILE + '.csv')

    # One list of per-season values for every error column being filled in.
    errors = {'ha_who_error': [], 'na_who_error': []}
    for i in range(N_CLUSTERS):
        errors['ha_enet_error_' + str(i)] = []
        errors['na_enet_error_' + str(i)] = []

    for season in df['season'].values:
        # Dominant-sequence files use a two-character season, e.g. 3 -> '03'.
        season_str = str(season).zfill(2)
        DATA_DIR_HA = DOM_DIR + FILE + '_ha/' + FILE + '_ha_' + season_str + '.csv'
        DATA_DIR_NA = DOM_DIR + FILE + '_na/' + FILE + '_na_' + season_str + '.csv'

        # Without both dominant-sequence files there is nothing to score
        # against: record the sentinel -1 in every error column.
        if not (os.path.isfile(DATA_DIR_HA) and os.path.isfile(DATA_DIR_NA)):
            for values in errors.values():
                values.append(-1)
            continue

        # Each file is read once per season rather than once per cluster.
        top_dominant_seqs_ha = pd.read_csv(DATA_DIR_HA, index_col=0)['sequence'].values[:N_TOP_DOMINANT]
        top_dominant_seqs_na = pd.read_csv(DATA_DIR_NA, index_col=0)['sequence'].values[:N_TOP_DOMINANT]

        # First row for this season holds both the Enet and WHO sequences.
        row = df[df['season'] == season].iloc[0]

        # WHO errors do not depend on the Enet cluster, so compute them once.
        errors['ha_who_error'].append(
            _mean_distance(row['ha_seq_who'], top_dominant_seqs_ha, HA_TRUNC))
        errors['na_who_error'].append(
            _mean_distance(row['na_seq_who'], top_dominant_seqs_na, NA_TRUNC))

        for i in range(N_CLUSTERS):
            errors['ha_enet_error_' + str(i)].append(
                _mean_distance(row['ha_seq_' + str(i)], top_dominant_seqs_ha, HA_TRUNC))
            errors['na_enet_error_' + str(i)].append(
                _mean_distance(row['na_seq_' + str(i)], top_dominant_seqs_na, NA_TRUNC))

    # add to dataframe
    for column, values in errors.items():
        df[column] = values

    # The "best" Enet error is the lower of clusters 0 and 1; cluster 2 is
    # scored above but does not take part in this minimum.
    df['best_ha_enet_error'] = df[['ha_enet_error_0','ha_enet_error_1']].min(axis=1)
    df['best_na_enet_error'] = df[['na_enet_error_0','na_enet_error_1']].min(axis=1)
    df.to_csv(OUT_DIR + FILE + '.csv', index=False)

In [5]:
# Per-season difference "WHO error minus best Enet error" for HA and NA.
# A positive entry means the WHO error was the larger of the two.
# The best Enet error is the minimum over clusters 0 and 1 (presumably the
# two largest clusters - confirm against the prediction step).
hemisphere_frames = []
for season_column, season_labels, stems in (
    ('north_season', NORTH_YEARS, FILES[:2]),
    ('south_season', SOUTH_YEARS, FILES[2:]),
):
    frame = pd.DataFrame({season_column: season_labels})
    for stem in stems:
        scores = pd.read_csv(OUT_DIR + stem + '.csv')
        for gene in ('ha', 'na'):
            gap = scores[gene + '_who_error'] - scores['best_' + gene + '_enet_error']
            # Assign as a plain list so rows are matched by position and a
            # length mismatch with the season labels raises.
            frame[stem + '_' + gene] = list(gap)
    hemisphere_frames.append(frame)

north_errors_df, south_errors_df = hemisphere_frames

# Place the two hemispheres side by side, matched on row position.
errors_df = north_errors_df.join(south_errors_df, how='outer')
errors_df.to_csv(OUT_DIR + 'errors_difference.csv', index=False)
errors_df

Unnamed: 0,north_season,north_h1n1_ha,north_h1n1_na,north_h3n2_ha,north_h3n2_na,south_season,south_h1n1_ha,south_h1n1_na,south_h3n2_ha,south_h3n2_na
0,02_03,3.0,1.0,17.0,13.0,3,3.0,1.0,23.0,9.0
1,03_04,4.0,-1.0,20.0,10.0,4,4.3,1.0,3.0,1.0
2,04_05,5.0,1.0,3.0,2.0,5,5.4,2.0,0.0,2.0
3,05_06,4.6,3.0,5.0,0.0,6,7.4,5.0,5.8,0.5
4,06_07,4.0,3.0,-1.0,-2.0,7,8.0,2.0,4.0,6.0
5,07_08,-4.0,2.0,3.0,7.0,8,7.0,24.0,1.0,-1.0
6,08_09,-2.0,0.0,3.0,1.0,9,2.0,0.0,2.0,0.0
7,09_10,118.0,89.0,2.0,2.0,10,1.0,2.0,3.0,0.0
8,10_11,4.0,2.0,5.0,2.0,11,3.0,1.0,-2.0,3.0
9,11_12,7.0,4.0,1.0,1.0,12,1.8,1.0,4.0,5.0


## What if we used a random strain from the preceding season instead of our predicted strain?

In [42]:
# For every hemisphere/subtype, write a '<stem>_random.csv' table holding the
# season, the WHO HA recommendation, and error columns initialised to -1.
for FILE in FILES:
    seasons_only = pd.read_csv(PRED_DIR + FILE + '_predictions.csv')[['season']]
    who_ha = pd.read_csv(WHO_DIR + FILE + '_ha.csv')
    unset = -np.ones(len(seasons_only))
    random_table = seasons_only.assign(
        # WHO recommendation name, sequence
        name_who=who_ha['who_recommendation_name'],
        ha_seq_who=who_ha['who_recommendation_sequence'],
        # error columns
        ha_random_error=unset,
        ha_who_error=unset,
    )
    random_table.to_csv(OUT_DIR + FILE + '_random.csv', index=False)
    
# Season labels of the population files that random strains are drawn from.
# Both lists start one season earlier than NORTH_YEARS / SOUTH_YEARS.

# '01_02', '02_03', ..., '21_22'
NORTH_YEARS_RANDOM = [
    str(year).zfill(2) + '_' + str(year + 1).zfill(2)
    for year in range(1, 22)
]

# '02', '03', ..., '22'
SOUTH_YEARS_RANDOM = [str(year).zfill(2) for year in range(2, 23)]

In [54]:
def _mean_distance(candidate, references, trunc):
    """Return the mean Levenshtein distance from `candidate` to `references`.

    Every sequence is cut to its first `trunc` characters before comparison.
    The total is divided by the number of references actually supplied.
    Returns the sentinel -1 when `references` is empty.
    (Defined in this cell so the cell can be run on its own.)
    """
    if len(references) == 0:
        return -1
    candidate = candidate[:trunc]
    total = sum(distance(candidate, ref[:trunc]) for ref in references)
    return total / len(references)


# Number of independent random draws; draw n uses random_state=n.
N_DRAWS = 30
# Number of dominant strains each candidate is scored against.
N_TOP_DOMINANT = 10

for FILE in FILES:
    YEARS = SOUTH_YEARS_RANDOM if FILE[:5] == 'south' else NORTH_YEARS_RANDOM

    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    seasons = df['season'].values

    # Population files do not change between draws, so load each one once.
    populations = [
        pd.read_csv('raw_data/merged/' + FILE + '/' + FILE + '_ha_' + YEAR + '.csv')
        for YEAR in YEARS
    ]

    # Dominant strains and the WHO error are also independent of the draw.
    # `dominant[i]` is None when season i has no dominant-sequence file.
    dominant = []
    who_errors_ha = []
    for season in seasons:
        # Dominant-sequence files use a two-character season, e.g. 3 -> '03'.
        season_str = str(season).zfill(2)
        DATA_DIR_HA = DOM_DIR + FILE + '_ha/' + FILE + '_ha_' + season_str + '.csv'
        if not os.path.isfile(DATA_DIR_HA):
            dominant.append(None)
            who_errors_ha.append(-1)
            continue
        top_dominant_seqs_ha = pd.read_csv(DATA_DIR_HA)['sequence'].values[:N_TOP_DOMINANT]
        dominant.append(top_dominant_seqs_ha)
        who_ha_seq = df[df['season'] == season]['ha_seq_who'].values[0]
        who_errors_ha.append(_mean_distance(who_ha_seq, top_dominant_seqs_ha, HA_TRUNC))
    df['ha_who_error'] = who_errors_ha

    for n in range(N_DRAWS):
        # One random strain per population file; season i is paired with
        # the i-th entry of YEARS.
        random_seqs = [
            population.sample(1, random_state=n)['sequence'].values[0]
            for population in populations
        ]
        random_errors_ha = []
        for i, top_dominant_seqs_ha in enumerate(dominant):
            if top_dominant_seqs_ha is None:
                random_errors_ha.append(-1)
            else:
                random_errors_ha.append(
                    _mean_distance(random_seqs[i], top_dominant_seqs_ha, HA_TRUNC))
        df['ha_random_error_' + str(n)] = random_errors_ha

    # Average the per-draw errors, adding columns in draw order.
    df['avg_random_error'] = sum(
        df['ha_random_error_' + str(n)] for n in range(N_DRAWS)) / N_DRAWS
    df.to_csv(OUT_DIR + FILE + '_random.csv', index=False)

In [55]:
# Compare the averaged random-strain error with the best Enet error over the
# first 20 seasons of each hemisphere/subtype and print a short summary.
for stem in FILES[:4]:
    random_scores = pd.read_csv(OUT_DIR + stem + '_random.csv')[['season','avg_random_error']]
    enet_scores = pd.read_csv(OUT_DIR + stem + '.csv')[['season','ha_who_error','best_ha_enet_error']]
    merged = random_scores.merge(enet_scores, on='season')
    # Positive values mean the random strain had the larger error.
    merged['diff'] = merged['avg_random_error'] - merged['best_ha_enet_error']

    first_seasons = merged[:20]
    mean_gain = sum(first_seasons['diff']) / 20
    error_ratio = sum(first_seasons['avg_random_error']) / sum(first_seasons['best_ha_enet_error'])

    print(stem)
    print('    Avg. Improvement', round(mean_gain, 2))
    print('    % Improvement:', round(100 * (error_ratio - 1), 2))

north_h1n1
    Avg. Improvement 5.58
    % Improvement: 134.71
north_h3n2
    Avg. Improvement 2.55
    % Improvement: 40.73
south_h1n1
    Avg. Improvement 3.61
    % Improvement: 35.68
south_h3n2
    Avg. Improvement 3.72
    % Improvement: 79.31


## Two Random Strains per Season

In [59]:
# Number of independent random draws; draw n uses random_state=n.
m = 10
# Number of dominant strains each candidate is scored against.
N_TOP_DOMINANT = 10

# NOTE: this cell writes to the same '<stem>_random.csv' files as the
# single-strain experiment. It overwrites ha_random_error_0..m-1 and
# avg_random_error; any ha_random_error_* columns with a higher index that
# are already in the file are left as they were.
for FILE in FILES:
    YEARS = SOUTH_YEARS_RANDOM if FILE[:5] == 'south' else NORTH_YEARS_RANDOM

    df = pd.read_csv(OUT_DIR + FILE + '_random.csv')
    seasons = df['season'].values

    # Population files do not change between draws, so load each one once.
    populations = [
        pd.read_csv('raw_data/merged/' + FILE + '/' + FILE + '_ha_' + YEAR + '.csv')
        for YEAR in YEARS
    ]

    # Dominant strains (already truncated) and the WHO error are independent
    # of the draw. `dominant[i]` is None when season i has no dominant file.
    dominant = []
    who_errors_ha = []
    for season in seasons:
        # Dominant-sequence files use a two-character season, e.g. 3 -> '03'.
        season_str = str(season).zfill(2)
        DATA_DIR_HA = DOM_DIR + FILE + '_ha/' + FILE + '_ha_' + season_str + '.csv'
        if not os.path.isfile(DATA_DIR_HA):
            dominant.append(None)
            who_errors_ha.append(-1)
            continue
        top_dominant_seqs_ha = [
            domseq[:HA_TRUNC]
            for domseq in pd.read_csv(DATA_DIR_HA)['sequence'].values[:N_TOP_DOMINANT]
        ]
        dominant.append(top_dominant_seqs_ha)
        if not top_dominant_seqs_ha:
            # Empty dominant file: nothing to average over.
            who_errors_ha.append(-1)
            continue
        who_ha_seq = df[df['season'] == season]['ha_seq_who'].values[0][:HA_TRUNC]
        who_errors_ha.append(
            sum(distance(who_ha_seq, domseq) for domseq in top_dominant_seqs_ha)
            / len(top_dominant_seqs_ha))
    df['ha_who_error'] = who_errors_ha

    for n in range(m):
        # Two random strains per population file; season i is paired with
        # the i-th entry of YEARS.
        random_pairs = [
            population.sample(2, random_state=n)['sequence'].values
            for population in populations
        ]
        random_errors_ha = []
        for i, top_dominant_seqs_ha in enumerate(dominant):
            if not top_dominant_seqs_ha:
                # Missing or empty dominant file.
                random_errors_ha.append(-1)
                continue
            first = random_pairs[i][0][:HA_TRUNC]
            second = random_pairs[i][1][:HA_TRUNC]
            # For every dominant strain keep the closer of the two random
            # strains, then average over the dominant strains.
            # NOTE(review): best_ha_enet_error is instead the lower of two
            # per-cluster averages - confirm this asymmetry is intended.
            total_random_error_ha = sum(
                min(distance(first, domseq), distance(second, domseq))
                for domseq in top_dominant_seqs_ha)
            random_errors_ha.append(total_random_error_ha / len(top_dominant_seqs_ha))
        df['ha_random_error_' + str(n)] = random_errors_ha

    # Average the per-draw errors, adding columns in draw order.
    df['avg_random_error'] = sum(
        df['ha_random_error_' + str(n)] for n in range(m)) / m
    df.to_csv(OUT_DIR + FILE + '_random.csv', index=False)

In [60]:
# Compare the averaged two-random-strain error with the best Enet error over
# the first N_SEASONS seasons of each hemisphere/subtype and print a summary.
N_SEASONS = 20
for i in range(4):
    df1 = pd.read_csv(OUT_DIR + FILES[i] + '_random.csv')[['season','avg_random_error']]
    df2 = pd.read_csv(OUT_DIR + FILES[i] + '.csv')[['season','ha_who_error','best_ha_enet_error']]
    df1 = df1.merge(df2, on='season')
    # Positive values mean the random strains had the larger error.
    df1['diff'] = df1['avg_random_error'] - df1['best_ha_enet_error']
    head = df1[:N_SEASONS]
    print(FILES[i])
    print('    Avg. Improvement', round(sum(head['diff'])/N_SEASONS, 2))
    # np.std returns the (population) standard deviation, not the variance,
    # so the label says "Std." rather than "Var.".
    print('    Std. Improvement', round(np.std(head['diff']), 2))
    print('    % Improvement:', round(100 * (sum(head['avg_random_error'])/sum(head['best_ha_enet_error'])-1), 2))

north_h1n1
    Avg. Improvement 2.29
    Var. Improvement 10.69
    % Improvement: 55.28
north_h3n2
    Avg. Improvement 0.08
    Var. Improvement 3.96
    % Improvement: 1.22
south_h1n1
    Avg. Improvement 1.5
    Var. Improvement 5.84
    % Improvement: 14.84
south_h3n2
    Avg. Improvement 0.7
    Var. Improvement 4.13
    % Improvement: 14.89
