# IRAT vs. Enet - Current Score Predictions
- **Note: Enet and Qnet are interchangeable, Qnet was the old name**
- Compares risk assessment from IRAT and risk assessment using Qnet q-distance
- Collected human HA and NA strains within the past year (September 30, 2021 - September 30, 2022)
    - If not enough strains are available, take strains from the last decade; this was done for all subtypes except H1N1 and H3N2
    - If even the above is not enough, use HXN_, where X matches the strain in question and N can be anything
    - No strains available for H10N_, so exclude A/Jiangxi-Donghu/346/2013	
    - A/duck/New York/1996 excluded
- Compute the average q-distance among the strain in question and the circulating human strains for both NA and HA
    - Do this 10 times for each strain, with 100 samples from the human population each time
    - This is to compute variances
- Average the NA and HA averages (using arithmetic and geometric mean)

In [2]:
# basic imports
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import trange, tqdm

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# enet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet
from zedstat.textable import textable
from emergenet import Enet, save_model, load_model

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [40]:
# Directory layout: raw GISAID FASTA files, IRAT tables/results, trained Enets.
GISAID_PATH = 'raw_data/gisaid_current/'
IRAT_PATH = 'results/'
ENET_PATH = 'enet_models/current_enets/'

# Fixed lengths that HA / NA sequences are truncated to before any comparison.
HA_TRUNC = 550
NA_TRUNC = 449

# IRAT strain table; the cells below read its 'Virus Type', 'HA Sequence'
# and 'NA Sequence' columns.
df = pd.read_csv(os.path.join(IRAT_PATH, 'irat_data.csv'))

## Creating New Enet
- Truncate NA at 449 amino acids, HA at 550 amino acids (to prevent too many strains from being filtered out)
- Construct an Enet using these strains **if there are at least 30 strains in the population** after truncating to match the length of the IRAT strain
- Make sure to include IRAT sequence in training the Enet
- Save Enet as "VIRUS_NAME_na.joblib" or "VIRUS_NAME_ha.joblib"

In [41]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a dataframe of fixed-length sequences.

    Records whose sequence is shorter than ``trunc`` are dropped; every
    remaining sequence is cut to its first ``trunc`` characters and
    upper-cased.

    Args:
        file_name: path of the FASTA file.
        trunc: number of leading characters kept from each sequence.

    Returns:
        DataFrame with a 'name' column (the text before the first '|' of the
        record id) and a 'sequence' column (one numpy array per record).
    """
    kept = [rec for rec in SeqIO.parse(file_name, 'fasta') if len(rec.seq) >= trunc]
    names = [rec.id.split('|')[0] for rec in kept]
    sequences = [np.array(rec.seq[:trunc].upper()) for rec in kept]
    return pd.DataFrame({'name': names, 'sequence': sequences})


def sequence_array(seq_df, sample_size, IRAT_strain, random_state=42):
    """Draw a random sample of sequences and append the IRAT strain to it.

    Args:
        seq_df: dataframe with a 'sequence' column (as built by parse_fasta).
        sample_size: number of rows sampled from ``seq_df``.
        IRAT_strain: the IRAT sequence (string or array of characters); it is
            always placed in the last row of the result.
        random_state: seed passed to the pandas sampler.

    Returns:
        numpy array with ``sample_size + 1`` rows, one sequence per row.
    """
    sampled = seq_df['sequence'].sample(sample_size, random_state=random_state)
    rows = list(sampled.values)
    rows.append(np.array(list(IRAT_strain)))
    return np.array(rows)


def train_save_enet(name, seq_arr, num_nuc):
    """Fit a Qnet on ``seq_arr`` and store it under ENET_PATH.

    Args:
        name: file stem of the saved model ('.joblib' is appended).
        seq_arr: array of sequences, one per row, used for fitting.
        num_nuc: number of sequence positions; features are named
            'x0' ... 'x<num_nuc - 1>'.
    """
    feature_names = [f'x{pos}' for pos in np.arange(num_nuc)]
    model = Qnet(feature_names=feature_names, n_jobs=1)
    model.fit(seq_arr)
    target = f'{ENET_PATH}{name}.joblib'
    save_qnet(model, target)

In [42]:
# Count, for every IRAT strain, how many HA and NA sequences of its subtype
# survive truncation. H10N8 is recorded as -1 in both columns instead of
# being parsed.
ha_sample = []
na_sample = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    if VIRUS_TYPE != 'h10n8':
        ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
        na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)
        ha_count, na_count = len(ha_df), len(na_df)
    else:
        ha_count, na_count = -1, -1
    ha_sample.append(ha_count)
    na_sample.append(na_count)

df['HA Qnet Sample'] = ha_sample
df['NA Qnet Sample'] = na_sample

  0%|          | 0/23 [00:00<?, ?it/s]

In [9]:
# Train and save one HA and one NA Enet per subtype. Models are stored under
# the subtype name, so a subtype that appears in several rows is trained from
# the first eligible row only.
for i in trange(len(df)):
    # -1 marks subtypes for which no population was counted
    if df['HA Qnet Sample'].iloc[i] == -1 or df['NA Qnet Sample'].iloc[i] == -1:
        continue

    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    ha_irat_seq = df['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df['NA Sequence'].iloc[i][:NA_TRUNC]

    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue

    # Check for existing models BEFORE parsing: the FASTA files can be large
    # and there is nothing to do when both models are already on disk.
    ha_missing = not os.path.exists(ENET_PATH + VIRUS_TYPE + '_ha.joblib')
    na_missing = not os.path.exists(ENET_PATH + VIRUS_TYPE + '_na.joblib')
    if not (ha_missing or na_missing):
        continue

    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)

    # skip if less than 30 sequences available for either protein
    if len(ha_df) < 30 or len(na_df) < 30:
        continue

    # Train only what is missing, so a pair left half-finished (e.g. by an
    # interrupted run) is completed instead of being skipped forever.
    if ha_missing:
        ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
        train_save_enet(VIRUS_TYPE + '_ha', ha_arr, HA_TRUNC)
    if na_missing:
        na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
        train_save_enet(VIRUS_TYPE + '_na', na_arr, NA_TRUNC)

  0%|          | 0/23 [00:00<?, ?it/s]

## Average Qdistance
- Compute average qdistance between IRAT strain and the rest of the strains
- Do this 10 times for each strain, with 100 samples from the human population each time
- This is to compute variances

In [43]:
# For every IRAT strain, estimate its mean q-distance to the circulating human
# strains of the same subtype. Each strain gets a list of 10 estimates (one per
# random seed) so that a variance can be computed later; strains that cannot
# be scored get the sentinel -1 instead of a list.
avg_qdists_ha_10 = []
avg_qdists_na_10 = []
avg_qdists_both_10 = []
avg_qdists_geom_10 = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()

    # skip if either enet doesn't exist or either IRAT sequence is missing
    # (the NA sequence must be checked as well as the HA sequence)
    if not os.path.exists(ENET_PATH + VIRUS_TYPE + '_ha.joblib')\
    or not os.path.exists(ENET_PATH + VIRUS_TYPE + '_na.joblib')\
    or df['HA Sequence'].iloc[i] == '-1' or df['NA Sequence'].iloc[i] == '-1':
        avg_qdists_ha_10.append(-1)
        avg_qdists_na_10.append(-1)
        avg_qdists_both_10.append(-1)
        avg_qdists_geom_10.append(-1)
        continue

    # load enets
    ha_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_ha.joblib')
    na_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_na.joblib')

    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)

    avg_qdists_ha = []
    avg_qdists_na = []
    avg_qdists_both = []
    avg_qdists_geom = []
    # repeat 10 times (seeds 42..51) for variance computation
    for j in range(42, 52):
        # NOTE(review): sequence_array appends the IRAT strain itself as the
        # last row, so the IRAT-vs-IRAT distance is part of each average
        # below - confirm this is intended.
        ha_arr = sequence_array(ha_df, min(100, len(ha_df)//2), ha_irat_seq, random_state=j)
        na_arr = sequence_array(na_df, min(100, len(na_df)//2), na_irat_seq, random_state=j)
        # compute qdistance sums, leaving NaN distances out of both the sum
        # and the count
        ha_qdist_sum = 0
        na_qdist_sum = 0
        num_ha = len(ha_arr)
        num_na = len(na_arr)
        for k in range(len(ha_arr)):
            qdist = qdistance(ha_irat_seq, ha_arr[k], ha_enet, ha_enet)
            if np.isnan(qdist):
                num_ha -= 1
                continue
            ha_qdist_sum += qdist
        for k in range(len(na_arr)):
            qdist = qdistance(na_irat_seq, na_arr[k], na_enet, na_enet)
            if np.isnan(qdist):
                num_na -= 1
                continue
            na_qdist_sum += qdist
        # compute qdistance averages; cast to plain floats so the lists are
        # written to the CSV as bare numbers whatever the NumPy scalar repr is
        ha_mean = ha_qdist_sum/num_ha
        na_mean = na_qdist_sum/num_na
        avg_qdists_ha.append(float(ha_mean))
        avg_qdists_na.append(float(na_mean))
        # pooled mean over all HA and NA distances
        avg_qdists_both.append(float((ha_qdist_sum + na_qdist_sum)/(num_ha + num_na)))
        # geometric mean of the HA and NA averages
        avg_qdists_geom.append(float(np.sqrt(ha_mean * na_mean)))

    avg_qdists_ha_10.append(avg_qdists_ha)
    avg_qdists_na_10.append(avg_qdists_na)
    avg_qdists_both_10.append(avg_qdists_both)
    avg_qdists_geom_10.append(avg_qdists_geom)

df['HA Qdistance'] = avg_qdists_ha_10
df['NA Qdistance'] = avg_qdists_na_10
df['Arithmetic Mean'] = avg_qdists_both_10
df['Geometric Mean'] = avg_qdists_geom_10

# save dataframe as csv
os.makedirs('results', exist_ok=True)
df.to_csv('results/irat_average_qdistances_current.csv', index=False)

  0%|          | 0/23 [00:00<?, ?it/s]

## A/Jiangxi-Donghu/346/2013
- Compute risk score using every Enet from the other strains, and take the average among NA and HA
- Take the geometric mean of the resulting NA and HA averages

In [44]:
# Score the strain in row `n` against the Enets of every subtype listed in
# STRAINS, producing one HA / NA / pooled / geometric average per subtype.
STRAINS = ['h1n1', 'h1n2', 'h3n2', 'h5n1', 'h5n2', 'h5n6',
           'h5n8', 'h7n7', 'h7n8', 'h7n9', 'h9n2']
df_filled = pd.read_csv('results/irat_average_qdistances_current.csv')

# NOTE(review): positional row index. A later cell sorts this CSV and saves it
# back to the same path, so re-running this cell against the sorted file would
# address a different row of df_filled.
n = 22
ha_irat_seq = np.array(list(df['HA Sequence'].iloc[n][:HA_TRUNC]))
na_irat_seq = np.array(list(df['NA Sequence'].iloc[n][:NA_TRUNC]))

ha_risk = []
na_risk = []
both_risk = []
geom_mean = []

for STRAIN in tqdm(STRAINS):
    # skip if enet doesn't exist
    # (STRAINS holds subtype names only, so no per-virus exclusion is needed)
    if not os.path.exists(ENET_PATH + STRAIN + '_ha.joblib') or not os.path.exists(ENET_PATH + STRAIN + '_na.joblib'):
        continue

    # load enets
    ha_enet = load_qnet(ENET_PATH + STRAIN + '_ha.joblib')
    na_enet = load_qnet(ENET_PATH + STRAIN + '_na.joblib')

    # access all sequences of this subtype and sample up to 1000 of them
    # (sequence_array also appends the IRAT strain itself as the last row)
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)

    # compute qdistance sums, leaving NaN distances out of both the sum and
    # the count
    ha_qdist_sum = 0
    na_qdist_sum = 0
    num_ha = len(ha_arr)
    num_na = len(na_arr)
    for j in range(len(ha_arr)):
        qdist = qdistance(ha_irat_seq, ha_arr[j], ha_enet, ha_enet)
        if np.isnan(qdist):
            num_ha -= 1
            continue
        ha_qdist_sum += qdist
    for j in range(len(na_arr)):
        qdist = qdistance(na_irat_seq, na_arr[j], na_enet, na_enet)
        if np.isnan(qdist):
            num_na -= 1
            continue
        na_qdist_sum += qdist

    # compute qdistance averages; cast to plain floats so the lists are
    # written to the CSV as bare numbers whatever the NumPy scalar repr is
    ha_mean = ha_qdist_sum/num_ha
    na_mean = na_qdist_sum/num_na
    ha_risk.append(float(ha_mean))
    na_risk.append(float(na_mean))
    both_risk.append(float((ha_qdist_sum + na_qdist_sum)/(num_ha + num_na)))
    geom_mean.append(float(np.sqrt(ha_mean * na_mean)))

# save to results dataframe (one list per cell, like the other rows)
df_filled.at[n, 'HA Qdistance'] = ha_risk
df_filled.at[n, 'NA Qdistance'] = na_risk
df_filled.at[n, 'Arithmetic Mean'] = both_risk
df_filled.at[n, 'Geometric Mean'] = geom_mean

  0%|          | 0/11 [00:00<?, ?it/s]

In [45]:
# Display positional row 22 of df_filled as a one-row dataframe.
df_filled.iloc[[22]]

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Qdistance,NA Qdistance,Arithmetic Mean,Geometric Mean
22,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1,-1,"[0.03929273071556801, 0.3559593557688923, 0.15...","[0.01867794582527127, 0.0884568883907169, 0.03...","[0.02898533827041964, 0.2222081220798046, 0.09...","[0.027090727115239115, 0.1774459269887038, 0.0..."


In [46]:
# save dataframe as csv, overwriting the file it was loaded from
# (index=False: the row index is not written as a column)
df_filled.to_csv('results/irat_average_qdistances_current.csv', index=False)  

## Average Results

In [47]:
# Reduce every list-valued metric column to a mean and a variance, order the
# strains by IRAT emergence score (highest first) and overwrite the CSV.
df_filled = pd.read_csv('results/irat_average_qdistances_current.csv')

# (source column, mean column, variance column)
summary_columns = [
    ('HA Qdistance', 'Avg. HA Qdistance', 'Var. HA Qdistance'),
    ('NA Qdistance', 'Avg. NA Qdistance', 'Var. NA Qdistance'),
    ('Arithmetic Mean', 'Avg. Arithmetric Mean', 'Var. Arithmetric Mean'),
    ('Geometric Mean', 'Avg. Geometric Mean', 'Var. Geometric Mean'),
]
for source_col, mean_col, var_col in summary_columns:
    # The CSV stores each cell as text, so it is turned back into a Python
    # object first.
    # NOTE(review): eval() executes whatever text the CSV cell contains; only
    # run this on files produced by this notebook.
    parsed = df_filled[source_col].apply(eval)
    df_filled[mean_col] = parsed.apply(np.mean)
    df_filled[var_col] = parsed.apply(np.var)

df_filled = df_filled.sort_values(by='Potential Emergence Estimate', ascending=False)
df_filled.to_csv('results/irat_average_qdistances_current.csv', index=False)
df_filled

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,...,Arithmetic Mean,Geometric Mean,Avg. HA Qdistance,Var. HA Qdistance,Avg. NA Qdistance,Var. NA Qdistance,Avg. Arithmetric Mean,Var. Arithmetric Mean,Avg. Geometric Mean,Var. Geometric Mean
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1826,1831,...,"[0.0504028571379133, 0.05020971755437922, 0.05...","[0.04961693798040306, 0.04939691038300986, 0.0...",0.059325,4.396109e-08,0.041355,2.98951e-08,0.05034,1.048258e-08,0.049531,1.039585e-08
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,18820,18819,...,"[0.03794927601838734, 0.038008387254598104, 0....","[0.02257302209884124, 0.0227066090672604, 0.02...",0.008455,9.058305e-06,0.068417,3.483698e-09,0.038436,2.272947e-06,0.02378,1.298575e-05
18,A/Hong Kong/125/2017,H7N9,May 2017,6.5,7.5,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,1248,1247,...,"[0.004877826668721765, 0.006650369531842794, 0...","[0.0037028892207661, 0.004325280096255358, 0.0...",0.009787,4.701049e-06,0.003446,8.910172e-06,0.006616,5.087076e-06,0.005481,7.056662e-06
19,A/Shanghai/02/2013,H7N9,Apr 2016,6.4,7.2,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAIIIGAIAVLIGMANLGLNIGLHLKPGCNCSHS...,1248,1247,...,"[0.0022201500757578476, 0.0026407075614291124,...","[0.00210888252534967, 0.002386143249933443, 0....",0.003131,1.506534e-07,0.00317,8.648503e-06,0.003154,2.085731e-06,0.002846,1.369962e-06
21,A/Anhui-Lujiang/39/2018,H9N2,Jul 2019,6.2,5.9,Moderate,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,58,58,...,"[0.019890838095902173, 0.02909631887940379, 0....","[0.018971508708604274, 0.025275351068975476, 0...",0.014639,2.648809e-06,0.035348,0.0003212412,0.024993,8.650718e-05,0.022034,4.304632e-05
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,18820,18819,...,"[0.016938613241492626, 0.01687016264956128, 0....","[0.016902504332314258, 0.01683599355230769, 0....",0.016789,8.688213e-06,0.017994,6.637876e-09,0.017392,2.192359e-06,0.017326,1.935685e-06
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,37,37,...,"[0.1511832900942173, 0.104406568309434, 0.1195...","[0.11064301441040252, 0.08010185536798671, 0.0...",0.1961,0.0008630988,0.045251,2.467554e-05,0.120676,0.0002724791,0.094064,0.000126788
20,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,58,58,...,"[0.2615864368996027, 0.25348700840317945, 0.26...","[0.1509708016952567, 0.14755990149418968, 0.15...",0.047122,1.532937e-06,0.467093,0.000251183,0.257108,6.576701e-05,0.148334,1.277045e-05
10,A/Sichuan/06681/2021,H5N6,Oct 2021,5.3,6.3,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,46,46,...,"[0.19929853090062497, 0.24969785533520214, 0.1...","[0.14098323610429828, 0.16479919492131653, 0.1...",0.340975,0.001377656,0.06138,1.752663e-05,0.201178,0.0003347547,0.144336,6.820357e-05
8,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,48,45,...,"[0.0868428970015986, 0.07644320161533989, 0.09...","[0.07357578392227798, 0.06512340773617227, 0.0...",0.128744,0.0003333802,0.039219,3.189339e-06,0.085847,0.0001031423,0.070949,3.950884e-05


In [3]:
# Correlation of the two IRAT scores (the first two numeric columns) with the
# averaged geometric-mean q-distance, over the first 22 rows of the saved
# table. Numeric columns are selected explicitly because DataFrame.corr()
# raises on string columns in pandas >= 2.0.
pd.read_csv('results/irat_average_qdistances_current.csv')[:22].select_dtypes(include='number').corr()[:2][['Avg. Geometric Mean']]

Unnamed: 0,Avg. Geometric Mean
Potential Emergence Estimate,-0.601374
Potential Impact Estimate,-0.420468
