# IRAT vs. Enet - Current Score Predictions
- **Note: Enet and Qnet are interchangeable, Qnet was the old name**
- Compares risk assesment from IRAT and risk assessment using Qnet q-distance
- Collected human HA and NA strains within the past year (September 30, 2021 - September 30, 2022)
    - If not enough strains available, take strains in last decade, this was done for all strains but H1N1 and H3N2
    - If even the above is not enough, use HXN_, where X matches the strain in question and N can be anything
    - No strains available for H10N_, so exclude A/Jiangxi-Donghu/346/2013	
    - A/duck/New York/1996 excluded
- Compute the average q-distance among the strain in question and the circulating human strains for both NA and HA
- Average the NA and HA averages (using arithmetic and geometric mean)

In [4]:
# basic imports
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import trange, tqdm

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# enet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet
from zedstat.textable import textable
from emergenet import Enet, save_model, load_model

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [5]:
GISAID_PATH = 'raw_data/gisaid_current/'
IRAT_PATH = 'results/'
ENET_PATH = 'enet_models/current_enets/'

NA_TRUNC = 449
HA_TRUNC = 550

df = pd.read_csv(IRAT_PATH + 'irat_data.csv')

## Creating New Enet
- Truncate NA at 449 amino acids, HA at 550 amino acids (to prevent too many strains from being filtered out)
- Construct a Enet using these strains **if there are more than 30 strains in the population** after truncating to match the length of the IRAT strain
- Make sure to include IRAT sequence in training the Enet
- Save Enet as "VIRUS_NAME_na.joblib" or "VIRUS_NAME_ha.joblib"

In [13]:
# input: fasta file name, length to truncate each sequence
# output: dataframe of sequences
def parse_fasta(file_name, trunc):
    acc = []
    seq = []
    for record in SeqIO.parse(file_name, 'fasta'):
        if len(record.seq) < trunc:
            continue
        acc.append(record.id.split('|')[0])
        seq.append(np.array(record.seq[:trunc].upper()))
    df = pd.DataFrame({'name':acc, 'sequence':seq})
    return df


# input: dataframe of sequences, number of samples, IRAT strain
# output: array of nucleotide lists
def sequence_array(seq_df, sample_size, IRAT_strain, random_state = 42):
    seqs = seq_df['sequence'].sample(sample_size, random_state = random_state).values
    seq_lst = []
    for seq in seqs:
        seq_lst.append(seq)
    seq_lst.append(np.array(list(IRAT_strain)))
    return np.array(seq_lst)


# input: name to call enet, array of nucleotide lists, number of nucleotides
# output: save enet as joblib
def train_save_enet(name, seq_arr, num_nuc):
    myenet = Qnet(feature_names=['x'+str(i) for i in np.arange(num_nuc)],n_jobs=1)
    myenet.fit(seq_arr)
    save_qnet(myenet, ENET_PATH + name + '.joblib')

In [7]:
# find new sample sizes HA and NA
ha_sample = []
na_sample = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    if VIRUS_TYPE == 'h10n8':
        ha_sample.append(-1)
        na_sample.append(-1)
        continue
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)
    ha_sample.append(len(ha_df))
    na_sample.append(len(na_df))
    
df['HA Qnet Sample'] = ha_sample
df['NA Qnet Sample'] = na_sample

  0%|          | 0/23 [00:00<?, ?it/s]

In [9]:
for i in trange(len(df)):
    if df['HA Qnet Sample'].iloc[i] == -1 or df['NA Qnet Sample'].iloc[i] == -1:
        continue
    
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    ha_irat_seq = df['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df['NA Sequence'].iloc[i][:NA_TRUNC]
    
    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)
    
    # skip if less than 30 sequences available or enet already exists 
    if len(ha_df) < 30 or os.path.exists(ENET_PATH + VIRUS_TYPE + '_ha.joblib'):
        continue
    if len(na_df) < 30 or os.path.exists(ENET_PATH + VIRUS_TYPE + '_na.joblib'):
        continue
    
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    train_save_enet(VIRUS_TYPE + '_ha', ha_arr, HA_TRUNC)
    train_save_enet(VIRUS_TYPE + '_na', na_arr, NA_TRUNC)

  0%|          | 0/23 [00:00<?, ?it/s]

## Average Qdistance
- Compute average qdistance between IRAT strain and the rest of the strains
- Do this 10 times for each strain, with 100 samples from the human population each time
- This is to compute variances

In [14]:
avg_qdists_ha_10 = []
avg_qdists_na_10 = []
avg_qdists_both_10 = []
avg_qdists_geom_10 = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    
    # skip if enet doesn't exist or one of the sequences doesn't exist
    if not os.path.exists(ENET_PATH + VIRUS_TYPE + '_ha.joblib')\
    or not os.path.exists(ENET_PATH + VIRUS_TYPE + '_na.joblib')\
    or df['HA Sequence'].iloc[i] == '-1' or df['HA Sequence'].iloc[i] == '-1':
        avg_qdists_ha_10.append(-1)
        avg_qdists_na_10.append(-1)
        avg_qdists_both_10.append(-1)
        avg_qdists_geom_10.append(-1)
        continue
        
    # load enets
    ha_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_ha.joblib')
    na_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)
    
    avg_qdists_ha = []
    avg_qdists_na = []
    avg_qdists_both = []
    avg_qdists_geom = []
    # repeat 10 times for variance computation
    for j in range(42, 52):
        ha_arr = sequence_array(ha_df, min(100, len(ha_df)//2), ha_irat_seq, random_state=j)
        na_arr = sequence_array(na_df, min(100, len(na_df)//2), na_irat_seq, random_state=j)
        # compute qdistance sum
        ha_qdist_sum = 0
        na_qdist_sum = 0
        for k in range(len(ha_arr)):
            qdist = qdistance(ha_irat_seq, ha_arr[k], ha_enet, ha_enet)
            if np.isnan(qdist):
                continue
            ha_qdist_sum += qdist
        for k in range(len(na_arr)):
            qdist = qdistance(na_irat_seq, ha_arr[k], ha_enet, ha_enet)
            if np.isnan(qdist):
                continue
            na_qdist_sum += qdist
        # compute qdistance averages
        avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
        avg_qdists_na.append(na_qdist_sum/len(na_arr))
        avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))
        avg_qdists_geom.append(np.sqrt((ha_qdist_sum/len(ha_arr)) * (na_qdist_sum/len(na_arr))))
    
    avg_qdists_ha_10.append(avg_qdists_ha)
    avg_qdists_na_10.append(avg_qdists_na)
    avg_qdists_both_10.append(avg_qdists_both) 
    avg_qdists_geom_10.append(avg_qdists_geom)

df['HA Qdistance'] = avg_qdists_ha_10
df['NA Qdistance'] = avg_qdists_na_10
df['Arithmetic Mean'] = avg_qdists_both_10
df['Geometric Mean'] = avg_qdists_geom_10

# save dataframe as csv
os.makedirs('results', exist_ok=True)
df.to_csv('results/irat_average_qdistances_current_new.csv', index=False)  

  0%|          | 0/23 [00:00<?, ?it/s]

In [19]:
df_filled

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Qdistance,NA Qdistance,Arithmetic Mean,Geometric Mean
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000,1000,"[0.059268942695459764, 0.059207549599012387, 0...","[0.04153677158036683, 0.04121188550974603, 0.0...","[0.0504028571379133, 0.05020971755437922, 0.05...","[0.04961693798040306, 0.04939691038300986, 0.0..."
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,1000,1000,-1,-1,-1,-1
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,37,37,"[0.25421003694129446, 0.17137241238504203, 0.1...","[0.04815654324714014, 0.037440724233825974, 0....","[0.1511832900942173, 0.104406568309434, 0.1195...","[0.11064301441040252, 0.08010185536798671, 0.0..."
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000,1000,"[0.007443436845452373, 0.007528108280118501, 0...","[0.0684551151913223, 0.06848866622907772, 0.06...","[0.03794927601838734, 0.038008387254598104, 0....","[0.02257302209884124, 0.0227066090672604, 0.02..."
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000,1000,"[0.015833187603574352, 0.01579698562348106, 0....","[0.018044038879410897, 0.017943339675641504, 0...","[0.016938613241492626, 0.01687016264956128, 0....","[0.016902504332314258, 0.01683599355230769, 0...."
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,1000,1000,"[0.01616230298118454, 0.016067588755834316, 0....","[0.03699740575337532, 0.036974918026864166, 0....","[0.02657985436727993, 0.02652125339134924, 0.0...","[0.024453287740176615, 0.024374121053616235, 0..."
6,A/American wigeon/South Carolina/AH0195145/2021,H5N1,Mar 2022,4.4,5.1,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITTIGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,48,45,"[0.22507345650411203, 0.253385372610634, 0.250...","[0.30452166978027756, 0.3103378783035153, 0.30...","[0.2631423920322747, 0.2806751149218063, 0.276...","[0.2618009640888491, 0.28041946959711944, 0.27..."
7,A/American green-winged teal/Washington/195705...,H5N1,Mar 2015,3.6,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVIGIISLVLQIGNIISIWVSHSIQTGSQNHP...,48,45,"[0.22532859992890514, 0.2523992705219045, 0.24...","[0.2981834341908242, 0.3031912977361294, 0.298...","[0.2602382080127414, 0.2767371168953872, 0.273...","[0.2592089036823605, 0.2766319981079351, 0.273..."
8,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,48,45,"[0.12775494527491757, 0.11190370497308454, 0.1...","[0.042373279313208424, 0.037899176226487016, 0...","[0.0868428970015986, 0.07644320161533989, 0.09...","[0.07357578392227798, 0.06512340773617227, 0.0..."
9,A/Northern pintail/Washington/40964/2014,H5N2,Mar 2015,3.8,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSVSLTIATVCFLMQIAILATTVTLHFKQNECSIPP...,95,92,"[0.2410948922603147, 0.22971447028474637, 0.21...","[0.5062597166948044, 0.5044064158788306, 0.502...","[0.37228170013843065, 0.36561469599971436, 0.3...","[0.349366042786461, 0.34039602323151935, 0.331..."


## A/Jiangxi-Donghu/346/2013
- Compute risk score using every Enet from the other strains, and take the average among NA and HA
- Take the geometric mean of the resulting NA and HA averages

In [21]:
STRAINS = ['h1n1', 'h1n2', 'h3n2', 'h5n1', 'h5n2', 'h5n6', 
           'h5n8', 'h7n7', 'h7n8', 'h7n9', 'h9n2']
df_filled = pd.read_csv('results/irat_average_qdistances_current_new.csv')

n = 22
ha_irat_seq = np.array(list(df['HA Sequence'].iloc[n][:HA_TRUNC]))
na_irat_seq = np.array(list(df['NA Sequence'].iloc[n][:NA_TRUNC]))

ha_risk = []
na_risk = []
both_risk = []
geom_mean = []

for STRAIN in tqdm(STRAINS):
    # skip if enet doesn't exist
    if not os.path.exists(ENET_PATH + STRAIN + '_ha.joblib') or not os.path.exists(ENET_PATH + STRAIN + '_na.joblib'):
        continue
    # skip duck enet
    if STRAIN == 'A:duck:New York:1996':
        continue

    # load enets
    ha_enet = load_qnet(ENET_PATH + STRAIN + '_ha.joblib')
    na_enet = load_qnet(ENET_PATH + STRAIN + '_na.joblib')

    # access irat sequences and all sequences
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)

    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        qdist = qdistance(ha_irat_seq, ha_arr[j], ha_enet, ha_enet)
        if np.isnan(qdist):
            continue
        ha_qdist_sum += qdist
    for j in range(len(na_arr)):
        qdist = qdistance(na_irat_seq, na_arr[j], na_enet, na_enet)
        if np.isnan(qdist):
            continue
        na_qdist_sum += qdist

    # compute qdistance averages
    ha_risk.append(ha_qdist_sum/len(ha_arr))
    na_risk.append(na_qdist_sum/len(na_arr))
    both_risk.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))
    geom_mean.append(np.sqrt((ha_qdist_sum/len(ha_arr)) * (na_qdist_sum/len(na_arr))))

# save to results dataframe
df_filled.at[n, 'HA Qdistance'] = ha_risk
df_filled.at[n, 'NA Qdistance'] = na_risk
df_filled.at[n, 'Arithmetic Mean'] = both_risk
df_filled.at[n, 'Geometric Mean'] = geom_mean

  0%|          | 0/11 [00:00<?, ?it/s]

In [24]:
df_filled.iloc[[22]]

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Qdistance,NA Qdistance,Arithmetic Mean,Geometric Mean
22,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1,-1,"[0.03929273071556801, 0.3559593557688923, 0.15...","[0.01867794582527127, 0.0884568883907169, 0.03...","[0.02898533827041964, 0.2222081220798046, 0.09...","[0.027090727115239115, 0.1774459269887038, 0.0..."


In [28]:
# save dataframe as csv
df_filled.to_csv('results/irat_average_qdistances_current_new.csv', index=False)  

In [29]:
VIRUS_TYPE = df['Virus Type'].iloc[19].lower()

# load enets
ha_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_ha.joblib')
na_enet = load_qnet(ENET_PATH + VIRUS_TYPE + '_na.joblib')

# access irat sequences and all sequences
ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)

avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []
avg_qdists_geom = []
# repeat 10 times for variance computation
for j in range(42, 52):
    ha_arr = sequence_array(ha_df, min(100, len(ha_df)//2), ha_irat_seq, random_state=j)
    na_arr = sequence_array(na_df, min(100, len(na_df)//2), na_irat_seq, random_state=j)
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for k in range(len(ha_arr)):
        ha_qdist_sum += qdistance(ha_irat_seq, ha_arr[k], ha_enet, ha_enet)
    for k in range(len(na_arr)):
        na_qdist_sum += qdistance(na_irat_seq, na_arr[k], na_enet, na_enet)
    # compute qdistance averages
    avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
    avg_qdists_na.append(na_qdist_sum/len(na_arr))
    avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))
    avg_qdists_geom.append(np.sqrt((ha_qdist_sum/len(ha_arr)) * (na_qdist_sum/len(na_arr)))) 

In [33]:
ha_qdist_sum

nan

In [30]:
avg_qdists_ha

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

In [31]:
avg_qdists_na

[0.0014946170772583095,
 0.0014623661892494138,
 0.0014554822455218573,
 0.005297617449924132,
 0.0011906515478159435,
 0.0013909683865565888,
 0.01010533819050793,
 0.006574688768657108,
 0.0013863540805583957,
 0.0013422156587852316]