# IRAT vs. Qnet Comparison
- Compares risk assesment from IRAT and risk assessment using Qnet q-distance
- Use both NA and HA segments
- For each strain previously analyzed by IRAT
    - Collect strains one year leading up to month of analysis
    - For example, the "A/swine/Shandong/1207/2016" strain was assessed by IRAT in July 2020, so we will use human H1N1 strains circulating between July 1, 2019 through June 30, 2020
    - Note: had difficulty finding 'A/duck/New York/1996', only NA available, do not use in final results
    - For the following strains, only use upper bound of date due to small sample size
        - H1N2
        - H5N1
        - H5N6
        - H7N7
        - H9N2
    - Strains with 'Qnet Sample' = -1 have no available human strains
    - Construct a Qnet using these strains **if there are more than 30 strains in the population for both NA and HA**
    - Compute the average q-distance among the strain in question and the circulating human strains for both NA and HA
    - Average the NA and HA averages (using arithmetic and geometric mean)

In [62]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
from scipy.stats import gmean
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet
from zedstat.textable import textable
from emergenet.emergenet import Enet, save_model, load_model

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [2]:
NCBI_PATH = 'raw_data/ncbi/'
GISAID_PATH = 'raw_data/gisaid/'
IRAT_PATH = 'results/'

NA_TRUNC = 449
HA_TRUNC = 550

df = pd.read_csv(IRAT_PATH + 'irat_data.csv')

## Creating New Qnet
- Truncate NA at 449 amino acids, HA at 550 amino acids (to prevent too many strains from being filtered out)
- Construct a Qnet using these strains **if there are more than 30 strains in the population** after truncating to match the length of the IRAT strain
- Make sure to include IRAT sequence in training the Qnet
- Save Qnet as "VIRUS_NAME_na.joblib" or "VIRUS_NAME_ha.joblib"

In [3]:
# input: fasta file name, length to truncate each sequence
# output: dataframe of sequences
def parse_fasta(file_name, trunc):
    acc = []
    seq = []
    for record in SeqIO.parse(file_name, 'fasta'):
        if len(record.seq) < trunc:
            continue
        acc.append(record.id.split('|')[0])
        seq.append(np.array(record.seq[:trunc].upper()))
    df = pd.DataFrame({'name':acc, 'sequence':seq})
    return df


# input: dataframe of sequences, number of samples, IRAT strain
# output: array of nucleotide lists
def sequence_array(seq_df, sample_size, IRAT_strain):
    seqs = seq_df['sequence'].sample(sample_size, random_state = 42).values
    seq_lst = []
    for seq in seqs:
        seq_lst.append(seq)
    seq_lst.append(np.array(list(IRAT_strain)))
    return np.array(seq_lst)


# input: name to call qnet, array of nucleotide lists, number of nucleotides
# output: save qnet as joblib
def train_save_qnet(name, seq_arr, num_nuc):
    myqnet = Qnet(feature_names=['x'+str(i) for i in np.arange(num_nuc)],n_jobs=1)
    myqnet.fit(seq_arr)
    save_qnet(myqnet, 'qnet_models/' + name + '.joblib')

In [45]:
for i in trange(len(df)):
    if df['HA Qnet Sample'].iloc[i] == -1 or df['NA Qnet Sample'].iloc[i] == -1:
        continue
    
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    ha_irat_seq = df['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df['NA Sequence'].iloc[i][:NA_TRUNC]
    
    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    
    # skip if less than 30 sequences available or qnet already exists 
    if len(ha_df) < 30 or os.path.exists('qnet_models/' + STRAIN + '_ha.joblib'):
        continue
    if len(na_df) < 30 or os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        continue
    
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    train_save_qnet(STRAIN + '_ha', ha_arr, HA_TRUNC)
    train_save_qnet(STRAIN + '_na', na_arr, NA_TRUNC)

  0%|          | 0/23 [00:00<?, ?it/s]

## Average Qdistance
- Compute average qdistance between IRAT strain and the rest of the strains

In [None]:
avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []

for i in trange(len(df)):
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    
    # skip if Qnet doesn't exist
    if not os.path.exists('qnet_models/' + STRAIN + '_ha.joblib') or not os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(-1)
        avg_qdists_both.append(-1)
        continue
    # skip if one of the sequences doesn't exist
    if df['HA Sequence'].iloc[i] == '-1' or df['HA Sequence'].iloc[i] == '-1':
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(-1)
        avg_qdists_both.append(-1)
        continue
        
    # load qnets
    ha_qnet = load_qnet('qnet_models/' + STRAIN + '_ha.joblib')
    na_qnet = load_qnet('qnet_models/' + STRAIN + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        ha_qdist_sum += qdistance(ha_irat_seq, ha_arr[j], ha_qnet, ha_qnet)
    for j in range(len(na_arr)):
        na_qdist_sum += qdistance(na_irat_seq, na_arr[j], na_qnet, na_qnet)
        
    # compute qdistance averages
    avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
    avg_qdists_na.append(na_qdist_sum/len(na_arr))
    avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))
    

df['HA Average Qdistance'] = avg_qdists_ha
df['NA Average Qdistance'] = avg_qdists_na
df['Both Average Qdistance'] = avg_qdists_both

In [6]:
# geometric mean column
df = pd.read_csv('results/irat_average_qdistances.csv')
df['Geometric Mean'] = np.sqrt(df['HA Average Qdistance']*df['NA Average Qdistance'])
df.sort_values(by = 'Potential Emergence Estimate')

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,1000.0,1000.0,-1.0,-1.0,-1.0,1.0
17,A/chicken/Tennessee/17-007147-2/2017,H7N9,Oct 2017,2.8,3.5,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,496.0,495.0,0.209532,0.25409,0.231788,0.230738
16,A/chicken/Tennessee/17-007431-3/2017,H7N9,Oct 2017,3.1,3.5,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,496.0,495.0,0.102686,0.256855,0.179693,0.162405
15,A/turkey/Indiana/1573-2/2016,H7N8,Jul 2017,3.4,3.9,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNKKIITIGSVSLGLVVLNILLHIVSITVTVLVLPGNGNNGNCN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
7,A/American green-winged teal/Washington/195705...,H5N1,Mar 2015,3.6,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVIGIISLVLQIGNIISIWVSHSIQTGSQNHP...,326.0,314.0,0.191127,0.448228,0.316856,0.292691
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,1000.0,1000.0,0.060665,0.150856,0.105761,0.095664
9,A/Northern pintail/Washington/40964/2014,H5N2,Mar 2015,3.8,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSVSLTIATVCFLMQIAILATTVTLHFKQNECSIPP...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
13,A/gyrfalcon/Washington/41088/2014,H5N8,Mar 2015,4.2,4.6,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIVTIGSLSLGLVVFNVLLHAVSIILTVLALGKSENNGICN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
22,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
6,A/American wigeon/South Carolina/AH0195145/2021,H5N1,Mar 2022,4.4,5.1,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITTIGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,335.0,323.0,0.17218,0.511387,0.337368,0.296734


In [5]:
# save dataframe as csv
os.makedirs('results', exist_ok=True)
df.to_csv('results/irat_average_qdistances.csv', index=False)  

In [None]:
df_1 = pd.read_csv('results/irat_average_qdistances.csv', index_col=0)
df_1.drop(columns=['Summary Risk Score Category', 'HA Sequence', 'NA Sequence', 'Both Average Qdistance'], inplace=True)
# textable(df_1,tabname='results/irat_average_qdistances.tex', FORMAT='%1.4f')

## Filling in the Table
- To get a risk prediction score for each IRAT strain (The additional 8 strains)
    - Use all human strains that match the H number, i.e. H5NX for H5N6
- List of remaining strains
- For the following strains, only use upper bound of date due to small sample size
    - H5N2
    - H5N6
    - H5N8
- To be done
    - A/duck/New York/1996 (missing target strain HA)
    - A/Jiangxi-Donghu/346/2013 (only 5 H10N8 human strains ever recorded, 3 of them are the IRAT target strain itself)
    - A/Bangladesh/0994/2011 (only 12 H9N2 human strains up to Feb 2014)

In [46]:
df = pd.read_csv('results/irat_average_qdistances.csv')
df2 = df[(df['HA Qnet Sample'] <= 30) & (df['NA Qnet Sample'] <= 30)]
df2

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
9,A/Northern pintail/Washington/40964/2014,H5N2,Mar 2015,3.8,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSVSLTIATVCFLMQIAILATTVTLHFKQNECSIPP...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
11,A/Yunnan/14564/2015,H5N6,Apr 2016,5.0,6.6,Moderate,MEKMVLLLAAVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSVVSLLIGIANLGLNIGLHYKVSDSTTIN...,16.0,16.0,-1.0,-1.0,-1.0,1.0
12,A/Astrakhan/3212/2020,H5N8,Mar 2021,4.6,5.2,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIATIGSISLGLVVFNVLLHALNIILMVLALGKSENNGICK...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
13,A/gyrfalcon/Washington/41088/2014,H5N8,Mar 2015,4.2,4.6,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIVTIGSLSLGLVVFNVLLHAVSIILTVLALGKSENNGICN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
15,A/turkey/Indiana/1573-2/2016,H7N8,Jul 2017,3.4,3.9,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNKKIITIGSVSLGLVVLNILLHIVSITVTVLVLPGNGNNGNCN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
20,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,13.0,12.0,-1.0,-1.0,-1.0,1.0
22,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0


In [31]:
for i in trange(len(df2)-2):
    STRAIN = df2['Influenza Virus'].iloc[i].replace('/',':')
    ha_irat_seq = df2['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df2['NA Sequence'].iloc[i][:NA_TRUNC]
    
    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    
    # skip if qnet already exists 
    if os.path.exists('qnet_models/' + STRAIN + '_ha.joblib'):
        continue
    if os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        continue
    
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    train_save_qnet(STRAIN + '_ha', ha_arr, HA_TRUNC)
    train_save_qnet(STRAIN + '_na', na_arr, NA_TRUNC)

  0%|          | 0/5 [00:00<?, ?it/s]

In [32]:
ha_sample = []
na_sample = []
avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []

for i in trange(len(df2)):
    STRAIN = df2['Influenza Virus'].iloc[i].replace('/',':')
    
    # skip if Qnet doesn't exist
    if not os.path.exists('qnet_models/' + STRAIN + '_ha.joblib') or not os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        ha_sample.append(-1)
        na_sample.append(-1)
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(-1)
        avg_qdists_both.append(-1)
        continue
        
    # load qnets
    ha_qnet = load_qnet('qnet_models/' + STRAIN + '_ha.joblib')
    na_qnet = load_qnet('qnet_models/' + STRAIN + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df2['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df2['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    # update qnet sample size
    ha_sample.append(min(1000, len(ha_df)))
    na_sample.append(min(1000, len(na_df)))
    
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        ha_qdist_sum += qdistance(ha_irat_seq, ha_arr[j], ha_qnet, ha_qnet)
    for j in range(len(na_arr)):
        na_qdist_sum += qdistance(na_irat_seq, na_arr[j], na_qnet, na_qnet)
        
    # compute qdistance averages
    avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
    avg_qdists_na.append(na_qdist_sum/len(na_arr))
    avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))

  0%|          | 0/7 [00:00<?, ?it/s]

In [63]:
df2['HA Average Qdistance'] = avg_qdists_ha
df2['NA Average Qdistance'] = avg_qdists_na
df2['Both Average Qdistance'] = avg_qdists_both
df2['HA Qnet Sample'] = ha_sample
df2['NA Qnet Sample'] = na_sample
df2['Geometric Mean'] = np.sqrt(df2['HA Average Qdistance']*df2['NA Average Qdistance'])

In [60]:
df1 = df[(df['HA Qnet Sample'] > 30) | (df['NA Qnet Sample'] > 30)]
df_filled = df1.append(df2).sort_values(by = 'Potential Emergence Estimate', ascending = False)
df_filled

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000.0,1000.0,0.094135,0.02053,0.057332,0.043961
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000.0,1000.0,0.018431,0.030623,0.024527,0.023757
18,A/Hong Kong/125/2017,H7N9,May 2017,6.5,7.5,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,437.0,437.0,0.029599,0.005775,0.017687,0.013074
19,A/Shanghai/02/2013,H7N9,Apr 2016,6.4,7.2,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAIIIGAIAVLIGMANLGLNIGLHLKPGCNCSHS...,178.0,178.0,0.005495,0.003556,0.004525,0.00442
21,A/Anhui-Lujiang/39/2018,H9N2,Jul 2019,6.2,5.9,Moderate,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,31.0,30.0,0.029024,0.16809,0.098557,0.069847
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000.0,1000.0,0.052311,0.009103,0.030707,0.021822
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,55.0,55.0,0.108902,0.060951,0.084926,0.081472
20,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
10,A/Sichuan/06681/2021,H5N6,Oct 2021,5.3,6.3,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,45.0,45.0,0.361591,0.051794,0.206692,0.136852
8,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,258.0,246.0,0.167254,0.011074,0.091342,0.043037


In [61]:
# save dataframe as csv
os.makedirs('results', exist_ok=True)
df_filled.to_csv('results/irat_average_qdistances_filled.csv', index=False)  

In [64]:
df_filled_1 = pd.read_csv('results/irat_average_qdistances_filled.csv', index_col=0)
df_filled_1.drop(columns=['Summary Risk Score Category', 'HA Sequence', 'NA Sequence', 'Both Average Qdistance'], inplace=True)
# textable(df_filled_1,tabname='results/irat_average_qdistances_filled.tex', FORMAT='%1.4f')

## A/duck/New York/1996
- A/duck/New York/1996 is missing target strain HA
- Use NA score only

In [26]:
df_filled_1 = pd.read_csv('results/irat_average_qdistances_filled.csv')
df3 = df_filled_1[22:]
df3

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
22,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,1000.0,1000.0,-1.0,-1.0,-1.0,1.0


## A/Bangladesh/0994/2011	
- Compute risk score using every qnet from the other strains, and take the average among NA and HA
- Take the geometric mean of the resulting NA and HA averages

In [85]:
df = pd.read_csv(IRAT_PATH + 'irat_data.csv')
df_filled = pd.read_csv('results/irat_average_qdistances_filled.csv')
bangladesh_ha = np.array(list(df['HA Sequence'].iloc[20][:HA_TRUNC]))
bangladesh_na = np.array(list(df['NA Sequence'].iloc[20][:NA_TRUNC]))

In [41]:
bangladesh_qnet_used = []
bangladesh_ha_risk = []
bangladesh_na_risk = []
bangladesh_both_risk = []

for i in trange(len(df)):
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    
    # skip if Qnet doesn't exist
    if not os.path.exists('qnet_models/' + STRAIN + '_ha.joblib') or not os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        continue
    # skip duck qnet
    if STRAIN == 'A:duck:New York:1996':
        continue
        
    # load qnets
    ha_qnet = load_qnet('qnet_models/' + STRAIN + '_ha.joblib')
    na_qnet = load_qnet('qnet_models/' + STRAIN + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), bangladesh_ha)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), bangladesh_na)
    
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        qdist = qdistance(bangladesh_ha, ha_arr[j], ha_qnet, ha_qnet)
        if np.isnan(qdist):
            continue
        ha_qdist_sum += qdist
    for j in range(len(na_arr)):
        qdist = qdistance(bangladesh_na, na_arr[j], na_qnet, na_qnet)
        if np.isnan(qdist):
            continue
        na_qdist_sum += qdist
        
    # compute qdistance averages
    bangladesh_qnet_used.append(STRAIN)
    bangladesh_ha_risk.append(ha_qdist_sum/len(ha_arr))
    bangladesh_na_risk.append(na_qdist_sum/len(na_arr))
    bangladesh_both_risk.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))

  0%|          | 0/23 [00:00<?, ?it/s]

In [60]:
bangladesh_df = pd.DataFrame({'Qnet Used':bangladesh_qnet_used,
                              'HA Average Qdistance':bangladesh_ha_risk,
                              'NA Average Qdistance':bangladesh_na_risk,
                              'Both Average Qdistance':bangladesh_both_risk})
bangladesh_df['Geometric Mean'] = np.sqrt(bangladesh_df['HA Average Qdistance']*bangladesh_df['NA Average Qdistance'])
bangladesh_df.to_csv('results/bangladesh.csv', index=False)  
bangladesh_df.sort_values(by='Geometric Mean')

Unnamed: 0,Qnet Used,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
18,A:Shanghai:02:2013,0.022335,0.021804,0.022069,0.022068
3,A:Indiana:08:2011,0.038815,0.017687,0.028251,0.026202
0,A:swine:Shandong:1207:2016,0.042556,0.027112,0.034834,0.033967
2,A:Ohio:13:2017,0.039081,0.03045,0.034765,0.034496
4,A:canine:Illinois:12191:2015,0.028716,0.091761,0.060239,0.051332
17,A:Hong Kong:125:2017,0.149791,0.030714,0.090253,0.067829
1,A:California:62:2018,0.286053,0.054321,0.170187,0.124654
15,A:chicken:Tennessee:17-007431-3:2017,0.171961,0.094422,0.133231,0.127424
16,A:chicken:Tennessee:17-007147-2:2017,0.202892,0.085769,0.14439,0.131916
19,A:Anhui-Lujiang:39:2018,0.058182,0.378813,0.218498,0.14846


In [106]:
df_filled.at[7, 'HA Average Qdistance'] = np.mean(bangladesh_df['HA Average Qdistance'])
df_filled.at[7, 'NA Average Qdistance'] = np.mean(bangladesh_df['NA Average Qdistance'])
df_filled.at[7, 'Both Average Qdistance'] = (np.mean(bangladesh_df['HA Average Qdistance']) + np.mean(bangladesh_df['NA Average Qdistance']))/2
df_filled.at[7, 'Geometric Mean'] = np.sqrt(np.mean(bangladesh_df['HA Average Qdistance'])*np.mean(bangladesh_df['NA Average Qdistance']))
# save dataframe as csv
df_filled.to_csv('results/irat_average_qdistances_filled.csv', index=False)  
df_filled[7:8]

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
7,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,-1.0,-1.0,0.2078,0.182338,0.195069,0.194653


## A/Jiangxi-Donghu/346/2013
- Compute risk score using every qnet from the other strains, and take the average among NA and HA
- Take the geometric mean of the resulting NA and HA averages

In [99]:
df = pd.read_csv(IRAT_PATH + 'irat_data.csv')
df_filled = pd.read_csv('results/irat_average_qdistances_filled.csv')
jiangxi_ha = np.array(list(df['HA Sequence'].iloc[22][:HA_TRUNC]))
jiangxi_na = np.array(list(df['NA Sequence'].iloc[22][:NA_TRUNC]))

In [46]:
jiangxi_qnet_used = []
jiangxi_ha_risk = []
jiangxi_na_risk = []
jiangxi_both_risk = []

for i in trange(len(df)):
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    
    # skip if Qnet doesn't exist
    if not os.path.exists('qnet_models/' + STRAIN + '_ha.joblib') or not os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        continue
    # skip duck qnet
    if STRAIN == 'A:duck:New York:1996':
        continue
        
    # load qnets
    ha_qnet = load_qnet('qnet_models/' + STRAIN + '_ha.joblib')
    na_qnet = load_qnet('qnet_models/' + STRAIN + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), jiangxi_ha)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), jiangxi_na)
    
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        qdist = qdistance(jiangxi_ha, ha_arr[j], ha_qnet, ha_qnet)
        if np.isnan(qdist):
            continue
        ha_qdist_sum += qdist
    for j in range(len(na_arr)):
        qdist = qdistance(jiangxi_na, na_arr[j], na_qnet, na_qnet)
        if np.isnan(qdist):
            continue
        na_qdist_sum += qdist
        
    # compute qdistance averages
    jiangxi_qnet_used.append(STRAIN)
    jiangxi_ha_risk.append(ha_qdist_sum/len(ha_arr))
    jiangxi_na_risk.append(na_qdist_sum/len(na_arr))
    jiangxi_both_risk.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))

  0%|          | 0/23 [00:00<?, ?it/s]

In [49]:
jiangxi_df = pd.DataFrame({'Qnet Used':jiangxi_qnet_used,
                              'HA Average Qdistance':jiangxi_ha_risk,
                              'NA Average Qdistance':jiangxi_na_risk,
                              'Both Average Qdistance':jiangxi_both_risk})
jiangxi_df['Geometric Mean'] = np.sqrt(jiangxi_df['HA Average Qdistance']*jiangxi_df['NA Average Qdistance'])
jiangxi_df.to_csv('results/jiangxi.csv', index=False)  
jiangxi_df.sort_values(by='Geometric Mean')

Unnamed: 0,Qnet Used,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
18,A:Shanghai:02:2013,0.016857,0.024116,0.020487,0.020163
0,A:swine:Shandong:1207:2016,0.042382,0.017829,0.030105,0.027488
2,A:Ohio:13:2017,0.032353,0.033827,0.03309,0.033082
4,A:canine:Illinois:12191:2015,0.027729,0.093973,0.060851,0.051047
17,A:Hong Kong:125:2017,0.165551,0.0184,0.091976,0.055192
15,A:chicken:Tennessee:17-007431-3:2017,0.15327,0.046255,0.099816,0.084199
16,A:chicken:Tennessee:17-007147-2:2017,0.170948,0.043211,0.107144,0.085947
3,A:Indiana:08:2011,0.044498,0.205118,0.124808,0.095538
1,A:California:62:2018,0.277616,0.077601,0.177609,0.146777
19,A:Anhui-Lujiang:39:2018,0.060141,0.400016,0.230078,0.155104


In [105]:
df_filled.at[14, 'HA Average Qdistance'] = np.mean(jiangxi_df['HA Average Qdistance'])
df_filled.at[14, 'NA Average Qdistance'] = np.mean(jiangxi_df['NA Average Qdistance'])
df_filled.at[14, 'Both Average Qdistance'] = (np.mean(jiangxi_df['HA Average Qdistance']) + np.mean(jiangxi_df['NA Average Qdistance']))/2
df_filled.at[14, 'Geometric Mean'] = np.sqrt(np.mean(jiangxi_df['HA Average Qdistance'])*np.mean(jiangxi_df['NA Average Qdistance']))
# save dataframe as csv
df_filled.to_csv('results/irat_average_qdistances_filled.csv', index=False)  
df_filled[14:15]

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
14,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1.0,-1.0,0.208658,0.210089,0.209373,0.209372
