# IRAT vs. Qnet - Current Score Predictions
- Compares the risk assessment from IRAT with a risk assessment based on the Qnet q-distance
- Collected human HA and NA strains within the past year (September 30, 2021 - September 30, 2022)
    - If not enough strains are available, take strains from the last decade instead; this was done for all subtypes except H1N1 and H3N2
    - If even the above is not enough, use HXN_, where X matches the strain in question and N can be anything
    - No strains available for H10N_, so exclude A/Jiangxi-Donghu/346/2013	
    - A/duck/New York/1996 excluded
- Compute the average q-distance between the strain in question and the circulating human strains, separately for NA and HA
- Average the NA and HA averages (using arithmetic and geometric mean)

In [20]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet
from zedstat.textable import textable
from emergenet import Enet, save_model, load_model

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [21]:
# Directory holding the per-subtype GISAID FASTA files ("<subtype>_ha.fasta" / "<subtype>_na.fasta")
GISAID_PATH = 'raw_data/gisaid_current/'
# Directory holding the IRAT input table (and, later, the result CSVs)
IRAT_PATH = 'results/'
# Directory where trained Qnets are saved and loaded as .joblib files
QNET_PATH = 'qnet_models/current_qnets/'

# Number of leading sequence positions kept for NA and HA respectively;
# FASTA records shorter than this are dropped when parsing
NA_TRUNC = 449
HA_TRUNC = 550

# IRAT table; later cells read its 'Virus Type', 'HA Sequence' and 'NA Sequence' columns
df = pd.read_csv(IRAT_PATH + 'irat_data.csv')

## Creating New Qnet
- Truncate NA at 449 amino acids, HA at 550 amino acids (to prevent too many strains from being filtered out)
- Construct a Qnet using these strains **if there are at least 30 strains in the population** after truncating to match the length of the IRAT strain
- Make sure to include IRAT sequence in training the Qnet
- Save Qnet as "VIRUS_TYPE_na.joblib" or "VIRUS_TYPE_ha.joblib", where VIRUS_TYPE is the lower-cased subtype (e.g. "h1n1_ha.joblib")

In [22]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a dataframe of fixed-length sequences.

    Records whose sequence is shorter than ``trunc`` are dropped. Every
    remaining sequence is cut to its first ``trunc`` positions and upper-cased.

    Args:
        file_name: FASTA file name handed to ``SeqIO.parse``.
        trunc: number of leading positions to keep from each sequence.

    Returns:
        DataFrame with two columns: 'name' (the record id up to the first
        '|') and 'sequence' (``np.array`` built from the truncated sequence).
    """
    names, sequences = [], []
    for record in SeqIO.parse(file_name, 'fasta'):
        # keep only records long enough to be truncated to the common length
        if len(record.seq) >= trunc:
            names.append(record.id.split('|')[0])
            sequences.append(np.array(record.seq[:trunc].upper()))
    return pd.DataFrame({'name': names, 'sequence': sequences})


def sequence_array(seq_df, sample_size, IRAT_strain):
    """Build the array of sequences used to train or score a Qnet.

    Draws ``sample_size`` entries from ``seq_df['sequence']`` with a fixed
    seed (``random_state=42``), so repeated calls return the same sample, and
    appends the IRAT strain as the final row.

    Args:
        seq_df: dataframe with a 'sequence' column of per-position arrays.
        sample_size: number of sequences to draw (without replacement).
        IRAT_strain: the IRAT sequence, as a string or any iterable of
            single characters.

    Returns:
        ``np.array`` holding the sampled sequences followed by the IRAT strain.
    """
    sampled = seq_df['sequence'].sample(sample_size, random_state=42)
    rows = list(sampled.values)
    # the IRAT strain always goes last
    rows.append(np.array(list(IRAT_strain)))
    return np.array(rows)


def train_save_qnet(name, seq_arr, num_nuc):
    """Fit a Qnet on ``seq_arr`` and save it as ``QNET_PATH + name + '.joblib'``.

    Args:
        name: base file name for the saved model (without extension).
        seq_arr: array of sequences passed to ``Qnet.fit``.
        num_nuc: number of sequence positions; one feature name
            ('x0', 'x1', ...) is generated per position.

    Returns:
        None. The fitted model is written to disk.
    """
    out_path = QNET_PATH + name + '.joblib'

    # Make sure the output folder exists *before* the fit, so a missing
    # directory is not discovered only after training has finished.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    myqnet = Qnet(feature_names=['x' + str(i) for i in np.arange(num_nuc)], n_jobs=1)
    myqnet.fit(seq_arr)
    save_qnet(myqnet, out_path)

In [33]:
# Record, for every IRAT row, how many GISAID sequences are usable for the
# HA and NA Qnets (capped at 1000). Rows of subtype H10N8 get the sentinel -1.
ha_sample = []
na_sample = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    if VIRUS_TYPE != 'h10n8':
        ha_df = parse_fasta(f'{GISAID_PATH}{VIRUS_TYPE}_ha.fasta', HA_TRUNC)
        na_df = parse_fasta(f'{GISAID_PATH}{VIRUS_TYPE}_na.fasta', NA_TRUNC)
        ha_count = min(len(ha_df), 1000)
        na_count = min(len(na_df), 1000)
    else:
        # no FASTA files are read for this subtype
        ha_count = na_count = -1
    ha_sample.append(ha_count)
    na_sample.append(na_count)

df['HA Qnet Sample'] = ha_sample
df['NA Qnet Sample'] = na_sample

  0%|          | 0/23 [00:00<?, ?it/s]

In [25]:
# Train and save one HA and one NA Qnet per subtype. A Qnet is only trained
# when its .joblib file is missing, so the cell can be re-run safely.
for i in trange(len(df)):
    # rows flagged with -1 have no GISAID population to train on
    if df['HA Qnet Sample'].iloc[i] == -1 or df['NA Qnet Sample'].iloc[i] == -1:
        continue

    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    ha_irat_seq = df['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df['NA Sequence'].iloc[i][:NA_TRUNC]

    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue

    # Check each model separately: if only one of the two files exists
    # (e.g. an earlier run was interrupted), the missing one is still trained.
    ha_qnet_missing = not os.path.exists(QNET_PATH + VIRUS_TYPE + '_ha.joblib')
    na_qnet_missing = not os.path.exists(QNET_PATH + VIRUS_TYPE + '_na.joblib')
    if not (ha_qnet_missing or na_qnet_missing):
        continue

    # parse only once we know there is something to train
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)

    # both populations need at least 30 sequences, otherwise train neither
    if len(ha_df) < 30 or len(na_df) < 30:
        continue

    if ha_qnet_missing:
        ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
        train_save_qnet(VIRUS_TYPE + '_ha', ha_arr, HA_TRUNC)
    if na_qnet_missing:
        na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
        train_save_qnet(VIRUS_TYPE + '_na', na_arr, NA_TRUNC)

  0%|          | 0/23 [00:00<?, ?it/s]

## Average Qdistance
- Compute average qdistance between IRAT strain and the rest of the strains

In [53]:
# Average q-distance from each IRAT strain to the sampled strains of its
# subtype, separately for HA and NA. Rows that cannot be scored keep the
# sentinel -1 in every output column.
avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []

for i in trange(len(df)):
    VIRUS_TYPE = df['Virus Type'].iloc[i].lower()
    ha_qnet_path = QNET_PATH + VIRUS_TYPE + '_ha.joblib'
    na_qnet_path = QNET_PATH + VIRUS_TYPE + '_na.joblib'

    # skip if a Qnet doesn't exist or if one of the sequences doesn't exist
    # (both the HA *and* the NA sequence are checked)
    has_qnets = os.path.exists(ha_qnet_path) and os.path.exists(na_qnet_path)
    has_seqs = df['HA Sequence'].iloc[i] != '-1' and df['NA Sequence'].iloc[i] != '-1'
    if not (has_qnets and has_seqs):
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(-1)
        avg_qdists_both.append(-1)
        continue

    # load qnets
    ha_qnet = load_qnet(ha_qnet_path)
    na_qnet = load_qnet(na_qnet_path)

    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + VIRUS_TYPE + "_na.fasta", NA_TRUNC)
    # NOTE: sequence_array appends the IRAT strain itself as the last row, so
    # each average below includes the strain's distance to itself.
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)

    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for ha_seq in ha_arr:
        ha_qdist_sum += qdistance(ha_irat_seq, ha_seq, ha_qnet, ha_qnet)
    for na_seq in na_arr:
        na_qdist_sum += qdistance(na_irat_seq, na_seq, na_qnet, na_qnet)

    # compute qdistance averages ("both" pools HA and NA distances together)
    avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
    avg_qdists_na.append(na_qdist_sum/len(na_arr))
    avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))


df['HA Average Qdistance'] = avg_qdists_ha
df['NA Average Qdistance'] = avg_qdists_na
df['Both Average Qdistance'] = avg_qdists_both
# geometric mean column; unscored rows keep the -1 sentinel, because
# sqrt((-1) * (-1)) would otherwise show up as a valid-looking 1.0
unscored = (df['HA Average Qdistance'] == -1) | (df['NA Average Qdistance'] == -1)
df['Geometric Mean'] = np.sqrt(df['HA Average Qdistance']*df['NA Average Qdistance']).mask(unscored, -1)

df = df.sort_values(by = 'Potential Emergence Estimate', ascending = False)
df

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000,1000,0.059891,0.041687,0.050789,0.049967
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000,1000,0.009133,0.069184,0.039159,0.025137
18,A/Hong Kong/125/2017,H7N9,May 2017,6.5,7.5,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,1000,1000,0.00919,0.004615,0.006903,0.006513
19,A/Shanghai/02/2013,H7N9,Apr 2016,6.4,7.2,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAIIIGAIAVLIGMANLGLNIGLHLKPGCNCSHS...,1000,1000,0.003082,0.004365,0.00374,0.003668
21,A/Anhui-Lujiang/39/2018,H9N2,Jul 2019,6.2,5.9,Moderate,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,58,58,0.015745,0.046728,0.031236,0.027124
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000,1000,0.017557,0.018402,0.01798,0.017975
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,37,37,0.203827,0.047728,0.125777,0.098632
20,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,58,58,0.047339,0.465388,0.256364,0.148429
10,A/Sichuan/06681/2021,H5N6,Oct 2021,5.3,6.3,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,46,46,0.344316,0.059989,0.202152,0.143719
8,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,48,45,0.132348,0.041099,0.088164,0.073752


In [60]:
# Write the scored table to the results folder (created if it does not exist yet);
# the dataframe index is not written
results_csv = 'results/irat_average_qdistances_current.csv'
os.makedirs('results', exist_ok=True)
df.to_csv(results_csv, index=False)

In [66]:
# Reload the saved results with the first CSV column as the index and keep
# only the columns wanted in the summary table
df_1 = pd.read_csv('results/irat_average_qdistances_current.csv', index_col=0)
df_1 = df_1.drop(columns=['Summary Risk Score Category', 'HA Sequence', 'NA Sequence', 'Both Average Qdistance'])
# optional LaTeX export of the summary table:
# textable(df_1,tabname='results/irat_average_qdistances_current.tex', FORMAT='%1.4f')

## A/Jiangxi-Donghu/346/2013
- For each Qnet trained for the other subtypes, compute this strain's average q-distance to that subtype's sampled strains, separately for NA and HA; then average these values across all Qnets
- Take the geometric mean of the resulting NA and HA averages

In [23]:
# Reload the raw IRAT table (original row order) and the previously saved results
df = pd.read_csv(IRAT_PATH + 'irat_data.csv')
df_filled = pd.read_csv('results/irat_average_qdistances_current.csv')
# HA / NA sequence of the strain at position 22 of the raw table, truncated and
# split into one character per position.
# NOTE(review): presumably row 22 is A/Jiangxi-Donghu/346/2013 - the index is
# hard-coded, so confirm it whenever irat_data.csv changes.
jiangxi_ha = np.array(list(df['HA Sequence'].iloc[22][:HA_TRUNC]))
jiangxi_na = np.array(list(df['NA Sequence'].iloc[22][:NA_TRUNC]))

In [None]:
# Score the Jiangxi strain against every subtype's Qnet and population:
# one HA, NA and pooled average q-distance per subtype.
jiangxi_qnet_used = []
jiangxi_ha_risk = []
jiangxi_na_risk = []
jiangxi_both_risk = []

STRAINS = ['h1n1', 'h1n2', 'h3n2', 'h5n1', 'h5n2', 'h5n6',
           'h5n8', 'h7n7', 'h7n8', 'h7n9', 'h9n2']

for STRAIN in tqdm(STRAINS):
    # Qnets trained for this subtype
    ha_qnet = load_qnet(QNET_PATH + STRAIN + '_ha.joblib')
    na_qnet = load_qnet(QNET_PATH + STRAIN + '_na.joblib')

    # sampled population of this subtype, with the Jiangxi sequence appended last
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), jiangxi_ha)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), jiangxi_na)

    # sum the q-distances, leaving out any that come back as NaN
    ha_qdist_sum = 0
    for ha_seq in ha_arr:
        qdist = qdistance(jiangxi_ha, ha_seq, ha_qnet, ha_qnet)
        if not np.isnan(qdist):
            ha_qdist_sum += qdist
    na_qdist_sum = 0
    for na_seq in na_arr:
        qdist = qdistance(jiangxi_na, na_seq, na_qnet, na_qnet)
        if not np.isnan(qdist):
            na_qdist_sum += qdist

    # NOTE(review): NaN distances are left out of the sums, but the
    # denominators below still count every sequence - confirm this is intended.
    ha_count = len(ha_arr)
    na_count = len(na_arr)
    jiangxi_qnet_used.append(STRAIN)
    jiangxi_ha_risk.append(ha_qdist_sum/ha_count)
    jiangxi_na_risk.append(na_qdist_sum/na_count)
    jiangxi_both_risk.append((ha_qdist_sum + na_qdist_sum)/(ha_count + na_count))

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
# One row per Qnet: the Jiangxi strain's HA, NA and pooled average q-distance
jiangxi_columns = {
    'Qnet Used': jiangxi_qnet_used,
    'HA Average Qdistance': jiangxi_ha_risk,
    'NA Average Qdistance': jiangxi_na_risk,
    'Both Average Qdistance': jiangxi_both_risk,
}
jiangxi_df = pd.DataFrame(jiangxi_columns)
# geometric mean of the HA and NA averages, per Qnet
ha_avg = jiangxi_df['HA Average Qdistance']
na_avg = jiangxi_df['NA Average Qdistance']
jiangxi_df['Geometric Mean'] = np.sqrt(ha_avg * na_avg)
# jiangxi_df.to_csv('results/jiangxi_current.csv', index=False)  
# display, smallest geometric mean first
jiangxi_df.sort_values(by='Geometric Mean')

In [None]:
# Average the per-Qnet results across all Qnets and write them into row 14
# of the saved results table.
# NOTE(review): 14 is a hard-coded row label, presumably the Jiangxi row of
# the saved CSV - confirm whenever the table changes.
jiangxi_ha_mean = np.mean(jiangxi_df['HA Average Qdistance'])
jiangxi_na_mean = np.mean(jiangxi_df['NA Average Qdistance'])

df_filled.at[14, 'HA Average Qdistance'] = jiangxi_ha_mean
df_filled.at[14, 'NA Average Qdistance'] = jiangxi_na_mean
# here "both" is the plain mean of the HA and NA means
df_filled.at[14, 'Both Average Qdistance'] = (jiangxi_ha_mean + jiangxi_na_mean)/2
df_filled.at[14, 'Geometric Mean'] = np.sqrt(jiangxi_ha_mean*jiangxi_na_mean)
# save dataframe as csv
df_filled.to_csv('results/irat_average_qdistances_current.csv', index=False)
df_filled[14:15]