# IRAT vs. Qnet Comparison
- Compares risk assesment from IRAT and risk assessment using Qnet q-distance
- Use both NA and HA segments
- For each strain previously analyzed by IRAT
    - Collect strains one year leading up to month of analysis
    - For example, the "A/swine/Shandong/1207/2016" strain was assessed by IRAT in July 2020, so we will use human H1N1 strains circulating between July 1, 2019 through June 30, 2020
    - Note: had difficulty finding 'A/duck/New York/1996', only NA available, do not use in final results
    - For the following strains, only use upper bound of date due to small sample size
        - H1N2
        - H5N1
        - H5N6
        - H7N7
        - H9N2
    - Strains with 'Qnet Sample' = -1 have no available human strains
    - Construct a Qnet using these strains **if there are more than 30 strains in the population for both NA and HA**
    - Compute the average q-distance among the strain in question and the circulating human strains for both NA and HA
    - Average the NA and HA averages

In [1]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [2]:
NCBI_PATH = 'raw_data/ncbi/'
GISAID_PATH = 'raw_data/gisaid/'
IRAT_PATH = 'results/'

NA_TRUNC = 449
HA_TRUNC = 550

df = pd.read_csv(IRAT_PATH + 'irat_data.csv')

## Creating New Qnet
- Truncate NA at 449 amino acids, HA at 550 amino acids (to prevent too many strains from being filtered out)
- Construct a Qnet using these strains **if there are more than 30 strains in the population** after truncating to match the length of the IRAT strain
- Make sure to include IRAT sequence in training the Qnet
- Save Qnet as "VIRUS_NAME_na.joblib" or "VIRUS_NAME_ha.joblib"

In [43]:
# input: fasta file name, length to truncate each sequence
# output: dataframe of sequences
def parse_fasta(file_name, trunc):
    acc = []
    seq = []
    for record in SeqIO.parse(file_name, 'fasta'):
        if len(record.seq) < trunc:
            continue
        acc.append(record.id.split('|')[0])
        seq.append(np.array(record.seq[:trunc].upper()))
    df = pd.DataFrame({'name':acc, 'sequence':seq})
    return df


# input: dataframe of sequences, number of samples, IRAT strain
# output: array of nucleotide lists
def sequence_array(seq_df, sample_size, IRAT_strain):
    seqs = seq_df['sequence'].sample(sample_size, random_state = 42).values
    seq_lst = []
    for seq in seqs:
        seq_lst.append(seq)
    seq_lst.append(np.array(list(IRAT_strain)))
    return np.array(seq_lst)


# input: name to call qnet, array of nucleotide lists, number of nucleotides
# output: save qnet as joblib
def train_save_qnet(name, seq_arr, num_nuc):
    myqnet = Qnet(feature_names=['x'+str(i) for i in np.arange(num_nuc)],n_jobs=1)
    myqnet.fit(seq_arr)
    save_qnet(myqnet, 'qnet_models/' + name + '.joblib')

In [45]:
for i in trange(len(df)):
    if df['HA Qnet Sample'].iloc[i] == -1 or df['NA Qnet Sample'].iloc[i] == -1:
        continue
    
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    ha_irat_seq = df['HA Sequence'].iloc[i][:HA_TRUNC]
    na_irat_seq = df['NA Sequence'].iloc[i][:NA_TRUNC]
    
    # skip if one or both HA and NA sequences are not available
    if ha_irat_seq == '-1' or na_irat_seq == '-1':
        continue
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    
    # skip if less than 30 sequences available or qnet already exists 
    if len(ha_df) < 30 or os.path.exists('qnet_models/' + STRAIN + '_ha.joblib'):
        continue
    if len(na_df) < 30 or os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        continue
    
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    train_save_qnet(STRAIN + '_ha', ha_arr, HA_TRUNC)
    train_save_qnet(STRAIN + '_na', na_arr, NA_TRUNC)

  0%|          | 0/23 [00:00<?, ?it/s]

## Average Qdistance
- Compute average qdistance between IRAT strain and the rest of the strains

In [None]:
avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []

for i in trange(len(df)):
    STRAIN = df['Influenza Virus'].iloc[i].replace('/',':')
    
    # skip if Qnet doesn't exist
    if not os.path.exists('qnet_models/' + STRAIN + '_ha.joblib') or not os.path.exists('qnet_models/' + STRAIN + '_na.joblib'):
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(-1)
        avg_qdists_both.append(-1)
        continue
        
    # load qnets
    ha_qnet = load_qnet('qnet_models/' + STRAIN + '_ha.joblib')
    na_qnet = load_qnet('qnet_models/' + STRAIN + '_na.joblib')
    
    # access irat sequences and all sequences
    ha_irat_seq = np.array(list(df['HA Sequence'].iloc[i][:HA_TRUNC]))
    na_irat_seq = np.array(list(df['NA Sequence'].iloc[i][:NA_TRUNC]))
    ha_df = parse_fasta(GISAID_PATH + STRAIN + "_ha.fasta", HA_TRUNC)
    na_df = parse_fasta(GISAID_PATH + STRAIN + "_na.fasta", NA_TRUNC)
    ha_arr = sequence_array(ha_df, min(1000, len(ha_df)), ha_irat_seq)
    na_arr = sequence_array(na_df, min(1000, len(na_df)), na_irat_seq)
    
    # compute qdistance sum
    ha_qdist_sum = 0
    na_qdist_sum = 0
    for j in range(len(ha_arr)):
        ha_qdist_sum += qdistance(ha_irat_seq, ha_arr[j], ha_qnet, ha_qnet)
    for j in range(len(na_arr)):
        na_qdist_sum += qdistance(na_irat_seq, na_arr[j], na_qnet, na_qnet)
        
    # compute qdistance averages
    avg_qdists_ha.append(ha_qdist_sum/len(ha_arr))
    avg_qdists_na.append(na_qdist_sum/len(na_arr))
    avg_qdists_both.append((ha_qdist_sum + na_qdist_sum)/(len(ha_arr) + len(na_arr)))
    

df['HA Average Qdistance'] = avg_qdists_ha
df['NA Average Qdistance'] = avg_qdists_na
df['Both Average Qdistance'] = avg_qdists_both

In [3]:
df.sort_values(by = 'Potential Emergence Estimate')

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,1000.0,1000.0,-1.0,-1.0,-1.0
17,A/chicken/Tennessee/17-007147-2/2017,H7N9,Oct 2017,2.8,3.5,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,496.0,495.0,0.209532,0.25409,0.231788
16,A/chicken/Tennessee/17-007431-3/2017,H7N9,Oct 2017,3.1,3.5,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,496.0,495.0,0.102686,0.256855,0.179693
15,A/turkey/Indiana/1573-2/2016,H7N8,Jul 2017,3.4,3.9,Low,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNKKIITIGSVSLGLVVLNILLHIVSITVTVLVLPGNGNNGNCN...,-1.0,-1.0,-1.0,-1.0,-1.0
7,A/American green-winged teal/Washington/195705...,H5N1,Mar 2015,3.6,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVIGIISLVLQIGNIISIWVSHSIQTGSQNHP...,326.0,314.0,0.191127,0.448228,0.316856
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,1000.0,1000.0,0.060665,0.150856,0.105761
9,A/Northern pintail/Washington/40964/2014,H5N2,Mar 2015,3.8,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSVSLTIATVCFLMQIAILATTVTLHFKQNECSIPP...,-1.0,-1.0,-1.0,-1.0,-1.0
13,A/gyrfalcon/Washington/41088/2014,H5N8,Mar 2015,4.2,4.6,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIVTIGSLSLGLVVFNVLLHAVSIILTVLALGKSENNGICN...,-1.0,-1.0,-1.0,-1.0,-1.0
22,A/Jiangxi-Donghu/346/2013,H10N8,Feb 2014,4.3,6.0,Moderate,MYKIVVIIALLGAVKGLDKICLGHHAVANGTIVKTLTNEQEEVTNA...,MNPNQKIITIGSVSLGLVILNILLHIVSITVTVLVLPGNGNNESCN...,-1.0,-1.0,-1.0,-1.0,-1.0
6,A/American wigeon/South Carolina/AH0195145/2021,H5N1,Mar 2022,4.4,5.1,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITTIGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,335.0,323.0,0.17218,0.511387,0.337368


In [68]:
# save dataframe as csv
os.makedirs('results', exist_ok=True)
df.to_csv('results/irat_average_qdistances.csv', index=False)  
df.drop(columns=['HA Sequence', 'NA Sequence']).to_latex('results/irat_average_qdistances.tex', index=False)