# IRAT vs. Qnet Comparison - All Sequences
**H1N1, H1N2, and H3N2 only**
- Compares the risk assessment from IRAT with a risk assessment based on Qnet q-distance
- Use both NA and HA segments
- For each strain previously analyzed by IRAT
    - Collect all strains of that variety
    - Note: had difficulty finding 'A/duck/New York/1996', only NA available
    - Construct a Qnet using these strains
        - For H1N2, the HA is similar to that of H1N1 and the NA to that of H3N2, so use H1N1 HA for HA Qnet and H3N2 NA for NA Qnet
    - Compute the average q-distance between the strain in question and the circulating human strains, separately for NA and HA
    - Average the NA and HA averages

In [2]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet

## Data Sources
- IRAT (CDC): https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

In [2]:
# Data locations, relative to the notebook's working directory.
# NCBI FASTA files are read from NCBI_PATH as <dataset>.fasta.
NCBI_PATH = 'raw_data/ncbi/'
# NOTE(review): GISAID_PATH is not used in the cells shown here - confirm it is still needed.
GISAID_PATH = 'raw_data/gisaid/'
# Directory holding the IRAT table that is loaded into `df` below.
IRAT_PATH = 'results/'

# Dataset names, <subtype>_<segment>; each one names a FASTA file and a saved Qnet.
FILES = ['h1n1_ha', 'h1n1_na', 'h3n2_ha', 'h3n2_na']

# Sequence truncation lengths, in amino acids (NA and HA respectively).
NA_TRUNC = 469
HA_TRUNC = 565

# IRAT strains with their risk scores and HA/NA sequences.
df = pd.read_csv(IRAT_PATH + 'irat_data_all_sequences.csv')

## Creating New Qnet
- Truncate NA at 469 amino acids and HA at 565 amino acids (this is possible because the H1N1, H1N2, and H3N2 IRAT strains are at least this long, as opposed to others, which are shorter)
- Save Qnet as "h1n1_na.joblib" or "h1n1_ha.joblib" or "h3n2_na.joblib" or "h3n2_ha.joblib"

In [3]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a dataframe of fixed-length sequences.

    Parameters
    ----------
    file_name : path of the FASTA file to read.
    trunc : number of leading residues to keep from each sequence.

    Returns
    -------
    pandas.DataFrame with two columns:
        'name'     - the record id up to (not including) the first '|'
        'sequence' - the upper-cased first `trunc` residues, wrapped in np.array

    Records shorter than `trunc` are dropped.
    """
    names = []
    sequences = []
    for record in SeqIO.parse(file_name, 'fasta'):
        # only keep records long enough to be truncated to the common length
        if len(record.seq) >= trunc:
            names.append(record.id.split('|')[0])
            sequences.append(np.array(record.seq[:trunc].upper()))
    return pd.DataFrame({'name': names, 'sequence': sequences})


def sequence_array(seq_df, sample_size, IRAT_strains):
    """Build the sequence matrix used to train or query a Qnet.

    Parameters
    ----------
    seq_df : dataframe with a 'sequence' column (one array of residues per row).
    sample_size : number of rows to draw from `seq_df`; the draw uses a fixed
        seed (random_state=42), so it is reproducible.
    IRAT_strains : iterable of sequence strings appended after the sampled rows;
        each string is split into single characters.

    Returns
    -------
    np.ndarray containing the sampled sequences followed by the IRAT sequences.
    """
    sampled = seq_df['sequence'].sample(sample_size, random_state = 42).values
    rows = list(sampled)
    rows.extend(np.array(list(strain)) for strain in IRAT_strains)
    return np.array(rows)



def train_save_qnet(name, seq_arr, num_nuc):
    """Fit a Qnet on `seq_arr` and save it as qnet_models/<name>.joblib.

    Parameters
    ----------
    name : file stem for the saved model (e.g. 'h1n1_na').
    seq_arr : 2-D array of sequences, one row per sequence.
    num_nuc : number of positions per sequence; features are named
        'x0' ... 'x<num_nuc - 1>'.
    """
    # Create the output directory first so saving does not fail on a fresh checkout.
    os.makedirs('qnet_models', exist_ok=True)
    myqnet = Qnet(feature_names=['x' + str(i) for i in range(num_nuc)], n_jobs=1)
    myqnet.fit(seq_arr)
    save_qnet(myqnet, 'qnet_models/' + name + '.joblib')

In [4]:
# Show the IRAT table loaded from irat_data_all_sequences.csv
df

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,17402.0,14687.0
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,17402.0,14687.0
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,17402.0,14057.0
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,17423.0,14057.0
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,17423.0,14057.0
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,17423.0,14057.0


In [10]:
def _irat_sequences(column, rows, trunc):
    """Return the sequences in `column` of `df` at the given row positions, cut to `trunc` residues."""
    return [df[column].iloc[row][:trunc] for row in rows]


# IRAT sequences added to each Qnet's training set, by row position in `df`.
# - H1N2 (row 2) contributes its HA to the H1N1 HA set and its NA to the H3N2 NA set.
# - Row 1 has no HA sequence ('-1'), so it only appears in an NA set.
h1n1_ha_irat = _irat_sequences('HA Sequence', [0, 2], HA_TRUNC)
h3n2_ha_irat = _irat_sequences('HA Sequence', [3, 4, 5], HA_TRUNC)
h1n1_na_irat = _irat_sequences('NA Sequence', [0, 1], NA_TRUNC)
h3n2_na_irat = _irat_sequences('NA Sequence', [2, 3, 4, 5], NA_TRUNC)
irat_seq_dict = {'h1n1_ha':h1n1_ha_irat, 'h3n2_ha':h3n2_ha_irat,
                 'h1n1_na':h1n1_na_irat, 'h3n2_na':h3n2_na_irat}

# Every dataset in FILES needs IRAT sequences, otherwise the loop below would fail midway.
assert set(FILES) <= set(irat_seq_dict), "missing IRAT sequences for some datasets"

# Train one Qnet per dataset in FILES (defined in the configuration cell).
# FILES is intentionally not re-assigned here: narrowing it to the NA datasets
# left the HA Qnets untrained on a fresh run, although the q-distance cell loads them.
for FILE in tqdm(FILES):
    # dataset names end in '_na' or '_ha'; pick the matching truncation length
    TRUNC = NA_TRUNC if FILE.endswith('_na') else HA_TRUNC
    seq_df = parse_fasta(NCBI_PATH + FILE + ".fasta", TRUNC)
    # train on at most 1000 sampled sequences plus the IRAT sequences for this dataset
    seq_arr = sequence_array(seq_df, min(1000, len(seq_df)), irat_seq_dict[FILE])
    train_save_qnet(FILE, seq_arr, TRUNC)

  0%|          | 0/2 [00:00<?, ?it/s]

## Average Qdistance
- Compute the average q-distance between each IRAT strain and the rest of the strains

In [None]:
# Upper bound on the number of reference sequences sampled per dataset.
MAX_REFERENCE_SEQS = 10000

# dataset name -> (qnet, reference sequence array); reset whenever this cell runs.
_reference_cache = {}


def _load_reference(dataset, trunc):
    """Return (qnet, reference sequences) for a dataset such as 'h1n1_ha'.

    The Qnet is loaded from qnet_models/<dataset>.joblib and the reference
    sequences are sampled from the matching NCBI FASTA file. Results are cached
    per dataset so rows sharing a dataset do not reload the same files; the
    sample inside sequence_array is seeded, so caching does not change which
    sequences are used.
    """
    if dataset not in _reference_cache:
        qnet = load_qnet('qnet_models/' + dataset + '.joblib')
        ref_df = parse_fasta(NCBI_PATH + dataset + ".fasta", trunc)
        ref_arr = sequence_array(ref_df, min(MAX_REFERENCE_SEQS, len(ref_df)), [])
        _reference_cache[dataset] = (qnet, ref_arr)
    return _reference_cache[dataset]


def _average_qdistance(irat_seq, ref_arr, qnet):
    """Average q-distance from `irat_seq` to every sequence in `ref_arr`.

    Returns (average, nan_count). Each distance is computed once. NaN distances
    are left out of both the sum and the denominator, so they no longer pull
    the average toward zero. If no distance is valid, the average is NaN.
    """
    dists = np.array([qdistance(irat_seq, ref_seq, qnet, qnet) for ref_seq in ref_arr],
                     dtype=float)
    is_nan = np.isnan(dists)
    valid = dists[~is_nan]
    average = float(valid.mean()) if valid.size else float('nan')
    return average, int(is_nan.sum())


avg_qdists_ha = []
avg_qdists_na = []
avg_qdists_both = []

for i in trange(len(df)):
    row = df.iloc[i]

    # H1N2 borrows the H1N1 HA Qnet and the H3N2 NA Qnet; other subtypes use their own.
    if row['Virus Type'] == 'H1N2':
        ha_dataset, na_dataset = 'h1n1_ha', 'h3n2_na'
    else:
        subtype = row['Virus Type'].lower()
        ha_dataset, na_dataset = subtype + '_ha', subtype + '_na'

    # NA is available for every strain
    na_qnet, na_arr = _load_reference(na_dataset, NA_TRUNC)
    na_irat_seq = np.array(list(row['NA Sequence'][:NA_TRUNC]))
    na_avg, na_nan = _average_qdistance(na_irat_seq, na_arr, na_qnet)

    # '-1' marks a missing HA sequence: record -1 for the HA and combined averages
    if row['HA Sequence'] == '-1':
        print('NA NaN: ', na_nan)
        avg_qdists_ha.append(-1)
        avg_qdists_na.append(na_avg)
        avg_qdists_both.append(-1)
        continue

    ha_qnet, ha_arr = _load_reference(ha_dataset, HA_TRUNC)
    ha_irat_seq = np.array(list(row['HA Sequence'][:HA_TRUNC]))
    ha_avg, ha_nan = _average_qdistance(ha_irat_seq, ha_arr, ha_qnet)
    print('HA NaN: ', ha_nan)
    print('NA NaN: ', na_nan)

    avg_qdists_ha.append(ha_avg)
    avg_qdists_na.append(na_avg)
    # Average of the two averages, independent of how many references each segment has.
    avg_qdists_both.append((ha_avg + na_avg) / 2)


df['HA Average Qdistance'] = avg_qdists_ha
df['NA Average Qdistance'] = avg_qdists_na
df['Both Average Qdistance'] = avg_qdists_both

In [12]:
# Show the table ordered by IRAT emergence score (ascending) for comparison with the q-distances
df.sort_values('Potential Emergence Estimate')

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,17402.0,14687.0,-1.0,0.037525,-1.0
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,17423.0,14057.0,0.017681,0.041427,0.029554
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,17402.0,14057.0,0.379929,0.029437,0.204683
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,17423.0,14057.0,0.008264,0.021427,0.014846
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,17423.0,14057.0,0.006036,0.022038,0.014037
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,17402.0,14687.0,0.088669,0.043072,0.065871


In [14]:
# Write the results: the full table as CSV, and a LaTeX table without the sequence columns
os.makedirs('results', exist_ok=True)
df.to_csv('results/irat_average_qdistances_all_sequences.csv', index=False)
latex_table = df.drop(['HA Sequence', 'NA Sequence'], axis=1)
latex_table.to_latex('results/irat_average_qdistances_all_sequences.tex', index=False)