# IRAT Data Collection - All Sequences
**H1N1, H1N2, and H3N2 only**
- Collect data on strains previously analyzed by IRAT
- Replicate the [table](https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant) for H1N1, H1N2, and H3N2 only

In [None]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from Bio import SeqIO

## Data Sources
- **IRAT (CDC)**: https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
    - Protein fasta for each strain is downloaded from GISAID or NCBI
    - Collect both NA and HA segments
    - File name is "VIRUS_NAME_na.fasta" or "VIRUS_NAME_ha.fasta", with every "/" in the virus name replaced by ":"
    - The HA sequence for 'A/duck/New York/1996' could not be found; only the NA segment is available
- **NCBI**: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
    - Collect all strains of that variety (H1N1, H1N2, and H3N2 only)
    - For H1N2, the HA is similar to that of H1N1 and the NA to that of H3N2, so use H1N1 HA and H3N2 NA data in tandem
    - Collect both NA and HA segments
    - File name is "VIRUS_NAME_na.fasta" or "VIRUS_NAME_ha.fasta", where VIRUS_NAME is the lowercase subtype (e.g. "h1n1_ha.fasta")

In [5]:
# Directories holding the downloaded FASTA files. Each path ends with '/'
# because file names are appended to it by plain string concatenation.
# GISAID_PATH is not referenced by the code visible in this notebook.
NCBI_PATH = 'raw_data/ncbi/'
GISAID_PATH = 'raw_data/gisaid/'
IRAT_PATH = 'raw_data/irat_sequences/'

# Truncation lengths handed to parse_fasta for NA and HA sequences: records
# shorter than the length are skipped, longer ones are cut to this length.
NA_TRUNC = 469
HA_TRUNC = 565

In [6]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a DataFrame of fixed-length sequences.

    Records whose sequence is shorter than ``trunc`` are dropped. Every
    remaining sequence is cut to its first ``trunc`` symbols and upper-cased.

    Args:
        file_name: FASTA file to read (passed straight to ``SeqIO.parse``).
        trunc: required minimum length, and the length every kept sequence
            is truncated to.

    Returns:
        DataFrame with one row per kept record and two columns:
        ``name`` (the record id up to the first ``'|'``) and ``sequence``
        (the truncated, upper-cased sequence wrapped in ``np.array``).
    """
    rows = [
        (rec.id.split('|')[0], np.array(rec.seq[:trunc].upper()))
        for rec in SeqIO.parse(file_name, 'fasta')
        if len(rec.seq) >= trunc
    ]
    names = [name for name, _ in rows]
    sequences = [sequence for _, sequence in rows]
    return pd.DataFrame({'name': names, 'sequence': sequences})


def add_irat_entry(df, virus_type, virus_name, assessment_date, emergence_risk, impact_risk, risk_category):
    """Return ``df`` with one additional row describing an IRAT-assessed virus.

    The HA and NA protein sequences are read from ``IRAT_PATH`` and the Qnet
    sample sizes are counted from the NCBI FASTA files under ``NCBI_PATH``.

    Args:
        df: table to extend; it is not modified in place.
        virus_type: subtype string. Sample sizes are only filled in for
            'H1N1', 'H3N2' and 'H1N2'.
        virus_name: strain name, e.g. 'A/Ohio/13/2017'. Its '/' characters
            are replaced by ':' to build the FASTA file names.
        assessment_date: value for 'Dates of Risk Assessment'.
        emergence_risk: value for 'Potential Emergence Estimate'.
        impact_risk: value for 'Potential Impact Estimate'.
        risk_category: value for 'Summary Risk Score Category'.

    Returns:
        A new DataFrame: the rows of ``df`` followed by the new entry, with
        the index renumbered. A sequence whose FASTA file does not exist is
        stored as -1, and both sample sizes are -1 for any other
        ``virus_type``.
    """
    # add sequences
    file_stem = virus_name.replace('/', ':')
    sequences = {}
    for segment in ('ha', 'na'):
        fasta_path = IRAT_PATH + file_stem + '_' + segment + '.fasta'
        sequence = -1  # sentinel: no FASTA file for this segment
        if os.path.exists(fasta_path):
            # If the file holds several records, the last one is kept.
            for record in SeqIO.parse(fasta_path, 'fasta'):
                sequence = str(record.seq.upper())
        sequences[segment] = sequence

    # add qnet sample sizes
    # H1N2 has no files of its own: it is counted from the H1N1 HA file and
    # the H3N2 NA file.
    qnet_files = {
        'H1N1': ('h1n1_ha.fasta', 'h1n1_na.fasta'),
        'H3N2': ('h3n2_ha.fasta', 'h3n2_na.fasta'),
        'H1N2': ('h1n1_ha.fasta', 'h3n2_na.fasta'),
    }
    ha_qnet_sample = -1
    na_qnet_sample = -1
    if virus_type in qnet_files:
        ha_file, na_file = qnet_files[virus_type]
        ha_qnet_sample = len(parse_fasta(NCBI_PATH + ha_file, HA_TRUNC))
        na_qnet_sample = len(parse_fasta(NCBI_PATH + na_file, NA_TRUNC))

    # add entry
    entry_df = pd.DataFrame({'Influenza Virus': [virus_name],
                             'Virus Type': [virus_type],
                             'Dates of Risk Assessment': [assessment_date],
                             'Potential Emergence Estimate': [emergence_risk],
                             'Potential Impact Estimate': [impact_risk],
                             'Summary Risk Score Category': [risk_category],
                             'HA Sequence': [sequences['ha']],
                             'NA Sequence': [sequences['na']],
                             'HA Qnet Sample': [ha_qnet_sample],
                             'NA Qnet Sample': [na_qnet_sample]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, entry_df], ignore_index=True)

In [7]:
# Start from an empty table with the IRAT summary columns plus the sequence
# and Qnet sample-size columns filled in by add_irat_entry.
df = pd.DataFrame({'Influenza Virus':[],
                   'Virus Type':[],
                   'Dates of Risk Assessment':[],
                   'Potential Emergence Estimate':[],
                   'Potential Impact Estimate':[],
                   'Summary Risk Score Category':[],
                   'HA Sequence':[],
                   'NA Sequence':[],
                   'HA Qnet Sample':[],
                   'NA Qnet Sample':[]})

# One tuple per assessed strain, in add_irat_entry argument order:
# (virus type, virus name, assessment date, emergence estimate,
#  impact estimate, summary risk category)
irat_entries = [
    ('H1N1', 'A/swine/Shandong/1207/2016', 'Jul 2020', 7.5, 6.9, 'Moderate'),
    ('H1N1', 'A/duck/New York/1996', 'Nov 2011', 2.3, 2.4, 'Low'),
    ('H1N2', 'A/California/62/2018', 'Jul 2019', 5.8, 5.7, 'Moderate'),
    ('H3N2', 'A/Ohio/13/2017', 'Jul 2019', 6.6, 5.8, 'Moderate'),
    ('H3N2', 'A/Indiana/08/2011', 'Dec 2012', 6.0, 4.5, 'Moderate'),
    ('H3N2', 'A/canine/Illinois/12191/2015', 'Jun 2016', 3.7, 3.7, 'Low'),
]
for entry in irat_entries:
    df = add_irat_entry(df, *entry)
df

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,17402.0,14687.0
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,17402.0,14687.0
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,17402.0,14057.0
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,17423.0,14057.0
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,17423.0,14057.0
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,17423.0,14057.0


In [8]:
# Write the assembled table to CSV without the index column, creating the
# output folder first if it does not exist yet.
output_csv = 'results/irat_data_all_sequences.csv'
os.makedirs('results', exist_ok=True)
df.to_csv(output_csv, index=False)