# IRAT Data Collection
- Collect data on strains previously analyzed by IRAT
- Replicate the [table](https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant)

In [4]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from Bio import SeqIO

## Data Sources
- **IRAT (CDC)**: https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
    - Protein fasta for each strain is downloaded from GISAID or NCBI
    - Collect both NA and HA segments
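    - File name is "VIRUS_NAME_na.fasta" or "VIRUS_NAME_ha.fasta", where VIRUS_NAME is the strain name with every "/" replaced by ":" (e.g. "A:Ohio:13:2017_ha.fasta")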
    - Could not find an HA sequence for 'A/duck/New York/1996'; only the NA segment is available
- **GISAID**: https://platform.epicov.org/epi3/cfrontend#586f5f
    - Collect strains one year leading up to month of analysis
    - Collect both NA and HA segments
    - For example, the "A/swine/Shandong/1207/2016" strain was assessed by IRAT in July 2020, so we use human H1N1 strains circulating from July 1, 2019 through June 30, 2020
    - File name is "VIRUS_NAME_na.fasta" or "VIRUS_NAME_ha.fasta", where VIRUS_NAME is the name of the IRAT strain being assessed, with every "/" replaced by ":"
    - For the following subtypes, only the upper bound of the date range is applied (no lower bound) because the sample size is small
        - H1N2
        - H5N1
        - H5N6
        - H7N7
        - H9N2
    - No human strains are available for the following subtypes
        - H5N2
        - H5N8
        - H7N8
        - H10N8

In [5]:
# Directories holding the downloaded FASTA files (relative to the working
# directory). Each path ends with '/' because file names are appended to it
# by plain string concatenation.
# NCBI_PATH is not referenced by the code visible in this notebook section.
NCBI_PATH = 'raw_data/ncbi/'
# Read by add_irat_entry to count sequences for the Qnet sample sizes.
GISAID_PATH = 'raw_data/gisaid/'
# Read by add_irat_entry to get the HA/NA sequence of each IRAT strain.
IRAT_PATH = 'raw_data/irat_sequences/'

In [6]:
# input: fasta file name
# output: dataframe with one row per record (columns 'name' and 'sequence')
def parse_fasta(file_name):
    """Read a FASTA file into a two-column DataFrame.

    'name' holds the part of each record id before the first '|';
    'sequence' holds the record's sequence as an upper-case string.
    """
    records = list(SeqIO.parse(file_name, 'fasta'))
    names = [rec.id.split('|')[0] for rec in records]
    sequences = [str(rec.seq.upper()) for rec in records]
    return pd.DataFrame({'name': names, 'sequence': sequences})


# inputs: table so far + IRAT virus features
# output: new table with one extra row; HA and NA sequences are parsed from
#         IRAT_PATH, Qnet sample sizes are counted from GISAID_PATH
def add_irat_entry(df, virus_type, virus_name, assessment_date, emergence_risk, impact_risk, risk_category):
    """Return a copy of ``df`` with one IRAT-assessed virus appended.

    Parameters
    ----------
    df : DataFrame holding the entries collected so far (not modified).
    virus_type : subtype label, stored in 'Virus Type'.
    virus_name : strain name; also used to derive the FASTA file names.
    assessment_date : stored in 'Dates of Risk Assessment'.
    emergence_risk : stored in 'Potential Emergence Estimate'.
    impact_risk : stored in 'Potential Impact Estimate'.
    risk_category : stored in 'Summary Risk Score Category'.

    Missing data is recorded with the sentinel -1: a sequence column is -1
    when the corresponding file does not exist under IRAT_PATH (or contains
    no records), and a sample-size column is -1 when the corresponding file
    does not exist under GISAID_PATH.
    """
    # File names use ':' in place of '/', e.g. 'A:Ohio:13:2017_ha.fasta'.
    ha_file = virus_name.replace('/', ':') + '_ha.fasta'
    na_file = virus_name.replace('/', ':') + '_na.fasta'

    # add sequences; if a file holds several records, the last one is kept
    ha_seq = -1
    na_seq = -1
    if os.path.exists(IRAT_PATH + ha_file):
        for record in SeqIO.parse(IRAT_PATH + ha_file, 'fasta'):
            ha_seq = str(record.seq.upper())
    if os.path.exists(IRAT_PATH + na_file):
        for record in SeqIO.parse(IRAT_PATH + na_file, 'fasta'):
            na_seq = str(record.seq.upper())

    # add qnet sample sizes: number of available sequences, capped at 1000
    ha_qnet_sample = -1
    na_qnet_sample = -1
    if os.path.exists(GISAID_PATH + ha_file):
        ha_qnet_sample = min(1000, len(parse_fasta(GISAID_PATH + ha_file)))
    if os.path.exists(GISAID_PATH + na_file):
        na_qnet_sample = min(1000, len(parse_fasta(GISAID_PATH + na_file)))

    # add entry
    entry_df = pd.DataFrame({'Influenza Virus':[virus_name],
                             'Virus Type':[virus_type],
                             'Dates of Risk Assessment':[assessment_date],
                             'Potential Emergence Estimate':[emergence_risk],
                             'Potential Impact Estimate':[impact_risk],
                             'Summary Risk Score Category':[risk_category],
                             'HA Sequence':[ha_seq],
                             'NA Sequence':[na_seq],
                             'HA Qnet Sample':[ha_qnet_sample],
                             'NA Qnet Sample':[na_qnet_sample]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # documented replacement and gives the same row-wise result.
    return pd.concat([df, entry_df], ignore_index=True)

In [7]:
# Start from an empty table that already has the final column layout.
df = pd.DataFrame({'Influenza Virus':[],
                   'Virus Type':[],
                   'Dates of Risk Assessment':[],
                   'Potential Emergence Estimate':[],
                   'Potential Impact Estimate':[],
                   'Summary Risk Score Category':[],
                   'HA Sequence':[],
                   'NA Sequence':[],
                   'HA Qnet Sample':[],
                   'NA Qnet Sample':[]})

# One tuple per IRAT-assessed virus, in the argument order of add_irat_entry:
# (virus type, virus name, assessment date,
#  emergence estimate, impact estimate, risk category)
IRAT_ENTRIES = [
    ('H1N1', 'A/swine/Shandong/1207/2016', 'Jul 2020', 7.5, 6.9, 'Moderate'),
    ('H1N1', 'A/duck/New York/1996', 'Nov 2011', 2.3, 2.4, 'Low'),
    ('H1N2', 'A/California/62/2018', 'Jul 2019', 5.8, 5.7, 'Moderate'),
    ('H3N2', 'A/Ohio/13/2017', 'Jul 2019', 6.6, 5.8, 'Moderate'),
    ('H3N2', 'A/Indiana/08/2011', 'Dec 2012', 6.0, 4.5, 'Moderate'),
    ('H3N2', 'A/canine/Illinois/12191/2015', 'Jun 2016', 3.7, 3.7, 'Low'),
    ('H5N1', 'A/American wigeon/South Carolina/AH0195145/2021', 'Mar 2022', 4.4, 5.1, 'Moderate'),
    ('H5N1', 'A/American green-winged teal/Washington/1957050/2014', 'Mar 2015', 3.6, 4.1, 'Low-Moderate'),
    ('H5N1', 'A/Vietnam/1203/2004', 'Nov 2011', 5.2, 6.6, 'Moderate'),
    ('H5N2', 'A/Northern pintail/Washington/40964/2014', 'Mar 2015', 3.8, 4.1, 'Low-Moderate'),
    ('H5N6', 'A/Sichuan/06681/2021', 'Oct 2021', 5.3, 6.3, 'Moderate'),
    ('H5N6', 'A/Yunnan/14564/2015', 'Apr 2016', 5.0, 6.6, 'Moderate'),
    ('H5N8', 'A/Astrakhan/3212/2020', 'Mar 2021', 4.6, 5.2, 'Moderate'),
    ('H5N8', 'A/gyrfalcon/Washington/41088/2014', 'Mar 2015', 4.2, 4.6, 'Low-Moderate'),
    ('H7N7', 'A/Netherlands/219/2003', 'Jun 2012', 4.6, 5.8, 'Moderate'),
    ('H7N8', 'A/turkey/Indiana/1573-2/2016', 'Jul 2017', 3.4, 3.9, 'Low'),
    ('H7N9', 'A/chicken/Tennessee/17-007431-3/2017', 'Oct 2017', 3.1, 3.5, 'Low'),
    ('H7N9', 'A/chicken/Tennessee/17-007147-2/2017', 'Oct 2017', 2.8, 3.5, 'Low'),
    ('H7N9', 'A/Hong Kong/125/2017', 'May 2017', 6.5, 7.5, 'Moderate-High'),
    ('H7N9', 'A/Shanghai/02/2013', 'Apr 2016', 6.4, 7.2, 'Moderate-High'),
    ('H9N2', 'A/Bangladesh/0994/2011', 'Feb 2014', 5.6, 5.4, 'Moderate'),
    ('H9N2', 'A/Anhui-Lujiang/39/2018', 'Jul 2019', 6.2, 5.9, 'Moderate'),
    ('H10N8', 'A/Jiangxi-Donghu/346/2013', 'Feb 2014', 4.3, 6.0, 'Moderate'),
]

# Append the entries in order; add_irat_entry returns a new table each time.
for irat_entry in IRAT_ENTRIES:
    df = add_irat_entry(df, *irat_entry)

# Bare expression so the notebook renders the table as the cell output.
df

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000.0,1000.0
1,A/duck/New York/1996,H1N1,Nov 2011,2.3,2.4,Low,-1,MNPNQKIITIGSICMAIGIISLVLQIGNIISIWVSHSIQTGSQSHP...,1000.0,1000.0
2,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,55.0,55.0
3,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000.0,1000.0
4,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000.0,1000.0
5,A/canine/Illinois/12191/2015,H3N2,Jun 2016,3.7,3.7,Low,MKTVIALSYIFCLAFGQNLLGNENNAATLCLGHHAVPNGTMVKTIT...,MNPNQKIIAIGSVSLTIATVCFLLQIAILATTVTLYFKQNECNIPS...,1000.0,1000.0
6,A/American wigeon/South Carolina/AH0195145/2021,H5N1,Mar 2022,4.4,5.1,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITTIGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,335.0,323.0
7,A/American green-winged teal/Washington/195705...,H5N1,Mar 2015,3.6,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVIGIISLVLQIGNIISIWVSHSIQTGSQNHP...,326.0,314.0
8,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,258.0,246.0
9,A/Northern pintail/Washington/40964/2014,H5N2,Mar 2015,3.8,4.1,Low-Moderate,MEKIVLLLAVISLVKSDQICIGYHANNSTKQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSVSLTIATVCFLMQIAILATTVTLHFKQNECSIPP...,-1.0,-1.0


In [8]:
# Save the assembled table as CSV.
# The output directory is created first; exist_ok=True makes re-running the
# cell safe when 'results' already exists. index=False leaves the row index
# out of the written file.
os.makedirs('results', exist_ok=True)  
df.to_csv('results/irat_data.csv', index=False)  