# Data Collection

### NOTE: data is updated as of 12/13/2023
1. Collect data on strains previously analyzed by IRAT and replicate the [IRAT table](https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant)
2. Collect all human viruses from **1/1/2010 - present (1/1/2024)** from [GISAID](https://gisaid.org/)
    1. Download viruses with complete HA and NA segments
        - Metadata: `Isolate name | Type | Gene name | Collection date | Protein Accession no. | Isolate ID | Lineage | Clade`
    2. Filter for IRAT evaluation at **original time of assessment**
    3. Filter for IRAT evaluation at **current time**

In [16]:
import os, glob, re
from datetime import date, datetime
from collections import Counter
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from Bio import SeqIO


IRAT_DIR = 'data/animal/irat/'
HUMAN_DIR = 'data/human/'
NA_TRUNC = 449
HA_TRUNC = 560


def parse_fasta(file_name):
    ''' Parses a fasta file into a dataframe

    Each record id is split on '|' and read positionally as
    name | type | segment | date | accession (any further fields are ignored).
    Only records whose type field starts with 'A_/_' and whose subtype token
    (third '_'-separated piece of the type field) has at least 4 characters
    are kept.

    Parameters:
        file_name: path (or handle) passed straight to SeqIO.parse

    Returns:
        DataFrame with columns name, subtype, segment, date, accession,
        sequence (upper-cased), HA and NA. HA / NA hold labels such as 'H5'
        and 'N1' taken from the subtype; they are NaN when the subtype does
        not contain an H<digits>N<digits> pattern.

    Raises:
        IndexError: if a record id has fewer '|'-separated fields than are read
    '''
    columns = ['name', 'subtype', 'segment', 'date', 'accession', 'sequence']
    rows = []
    # NOTE(review): record.id is used rather than record.description, so a header
    # containing whitespace would presumably be cut short - confirm against the inputs
    for record in SeqIO.parse(file_name, 'fasta'):
        metadata = record.id.split('|')
        flu_type = metadata[1]
        if not flu_type.startswith('A_/_'):
            continue
        subtype_token = flu_type.split('_')[2]
        if len(subtype_token) < 4:
            continue
        rows.append((metadata[0],
                     subtype_token,
                     metadata[2],
                     metadata[3],
                     metadata[4],
                     str(record.seq.upper())))
    df = pd.DataFrame(rows, columns=columns)
    # Capture the letter together with the number so that unmatched subtypes stay
    # NaN instead of turning into the literal strings 'Hnan' / 'Nnan'
    df[['HA', 'NA']] = df['subtype'].str.extract(r'(H\d+)(N\d+)')
    return df

## 1) IRAT Strains
- **IRAT (CDC)**: https://www.cdc.gov/flu/pandemic-resources/monitoring/irat-virus-summaries.htm#H1N2variant
    - Protein fasta for each strain is downloaded from GISAID or NCBI
    - Collect both NA and HA segments
    - Had difficulty finding `A/duck/New York/1996`; only its NA segment was available
    
#### Mean Low - Mean High

From [IRAT](https://www.cdc.gov/flu/pandemic-resources/national-strategy/risk-assessment.htm): "Since the IRAT is qualitative in nature, its scores involve some degree of subjectivity. Accordingly, subject matter experts provide a range of “acceptable” scores for each risk element by identifying a lower and upper bound they would consider acceptable from other experts scoring the same element. The mean of the lowest acceptable bound and the mean of the highest acceptable bound from each risk element are used in the weighted “emergence” or “public health impact” calculations to create the “mean-high” and “mean-low” acceptable score ranges."

In [3]:
def _fixed_length_segment(seq_df, segment, length):
    ''' Returns the first `segment` sequence in seq_df, cut or 'X'-padded to `length` ('-1' if absent)
    '''
    sequences = seq_df.loc[seq_df['segment'] == segment, 'sequence'].values
    if len(sequences) == 0:
        return '-1'
    return sequences[0][:length].ljust(length, 'X')


def add_irat_entry(df, subtype, virus_name,
                   assessment_date, risk_category,
                   emergence_risk, impact_risk,
                   emergence_low, emergence_high,
                   impact_low, impact_high):
    ''' Adds a row to the IRAT dataframe

    The strain's sequences are read from IRAT_DIR/<virus_name>.fasta, where every
    '/' in the virus name is replaced by ':' to form the file name.

    Parameters:
        df: dataframe to extend (it is not modified)
        subtype: stored as 'Virus Type'
        virus_name: stored as 'Influenza Virus'; also selects the fasta file
        assessment_date: stored as 'Date of Risk Assessment'
        risk_category: stored as 'Risk Score Category'
        emergence_risk, impact_risk: stored as 'Emergence Score' / 'Impact Score'
        emergence_low, emergence_high: stored as 'Mean Low/High Acceptable Emergence'
        impact_low, impact_high: stored as 'Mean Low/High Acceptable Impact'

    Returns:
        A new dataframe: df followed by the new row, with a fresh 0..n-1 index.
        'HA Sequence' / 'NA Sequence' are truncated to HA_TRUNC / NA_TRUNC
        characters and right-padded with 'X' to exactly that length, or are
        '-1' when the fasta file has no such segment.
    '''
    fasta_path = IRAT_DIR + virus_name.replace('/',':') + '.fasta'
    seq_df = parse_fasta(fasta_path)
    ha_seq = _fixed_length_segment(seq_df, 'HA', HA_TRUNC)
    na_seq = _fixed_length_segment(seq_df, 'NA', NA_TRUNC)
    entry_df = pd.DataFrame({'Influenza Virus':[virus_name],
                             'Virus Type':[subtype],
                             'Date of Risk Assessment':[assessment_date],
                             'Risk Score Category':[risk_category],
                             'Emergence Score':[emergence_risk],
                             'Impact Score':[impact_risk],
                             'Mean Low Acceptable Emergence':[emergence_low],
                             'Mean High Acceptable Emergence':[emergence_high],
                             'Mean Low Acceptable Impact':[impact_low],
                             'Mean High Acceptable Impact':[impact_high],
                             'HA Sequence':[ha_seq],
                             'NA Sequence':[na_seq],
                             # For a missing segment this is len('-1') == 2, not 0
                             'HA Length':[len(ha_seq)],
                             'NA Length':[len(na_seq)]})
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pd.concat([df, entry_df], ignore_index=True)

In [4]:
# IRAT assessments, one tuple per virus in add_irat_entry argument order:
# (subtype, virus name, assessment date, risk category, emergence score, impact score,
#  emergence low, emergence high, impact low, impact high)
# -1 appears to stand for a bound that was not available - confirm against the IRAT table
# NOTE(review): for A/Netherlands/219/2003 the scores (4.6, 5.8) lie outside their own
# low/high bounds (3.22-4.39, 5.99-7.22); the bounds may be transposed - verify with the source
IRAT_ENTRIES = [
    ('H1N1', 'A/swine/Shandong/1207/2016', date(2020,7,1), 'Moderate', 7.5, 6.9, 6.33, 8.65, 5.42, 8.09),
    ('H1N1', 'A/duck/New York/1996', date(2011,11,1), 'Low', 2.3, 2.4, -1, -1, -1, -1),
    ('H1N2', 'A/California/62/2018', date(2019,7,1), 'Moderate', 5.8, 5.7, 4.22, 7.16, 3.8, 7.09),
    ('H3N2', 'A/Ohio/13/2017', date(2019,7,1), 'Moderate', 6.6, 5.8, 5.01, 7.59, 4.09, 7.26),
    ('H3N2', 'A/Indiana/08/2011', date(2012,12,1), 'Moderate', 6.0, 4.5, -1, -1, -1, -1),
    ('H3N2', 'A/canine/Illinois/12191/2015', date(2016,6,1), 'Low', 3.7, 3.7, 2.81, 4.9, 2.69, 4.9),
    ('H5N1', 'A/American wigeon/South Carolina/AH0195145/2021', date(2022,3,1), 'Moderate', 4.4, 5.1, 3.28, 5.51, 3.84, 6.19),
    ('H5N1', 'A/American green-winged teal/Washington/1957050/2014', date(2015,3,1), 'Low-Moderate', 3.6, 4.1, 2.4, 4.6, 3, 5.6),
    ('H5N1', 'A/Vietnam/1203/2004', date(2011,11,1), 'Moderate', 5.2, 6.6, -1, -1, -1, -1),
    ('H5N2', 'A/Northern pintail/Washington/40964/2014', date(2015,3,1), 'Low-Moderate', 3.8, 4.1, 2.6, 5, 3, 5.7),
    ('H5N6', 'A/Sichuan/06681/2021', date(2021,10,1), 'Moderate', 5.3, 6.3, 3.88, 6.45, 5.04, 7.47),
    ('H5N6', 'A/Yunnan/14564/2015', date(2016,4,1), 'Moderate', 5.0, 6.6, 4.07, 6.18, 5.57, 7.93),
    ('H5N8', 'A/Astrakhan/3212/2020', date(2021,3,1), 'Moderate', 4.6, 5.2, 3.64, 5.82, 4.07, 6.37),
    ('H5N8', 'A/gyrfalcon/Washington/41088/2014', date(2015,3,1), 'Low-Moderate', 4.2, 4.6, 2.9, 5.3, 3.4, 5.9),
    ('H7N7', 'A/Netherlands/219/2003', date(2012,6,1), 'Moderate', 4.6, 5.8, 3.22, 4.39, 5.99, 7.22),
    ('H7N8', 'A/turkey/Indiana/1573-2/2016', date(2017,7,1), 'Low', 3.4, 3.9, 2.4, 4.26, 2.91, 4.63),
    ('H7N9', 'A/chicken/Tennessee/17-007431-3/2017', date(2017,10,1), 'Low', 3.1, 3.5, 2.2, 3.94, 2.53, 4.32),
    ('H7N9', 'A/chicken/Tennessee/17-007147-2/2017', date(2017,10,1), 'Low', 2.8, 3.5, 2.01, 3.71, 2.67, 4.39),
    ('H7N9', 'A/Hong Kong/125/2017', date(2017,5,1), 'Moderate-High', 6.5, 7.5, 5.65, 7.51, 6.74, 8.5),
    ('H7N9', 'A/Shanghai/02/2013', date(2016,4,1), 'Moderate-High', 6.4, 7.2, 5.52, 7.43, 6.41, 8.32),
    ('H9N2', 'A/Bangladesh/0994/2011', date(2014,2,1), 'Moderate', 5.6, 5.4, 4.49, 6.74, 4.41, 6.65),
    ('H9N2', 'A/Anhui-Lujiang/39/2018', date(2019,7,1), 'Moderate', 6.2, 5.9, 4.76, 7.57, 4.3, 7.3),
    ('H10N8', 'A/Jiangxi-Donghu/346/2013', date(2014,2,1), 'Moderate', 4.3, 6.0, 3.37, 5.96, 5.21, 7.24),
    ('H5N1', 'A/mink/Spain/3691-8_22VIR10586-10/2022', date(2023,4,1), 'Moderate', 5.1, 6.2, 3.96, 6.27, 4.95, 7.43),
]

# Build the table one row at a time, in the order listed above
df = pd.DataFrame()
for irat_entry in IRAT_ENTRIES:
    df = add_irat_entry(df, *irat_entry)

# Most recent assessment first, then persist for the later cells
df = df.sort_values(by=['Date of Risk Assessment'], ascending=False)
df.to_csv('data/animal/irat.csv', index=False)
# Last expression of the cell: shows the table with a clean 0..n-1 index (df itself is unchanged)
df.reset_index(drop=True)

Unnamed: 0,Influenza Virus,Virus Type,Date of Risk Assessment,Risk Score Category,Emergence Score,Impact Score,Mean Low Acceptable Emergence,Mean High Acceptable Emergence,Mean Low Acceptable Impact,Mean High Acceptable Impact,HA Sequence,NA Sequence,HA Length,NA Length
0,A/mink/Spain/3691-8_22VIR10586-10/2022,H5N1,2023-04-01,Moderate,5.1,6.2,3.96,6.27,4.95,7.43,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQRIITTGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,560,449
1,A/American wigeon/South Carolina/AH0195145/2021,H5N1,2022-03-01,Moderate,4.4,5.1,3.28,5.51,3.84,6.19,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITTIGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQP...,560,449
2,A/Sichuan/06681/2021,H5N6,2021-10-01,Moderate,5.3,6.3,3.88,6.45,5.04,7.47,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,560,449
3,A/Astrakhan/3212/2020,H5N8,2021-03-01,Moderate,4.6,5.2,3.64,5.82,4.07,6.37,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIATIGSISLGLVVFNVLLHALNIILMVLALGKSENNGICK...,560,449
4,A/swine/Shandong/1207/2016,H1N1,2020-07-01,Moderate,7.5,6.9,6.33,8.65,5.42,8.09,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,560,449
5,A/Ohio/13/2017,H3N2,2019-07-01,Moderate,6.6,5.8,5.01,7.59,4.09,7.26,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,560,449
6,A/California/62/2018,H1N2,2019-07-01,Moderate,5.8,5.7,4.22,7.16,3.8,7.09,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,560,449
7,A/Anhui-Lujiang/39/2018,H9N2,2019-07-01,Moderate,6.2,5.9,4.76,7.57,4.3,7.3,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,560,449
8,A/chicken/Tennessee/17-007431-3/2017,H7N9,2017-10-01,Low,3.1,3.5,2.2,3.94,2.53,4.32,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,560,449
9,A/chicken/Tennessee/17-007147-2/2017,H7N9,2017-10-01,Low,2.8,3.5,2.01,3.71,2.67,4.39,MNTQILALIACMLIGAKGDKICLGHHAVANGTKVNTLTERGIEVVN...,MNPNQKILCTSATAIVIGTIAVLIGIANLGLNIGLHLKPNCNCSNS...,560,449


## 2) Human Sequences
- Download all human sequences from **1/1/2010 - present (1/1/2024)**
- Collect all complete HA and NA segments from [GISAID](https://gisaid.org/)

In [17]:
def filter_by_date_range(df, date_column, start_date, end_date):
    ''' Filters a DataFrame by a date range

    Parameters:
        df: dataframe to filter; it is left untouched
        date_column: name of the column holding the dates (anything pd.to_datetime accepts)
        start_date, end_date: inclusive bounds, compared against the parsed dates

    Returns:
        A new dataframe with the rows whose date lies in [start_date, end_date],
        sorted from newest to oldest, with date_column converted to datetime.
        Row index labels of the input are preserved.
    '''
    # Parse into a separate Series so the caller's column is not silently rewritten
    parsed_dates = pd.to_datetime(df[date_column])
    in_range = (parsed_dates >= start_date) & (parsed_dates <= end_date)
    # Explicit copy: assigning to / sorting a bare slice triggers SettingWithCopy behaviour
    filtered_df = df.loc[in_range].copy()
    filtered_df[date_column] = parsed_dates.loc[in_range]
    return filtered_df.sort_values(by=[date_column], ascending=False)

In [18]:
# Compile all human sequences
# glob returns matches in arbitrary order; sort so the compiled table is reproducible
fasta_files = sorted(glob.glob(os.path.join(HUMAN_DIR + 'gisaid', '*.fasta')))
if not fasta_files:
    raise FileNotFoundError('No fasta files found in ' + HUMAN_DIR + 'gisaid')

# Parse every file first and concatenate once: growing a dataframe inside the loop
# re-copies all previously collected rows on each iteration
human = pd.concat([parse_fasta(fasta_file) for fasta_file in fasta_files],
                  ignore_index=True)

# Sort by date and save ('date' is still the raw header string here, so this is a text sort)
human = human.sort_values(by=['date'])
os.makedirs(HUMAN_DIR + 'gisaid/', exist_ok=True)
human.to_csv(HUMAN_DIR + 'gisaid/human.csv', index=False)

### a) Prepare human sequences for each IRAT sequence

1. Filter to the one year leading up to (and including) the IRAT risk assessment date
2. Save to `data/human/irat/<irat_name>.csv`

In [6]:
# Filter human sequences for each IRAT sequence
# na_filter=False keeps empty cells as '' instead of NaN
human = pd.read_csv(HUMAN_DIR + 'gisaid/human.csv', na_filter=False)
irat = pd.read_csv('data/animal/irat.csv')

os.makedirs(HUMAN_DIR + 'irat/', exist_ok=True)
for _, row in irat.iterrows():
    # '/' cannot appear in a file name, so the strain name is stored with ':' instead
    virus_name = row['Influenza Virus'].replace('/',':')
    end = datetime.strptime(row['Date of Risk Assessment'], '%Y-%m-%d').date()
    # Window is the year ending on the assessment date
    try:
        start = end.replace(year=end.year - 1)
    except ValueError:
        # 29 February has no counterpart in the previous year; fall back to the 28th
        start = end.replace(year=end.year - 1, day=28)
    filtered = filter_by_date_range(human, 'date', str(start), str(end))
    filtered.to_csv(HUMAN_DIR + 'irat/' + virus_name + '.csv', index=False)
    # Summary: total rows, then HA subtypes among HA rows and NA subtypes among NA rows
    print(virus_name)
    print('\t', len(filtered))
    print('\t', Counter(filtered[filtered['segment'] == 'HA']['HA']))
    print('\t', Counter(filtered[filtered['segment'] == 'NA']['NA']))

A:mink:Spain:3691-8_22VIR10586-10:2022
	 83126
	 Counter({'H3': 27966, 'H1': 13580, 'H5': 9})
	 Counter({'N2': 27981, 'N1': 13580, 'N8': 9, 'N6': 1})
A:American wigeon:South Carolina:AH0195145:2021
	 21474
	 Counter({'H3': 9785, 'H1': 928, 'H5': 11, 'H9': 6, 'H10': 2})
	 Counter({'N2': 9806, 'N1': 923, 'N6': 11, 'N3': 2})
A:Sichuan:06681:2021
	 3404
	 Counter({'H3': 1140, 'H1': 531, 'H5': 16, 'H9': 13, 'H10': 2})
	 Counter({'N2': 1162, 'N1': 523, 'N6': 14, 'N3': 2, 'N8': 1})
A:Astrakhan:3212:2020
	 4318
	 Counter({'H1': 1216, 'H3': 922, 'H9': 14, 'H5': 7})
	 Counter({'N1': 1213, 'N2': 940, 'N6': 5, 'N8': 1})
A:swine:Shandong:1207:2016
	 31187
	 Counter({'H1': 8923, 'H3': 6660, 'H9': 8})
	 Counter({'N1': 8922, 'N2': 6674})
A:Ohio:13:2017
	 47593
	 Counter({'H3': 12633, 'H1': 11121, 'H9': 26, 'H5': 10, 'H7': 8})
	 Counter({'N2': 12670, 'N1': 11110, 'N9': 8, 'N6': 7})
A:California:62:2018
	 47593
	 Counter({'H3': 12633, 'H1': 11121, 'H9': 26, 'H5': 10, 'H7': 8})
	 Counter({'N2': 12670, 'N