# Data Collection

In [2]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from domseq import DomSeq


# Folders for the raw FASTA downloads (NCBI, GISAID) and for the merged CSV output.
# The trailing slash is kept on each because file names are appended to these
# strings by plain concatenation.
NCBI_DIR, GISAID_DIR, DATA_DIR = 'data/ncbi/', 'data/gisaid/', 'data/merged/'

# Truncation lengths passed to DomSeq(seq_trunc_length=...); each is
# 2 less than the official length of the segment.
HA_TRUNC = 567 - 2  # official HA length is 567
NA_TRUNC = 470 - 2  # official NA length is 470

## Downloading Data
**Sources: [GISAID](https://platform.epicov.org/epi3/cfrontend#586f5f), [NCBI](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus), [WHO](https://www.who.int/teams/global-influenza-programme/vaccines/who-recommendations/recommendations-for-influenza-vaccine-composition-archive)**
- Host: Human
- Subtype: H1N1 or H3N2
- Segment: HA (4) and NA (6)
- Download all data from 09/25/2001 - 02/15/2023 (collection date) from both NCBI and GISAID
- For NCBI, filter by the following sequence length
    - HA: min = 550, max = 570
    - NA: min = 450, max = 470

## Cleaning and Merging Data
- Make `h1n1.csv` and `h3n2.csv`
- Merge HA and NA data
    - Put HA sequence in `sequence` column, NA sequence in `sequence_na` column
    - Keep only strains with both HA and NA available
    - Truncate to 468 for NA (2 less than official length of 470)
    - Truncate to 565 for HA (2 less than official length of 567)
- Merge GISAID and NCBI data
- Save to `data/merged/`

In [3]:
def load_ncbi_gisaid(domseq, NCBI_FILE, GISAID_FILE):
    ''' Returns merged sequence DataFrame.

    Each of the two files that exists on disk is loaded with
    ``domseq.load_data`` and the results are stacked, NCBI rows first and
    GISAID rows second. A missing file is silently skipped.

    Parameters
    ----------
    domseq : object
        Anything exposing ``load_data(path)`` that returns a DataFrame.
    NCBI_FILE : str
        Path to the NCBI file; skipped when it is not an existing file.
    GISAID_FILE : str
        Path to the GISAID file; skipped when it is not an existing file.

    Returns
    -------
    pandas.DataFrame
        Columns ``acc``, ``name``, ``date``, ``sequence`` come first, followed
        by any additional columns the loaded frames carry. Row index values of
        the loaded frames are kept as they are (not renumbered). When neither
        file exists, an empty frame with the four base columns is returned.
    '''
    base_columns = ['acc', 'name', 'date', 'sequence']

    # DataFrame.append was removed in pandas 2.0, so collect the frames and
    # concatenate them once instead of appending one at a time.
    loaded = [
        domseq.load_data(path)
        for path in (NCBI_FILE, GISAID_FILE)
        if os.path.isfile(path)
    ]
    if not loaded:
        return pd.DataFrame({col: [] for col in base_columns})

    merged = pd.concat(loaded)
    # Keep the column layout the append-based version produced: base columns
    # first (added as NaN if a loader did not supply them), then the extras.
    extra_columns = [col for col in merged.columns if col not in base_columns]
    return merged.reindex(columns=base_columns + extra_columns)

In [4]:
# For each subtype: pool GISAID and NCBI per segment, drop repeated names,
# join HA with NA, and write one date-sorted CSV.
for SUBTYPE in ['h1n1', 'h3n2']:
    # One DomSeq loader per segment, each with its own truncation length
    domseq_ha = DomSeq(seq_trunc_length=HA_TRUNC)
    domseq_na = DomSeq(seq_trunc_length=NA_TRUNC)

    # Input paths follow <source dir><subtype>_<segment>.fasta
    NCBI_FILE_HA = f'{NCBI_DIR}{SUBTYPE}_ha.fasta'
    NCBI_FILE_NA = f'{NCBI_DIR}{SUBTYPE}_na.fasta'
    GISAID_FILE_HA = f'{GISAID_DIR}{SUBTYPE}_ha.fasta'
    GISAID_FILE_NA = f'{GISAID_DIR}{SUBTYPE}_na.fasta'

    # Pool both sources per segment and keep one row per strain name
    seq_df_ha = (
        load_ncbi_gisaid(domseq_ha, NCBI_FILE_HA, GISAID_FILE_HA)
        .drop_duplicates(subset=['name'])
    )
    # NA columns are renamed so they do not collide with the HA ones in the join
    seq_df_na = (
        load_ncbi_gisaid(domseq_na, NCBI_FILE_NA, GISAID_FILE_NA)
        .drop_duplicates(subset=['name'])
        .rename(columns={'acc': 'acc_na', 'sequence': 'sequence_na'})
    )

    # Inner join: a strain is kept only if HA and NA agree on both name and date
    seq_df = (
        seq_df_ha
        .merge(seq_df_na, how='inner', on=['name', 'date'])
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values(by='date')
    )

    # Write <DATA_DIR><subtype>.csv, creating the folder if needed
    os.makedirs(DATA_DIR, exist_ok=True)
    seq_df.to_csv(f'{DATA_DIR}{SUBTYPE}.csv', index=False)

## Seasonal Files
- Make separate files for North H1N1, North H3N2, South H1N1, South H3N2
    - Make separate file for each season (21 seasons total for each category)
- Flu Season example: 
    - Northern strains from 10/1/2002 - 4/1/2003: predict for 2003-04 season
    - Southern strains from 4/1/2002 - 10/1/2002: predict for 2003 season
    - Flu season dates from [CDC](https://www.cdc.gov/flu/school-business/travelersfacts.htm) and [WHO](https://www.who.int/teams/global-influenza-programme/vaccines/who-recommendations/recommendations-for-influenza-vaccine-composition-archive)
- File names for raw data: `<hemisphere>_<subtype>_<season>` (+ '_pred' if prediction data)
    - `hemisphere`: "north" or "south"
    - `subtype`: "h1n1" or "h3n2"
    - `season`: (ex. 02_03 for north 10/1/2002 - 4/1/2003, 02 for south 4/1/2002 - 10/1/2002)
    - **'pred': all sequence data up to that point, used for prediction**
        - Only use unique strains here
        - Season specific data is for training models, but we will predict on all data up to that point
- Save to `data/merged/`
    - Some seasons will have no strains from a particular database
    - In each year record how many strains come from NCBI and GISAID

In [5]:
# Season labels used in the output file names, one per season (21 each).
# North seasons span two calendar years: '02_03', '03_04', ..., '22_23'.
NORTH_YEARS = ['%02d_%02d' % (yy, yy + 1) for yy in range(2, 23)]

# South seasons sit inside one calendar year: '02', '03', ..., '22'.
SOUTH_YEARS = ['%02d' % yy for yy in range(2, 23)]

In [6]:
# Split each merged subtype table into per-season CSVs for both hemispheres,
# plus a cumulative "pred" CSV per season.
for SUBTYPE in ['h1n1', 'h3n2']:
    df = pd.read_csv(f'{DATA_DIR}{SUBTYPE}.csv')
    df['date'] = pd.to_datetime(df['date'])

    # Output folders depend only on the subtype, so they are created once here
    NORTH_DIR = f'{DATA_DIR}north_{SUBTYPE}/'
    SOUTH_DIR = f'{DATA_DIR}south_{SUBTYPE}/'
    for season_dir in (NORTH_DIR, SOUTH_DIR):
        os.makedirs(season_dir, exist_ok=True)
        os.makedirs(season_dir + 'pred', exist_ok=True)

    # NOTE(review): both ends of every window below are inclusive, so a row
    # dated exactly 1 Apr or 1 Oct is written to a north file and a south
    # file - confirm this overlap is intended.
    for i in range(21):
        start_year = 2002 + i

        # North window: 1 Oct of start_year through 1 Apr of the following year
        NORTH_START = f'{start_year}-10-01'
        NORTH_END = f'{start_year + 1}-04-01'
        in_north = (df['date'] >= NORTH_START) & (df['date'] <= NORTH_END)
        north_df = df.loc[in_north]
        north_df.to_csv(f'{NORTH_DIR}north_{SUBTYPE}_{NORTH_YEARS[i]}.csv', index=False)
        # North prediction: everything up to the window end, one row per distinct `sequence`
        up_to_north_end = df['date'] <= NORTH_END
        north_pred_df = df.loc[up_to_north_end].drop_duplicates(subset=['sequence'])
        north_pred_df.to_csv(f'{NORTH_DIR}pred/north_{SUBTYPE}_{NORTH_YEARS[i]}.csv', index=False)

        # South window: 1 Apr through 1 Oct of start_year
        SOUTH_START = f'{start_year}-04-01'
        SOUTH_END = f'{start_year}-10-01'
        in_south = (df['date'] >= SOUTH_START) & (df['date'] <= SOUTH_END)
        south_df = df.loc[in_south]
        south_df.to_csv(f'{SOUTH_DIR}south_{SUBTYPE}_{SOUTH_YEARS[i]}.csv', index=False)
        # South prediction: everything up to the window end, one row per distinct `sequence`
        up_to_south_end = df['date'] <= SOUTH_END
        south_pred_df = df.loc[up_to_south_end].drop_duplicates(subset=['sequence'])
        south_pred_df.to_csv(f'{SOUTH_DIR}pred/south_{SUBTYPE}_{SOUTH_YEARS[i]}.csv', index=False)

In [7]:
# Row counts of every per-season file: one row per season, with a label column
# per hemisphere and a count column per hemisphere/subtype pair.
num_seqs = pd.DataFrame({})
for SEASON, YEARS in (('north', NORTH_YEARS), ('south', SOUTH_YEARS)):
    num_seqs[SEASON + '_season'] = YEARS

    for SUBTYPE in ['h1n1', 'h3n2']:
        NAME = f'{SEASON}_{SUBTYPE}'
        DIR = f'{DATA_DIR}{NAME}/'
        # Number of data rows in each of the 21 season files of this category
        num_seqs[NAME] = [
            len(pd.read_csv(f'{DIR}{NAME}_{YEARS[i]}.csv'))
            for i in range(21)
        ]
num_seqs.to_csv(DATA_DIR + 'num_seqs.csv', index=False)
num_seqs

Unnamed: 0,north_season,north_h1n1,north_h3n2,south_season,south_h1n1,south_h3n2
0,02_03,37,67,2,5,158
1,03_04,14,345,3,16,200
2,04_05,7,254,4,4,228
3,05_06,34,154,5,36,270
4,06_07,442,264,6,40,23
5,07_08,394,407,7,91,214
6,08_09,597,316,8,180,125
7,09_10,3338,125,9,4473,738
8,10_11,1611,1050,10,635,504
9,11_12,454,1450,11,317,537


In [8]:
# Row counts of every cumulative "pred" file, laid out like the per-season
# count table: season labels per hemisphere, counts per hemisphere/subtype.
num_seqs_pred = pd.DataFrame({})
for SEASON, YEARS in (('north', NORTH_YEARS), ('south', SOUTH_YEARS)):
    num_seqs_pred[SEASON + '_season'] = YEARS

    for SUBTYPE in ['h1n1', 'h3n2']:
        NAME = f'{SEASON}_{SUBTYPE}'
        DIR = f'{DATA_DIR}{NAME}/pred/'
        # Number of data rows in each of the 21 prediction files of this category
        num_seqs_pred[NAME] = [
            len(pd.read_csv(f'{DIR}{NAME}_{YEARS[i]}.csv'))
            for i in range(21)
        ]
num_seqs_pred.to_csv(DATA_DIR + 'num_seqs_pred.csv', index=False)
num_seqs_pred

Unnamed: 0,north_season,north_h1n1,north_h3n2,south_season,south_h1n1,south_h3n2
0,02_03,31,156,2,20,126
1,03_04,37,314,3,36,220
2,04_05,46,451,4,40,393
3,05_06,90,592,5,65,534
4,06_07,282,749,6,119,605
5,07_08,600,1048,7,355,874
6,08_09,1085,1290,8,730,1121
7,09_10,3463,1644,9,2207,1565
8,10_11,4538,2258,10,3747,1858
9,11_12,4967,3048,11,4702,2469
