# Aggregate Sequence Data
- Aggregates the sequence data used in the Emergenet paper
- Data are taken from NCBI and GISAID

In [1]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from zedstat.textable import textable
from Bio import SeqIO

## GISAID

In [2]:
def _gisaid_subtype(type_field):
    """Return the subtype label at the end of a GISAID type field.

    The label is everything from the last 'H' onward, so a subtype with a
    multi-digit H number stays whole ('H10N8' rather than '10N8', which is
    what a fixed four-character slice produces). For any field ending in a
    four-character 'HxNy' label the result equals the last four characters.
    If the field contains no 'H', the last four characters are returned.
    """
    start = type_field.rfind('H')
    if start == -1:
        return type_field[-4:]
    return type_field[start:]


def parse_fasta_gisaid(file_name, segment):
    """Collect per-record metadata from a GISAID FASTA file.

    Each record's ``id`` is split on '|'. Field 0 is stored as the name,
    field 1 supplies the subtype, field 3 the collection date and field 4
    the accession. A header with fewer than five '|'-separated fields raises
    ``IndexError``.
    NOTE(review): ``record.id`` is only the first whitespace-delimited token
    of the header line, so this assumes the headers contain no spaces.

    Records are filtered by sequence length according to ``segment``:
      * 'HA' -- records shorter than 550 are skipped;
      * 'NA' -- records shorter than 449 or longer than 500 are skipped;
      * any other value -- no record is skipped.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.
    segment : str
        'HA' or 'NA'; selects the length filter described above.

    Returns
    -------
    pandas.DataFrame
        One row per retained record with columns ``name``, ``subtype``,
        ``collection_date`` and ``accession``.
    """
    names = []
    subtypes = []
    dates = []
    accessions = []
    for record in SeqIO.parse(file_name, 'fasta'):
        seq_len = len(record.seq)
        if segment == 'HA' and seq_len < 550:
            continue
        if segment == 'NA' and (seq_len < 449 or seq_len > 500):
            continue
        rec = record.id.split('|')
        names.append(rec[0])
        # Keep the complete subtype label instead of a fixed-width slice.
        subtypes.append(_gisaid_subtype(rec[1]))
        dates.append(rec[3])
        accessions.append(rec[4])
    return pd.DataFrame({'name': names,
                         'subtype': subtypes,
                         'collection_date': dates,
                         'accession': accessions})

In [3]:
# Directory holding the GISAID FASTA downloads.
GISAID_PATH = 'all_gisaid_data/'

# Parse every FASTA file once per segment and gather the per-file tables.
# The frames are combined with a single pd.concat afterwards because
# DataFrame.append was removed in pandas 2.0.
gisaid_ha_frames = []
gisaid_na_frames = []
# os.listdir returns entries in arbitrary order; sorting makes the row kept
# by drop_duplicates below reproducible between runs.
for file in sorted(os.listdir(GISAID_PATH)):
    FILENAME = os.fsdecode(file)
    if not FILENAME.endswith('.fasta'):
        continue
    fasta_path = os.path.join(GISAID_PATH, FILENAME)
    # HA
    df_ha = parse_fasta_gisaid(fasta_path, 'HA')
    gisaid_ha_frames.append(df_ha)
    # NA
    df_na = parse_fasta_gisaid(fasta_path, 'NA')
    gisaid_na_frames.append(df_na)

# Fallback table (same columns as parse_fasta_gisaid returns) so that a
# directory without FASTA files yields empty outputs instead of an error.
gisaid_empty = pd.DataFrame(columns=['name', 'subtype', 'collection_date', 'accession'])
gisaid_df_ha = (pd.concat(gisaid_ha_frames, ignore_index=True)
                if gisaid_ha_frames else gisaid_empty.copy())
gisaid_df_na = (pd.concat(gisaid_na_frames, ignore_index=True)
                if gisaid_na_frames else gisaid_empty.copy())

# Keep the first occurrence of each accession, then write the metadata.
gisaid_df_ha = gisaid_df_ha.drop_duplicates(subset=['accession'], ignore_index=True)
gisaid_df_na = gisaid_df_na.drop_duplicates(subset=['accession'], ignore_index=True)
gisaid_df_ha.to_csv('gisaid_metadata_ha.csv')
gisaid_df_na.to_csv('gisaid_metadata_na.csv')

In [4]:
# Number of GISAID records per subtype for each segment. The two series are
# aligned on the subtype label (missing combinations count as 0) rather than
# paired by position, so differing subtype sets cannot misalign the columns.
gisaid_by_subtype = pd.concat(
    [gisaid_df_ha.groupby(by='subtype')['name'].count(),
     gisaid_df_na.groupby(by='subtype')['name'].count()],
    axis=1,
    keys=['No. HA Sequences', 'No. NA Sequences'],
).fillna(0).astype('int64').sort_index()

gisaid_ha_counts = gisaid_by_subtype['No. HA Sequences'].values
gisaid_na_counts = gisaid_by_subtype['No. NA Sequences'].values
gisaid_counts = pd.DataFrame({'Database': ['GISAID'] * len(gisaid_by_subtype),
                              'Influenza Subtype': gisaid_by_subtype.index,
                              'No. HA Sequences': gisaid_ha_counts,
                              'No. NA Sequences': gisaid_na_counts})

# The subtype label may arrive cut down to its last four characters
# ('10N8' for H10N8); restore the full label when that is the case.
gisaid_counts['Influenza Subtype'] = (
    gisaid_counts['Influenza Subtype'].replace({'10N8': 'H10N8'}))

# List H10N8 after the single-digit H subtypes. Doing this by label, not by
# position, leaves the table untouched when no H10N8 record is present.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
gisaid_is_h10n8 = gisaid_counts['Influenza Subtype'] == 'H10N8'
gisaid_counts = pd.concat([gisaid_counts[~gisaid_is_h10n8],
                           gisaid_counts[gisaid_is_h10n8]])

## NCBI

In [5]:
def parse_fasta_ncbi(file_name, segment):
    """Collect per-record metadata from an NCBI FASTA file.

    Each record's full description line is split on '|': field 0 is stored
    as the accession, field 1 as the name and field 2 as the collection
    date. The subtype is read from characters ``[-7:-3]`` of field 1; when
    that slice contains 'H1' the subtype is recorded as 'H1N1'.
    NOTE(review): the fixed slice presumes a particular layout at the end of
    field 1 -- confirm against the downloaded headers.

    Filtering by ``segment``:
      * 'HA' -- keep only records whose field 1 contains 'hemagglutinin';
      * 'NA' -- keep only records whose field 1 contains 'neuraminidase';
      * any other value -- keep every record.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.
    segment : str
        'HA' or 'NA'; selects the keyword filter described above.

    Returns
    -------
    pandas.DataFrame
        One row per retained record with columns ``name``, ``subtype``,
        ``collection_date`` and ``accession``.
    """
    if segment == 'HA':
        keyword = 'hemagglutinin'
    elif segment == 'NA':
        keyword = 'neuraminidase'
    else:
        keyword = None

    table = {'name': [], 'subtype': [], 'collection_date': [], 'accession': []}
    for entry in SeqIO.parse(file_name, 'fasta'):
        fields = entry.description.split('|')
        title = fields[1]
        if keyword is not None and keyword not in title:
            continue
        subtype_slice = title[-7:-3]
        table['name'].append(title)
        table['subtype'].append('H1N1' if 'H1' in subtype_slice else subtype_slice)
        table['collection_date'].append(fields[2])
        table['accession'].append(fields[0])
    return pd.DataFrame(table)

In [6]:
# Directory holding the NCBI FASTA downloads.
NCBI_PATH = 'all_ncbi_data/'

# Parse every FASTA file once per segment and gather the per-file tables.
# The frames are combined with a single pd.concat afterwards because
# DataFrame.append was removed in pandas 2.0.
ncbi_ha_frames = []
ncbi_na_frames = []
# os.listdir returns entries in arbitrary order; sorting makes the row kept
# by drop_duplicates below reproducible between runs.
for file in sorted(os.listdir(NCBI_PATH)):
    FILENAME = os.fsdecode(file)
    if not FILENAME.endswith('.fasta'):
        continue
    fasta_path = os.path.join(NCBI_PATH, FILENAME)
    # HA
    df_ha = parse_fasta_ncbi(fasta_path, 'HA')
    ncbi_ha_frames.append(df_ha)
    # NA
    df_na = parse_fasta_ncbi(fasta_path, 'NA')
    ncbi_na_frames.append(df_na)

# Fallback table (same columns as parse_fasta_ncbi returns) so that a
# directory without FASTA files yields empty outputs instead of an error.
ncbi_empty = pd.DataFrame(columns=['name', 'subtype', 'collection_date', 'accession'])
ncbi_df_ha = (pd.concat(ncbi_ha_frames, ignore_index=True)
              if ncbi_ha_frames else ncbi_empty.copy())
ncbi_df_na = (pd.concat(ncbi_na_frames, ignore_index=True)
              if ncbi_na_frames else ncbi_empty.copy())

# Keep the first occurrence of each accession, then write the metadata.
ncbi_df_ha = ncbi_df_ha.drop_duplicates(subset=['accession'], ignore_index=True)
ncbi_df_na = ncbi_df_na.drop_duplicates(subset=['accession'], ignore_index=True)
ncbi_df_ha.to_csv('ncbi_metadata_ha.csv')
ncbi_df_na.to_csv('ncbi_metadata_na.csv')

In [7]:
# Number of NCBI records per subtype for each segment. The two series are
# aligned on the subtype label (missing combinations count as 0) rather than
# paired by position, so differing subtype sets cannot misalign the columns
# or make the DataFrame constructor fail on unequal lengths.
ncbi_by_subtype = pd.concat(
    [ncbi_df_ha.groupby(by='subtype')['name'].count(),
     ncbi_df_na.groupby(by='subtype')['name'].count()],
    axis=1,
    keys=['No. HA Sequences', 'No. NA Sequences'],
).fillna(0).astype('int64').sort_index()

ncbi_ha_counts = ncbi_by_subtype['No. HA Sequences'].values
ncbi_na_counts = ncbi_by_subtype['No. NA Sequences'].values
ncbi_counts = pd.DataFrame({'Database': ['NCBI'] * len(ncbi_by_subtype),
                            'Influenza Subtype': ncbi_by_subtype.index,
                            'No. HA Sequences': ncbi_ha_counts,
                            'No. NA Sequences': ncbi_na_counts})

## All Counts

In [8]:
# The totals row added below turns the count columns into floats; show them
# without decimals and with thousands separators.
pd.options.display.float_format = '{:,.0f}'.format

# Stack the per-database tables (pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0).
seq_counts = pd.concat([gisaid_counts, ncbi_counts], ignore_index=True)
seq_counts['Total'] = seq_counts['No. HA Sequences'] + seq_counts['No. NA Sequences']

# Grand-total row: sums of the numeric columns, with the text columns left
# empty (NaN) by reindexing the sums to the table's full column set.
seq_totals = seq_counts.sum(numeric_only=True).reindex(seq_counts.columns).to_frame().T
seq_counts = pd.concat([seq_counts, seq_totals], ignore_index=True)
seq_counts.to_csv('total_sequences.csv', index = False)
seq_counts

Unnamed: 0,Database,Influenza Subtype,No. HA Sequences,No. NA Sequences,Total
0,GISAID,H1N1,13536,13501,27037
1,GISAID,H1N2,857,857,1714
2,GISAID,H3N2,40257,40096,80353
3,GISAID,H5N1,1970,1943,3913
4,GISAID,H5N2,22,24,46
5,GISAID,H5N6,186,186,372
6,GISAID,H5N8,1449,1401,2850
7,GISAID,H7N1,3,3,6
8,GISAID,H7N2,2,2,4
9,GISAID,H7N3,101,99,200


In [9]:
# seq_counts = pd.read_csv('total_sequences.csv', index_col=0)
# textable(seq_counts,tabname='total_sequences.tex', FORMAT='%1d')

## Save to Excel

In [10]:
# Write the four metadata tables to one workbook, one sheet per table.
# The mapping order fixes the sheet order in the file.
metadata_sheets = {
    'GISAID HA Sequences': gisaid_df_ha,
    'GISAID NA Sequences': gisaid_df_na,
    'NCBI HA Sequences': ncbi_df_ha,
    'NCBI NA Sequences': ncbi_df_na,
}
with pd.ExcelWriter('seq_metadata.xlsx') as writer:
    for sheet_title, sheet_frame in metadata_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)