# Aggregate Sequence Data
- Aggregates the sequence data used in the Emergenet paper
- Data from NCBI and GISAID

In [93]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from zedstat.textable import textable
from Bio import SeqIO

## GISAID

In [94]:
def parse_fasta_gisaid(file_name, segment):
    """Extract name, subtype and accession from the records of a GISAID FASTA file.

    A record is kept only if its sequence length fits the requested segment
    and its header carries a subtype token of at least four characters.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.
    segment : str
        ``'HA'`` keeps sequences of length >= 550; ``'NA'`` keeps sequences
        of length 449 to 500 inclusive.  Any other value disables the
        length filter.

    Returns
    -------
    pandas.DataFrame
        One row per retained record, with columns ``name``, ``subtype`` and
        ``accession``.

    Raises
    ------
    IndexError
        If a record header lacks one of the ``|``-separated fields read
        here (indices 0, 1 and 4).
    """
    names = []
    subtypes = []
    accessions = []
    for record in SeqIO.parse(file_name, 'fasta'):
        # Length thresholds are presumably in amino acids -- TODO confirm.
        seq_len = len(record.seq)
        if segment == 'HA' and seq_len < 550:
            continue
        elif segment == 'NA' and (seq_len < 449 or seq_len > 500):
            continue
        # Header fields used: [0] name, [1] type/subtype, [4] accession.
        rec = record.id.split('|')
        type_tokens = rec[1].split('_')
        # The subtype is the third '_'-separated token, so three tokens are
        # required; a weaker check would raise IndexError on index 2.
        if len(type_tokens) < 3:
            continue
        subtype = type_tokens[2]
        # Skip subtype tokens shorter than four characters (e.g. 'H1N1'
        # is exactly four).
        if len(subtype) < 4:
            continue
        names.append(rec[0])
        subtypes.append(subtype)
        accessions.append(rec[4])
    df = pd.DataFrame({'name':names,
                       'subtype':subtypes,
                       'accession':accessions})
    return df

In [95]:
# Parse every '.fasta' file found one directory level below 'gisaid/'.
# Per-file frames are collected in lists and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
gisaid_frames_ha = []
gisaid_frames_na = []
for DIR in os.listdir('gisaid/'):
    GISAID_DIR = 'gisaid/' + DIR + '/'
    if not os.path.isdir(GISAID_DIR):
        continue
    for file in os.listdir(GISAID_DIR):
        FILENAME = os.fsdecode(file)
        if FILENAME.endswith('.fasta'):
            # The same file is parsed once per segment; the segment
            # argument selects which sequence-length filter is applied.
            # HA
            df_ha = parse_fasta_gisaid(GISAID_DIR + FILENAME, 'HA')
            gisaid_frames_ha.append(df_ha)
            # NA
            df_na = parse_fasta_gisaid(GISAID_DIR + FILENAME, 'NA')
            gisaid_frames_na.append(df_na)

# Fall back to an empty frame with the expected columns when no FASTA file
# was found, so the de-duplication below does not fail on a missing column.
gisaid_df_ha = (pd.concat(gisaid_frames_ha, ignore_index=True) if gisaid_frames_ha
                else pd.DataFrame(columns=['name', 'subtype', 'accession']))
gisaid_df_na = (pd.concat(gisaid_frames_na, ignore_index=True) if gisaid_frames_na
                else pd.DataFrame(columns=['name', 'subtype', 'accession']))

# Keep the first occurrence of each accession, then persist the metadata.
gisaid_df_ha = gisaid_df_ha.drop_duplicates(subset=['accession'], ignore_index=True)
gisaid_df_na = gisaid_df_na.drop_duplicates(subset=['accession'], ignore_index=True)
gisaid_df_ha.to_csv('gisaid_metadata_ha.csv')
gisaid_df_na.to_csv('gisaid_metadata_na.csv')

In [96]:
# Number of sequences per subtype (groupby sorts the subtype labels).
gisaid_ha_by_subtype = gisaid_df_ha.groupby(by='subtype')['name'].count()
# Align the NA counts to the HA subtypes by label rather than by position,
# so a subtype missing from one table cannot shift or mismatch the counts.
# Subtypes without NA sequences get 0 and are dropped by the filter below.
gisaid_na_by_subtype = (gisaid_df_na.groupby(by='subtype')['name'].count()
                        .reindex(gisaid_ha_by_subtype.index, fill_value=0))
gisaid_ha_counts = gisaid_ha_by_subtype.values
gisaid_na_counts = gisaid_na_by_subtype.values
gisaid_counts = pd.DataFrame({'Database':['GISAID']*len(gisaid_ha_counts),
                              'Influenza Subtype':gisaid_ha_by_subtype.index,
                              'No. HA Sequences':gisaid_ha_counts,
                              'No. NA Sequences':gisaid_na_counts})
# Move the first row to the end (pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0).
gisaid_counts = pd.concat([gisaid_counts.iloc[1:], gisaid_counts.iloc[[0]]])
# Keep only subtypes with at least 15 sequences for both segments.
gisaid_counts = gisaid_counts[(gisaid_counts['No. HA Sequences'] >= 15) & (gisaid_counts['No. NA Sequences'] >= 15)]
# Rotate the first four remaining rows to the end.
# NOTE(review): this and the move above use fixed row positions, presumably
# tuned to the subtypes present in the current data -- confirm after any
# data refresh.
gisaid_counts = pd.concat([gisaid_counts.iloc[4:], gisaid_counts.iloc[:4]])

## NCBI

In [97]:
def parse_fasta_ncbi(file_name, segment):
    """Extract name, subtype and accession from the records of an NCBI FASTA file.

    The record description is split on ``|``: field 0 is stored as the
    accession and field 1 as the name.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.
    segment : str
        ``'HA'`` keeps records whose name contains ``'hemagglutinin'``;
        ``'NA'`` keeps records whose name contains ``'neuraminidase'``.
        Any other value keeps every record.

    Returns
    -------
    pandas.DataFrame
        One row per retained record, with columns ``name``, ``subtype`` and
        ``accession``.

    Raises
    ------
    IndexError
        If a record description contains no ``|`` separator.
    """
    kept = []
    for entry in SeqIO.parse(file_name, 'fasta'):
        fields = entry.description.split('|')
        label = fields[1]
        if segment == 'HA':
            if 'hemagglutinin' not in label:
                continue
        elif segment == 'NA':
            if 'neuraminidase' not in label:
                continue
        # Assumes the subtype occupies these four characters near the end
        # of the name field -- TODO confirm against the header format.
        tag = label[-7:-3]
        # NOTE(review): any tag containing 'H1' is recorded as 'H1N1', which
        # would also relabel e.g. 'H1N2' -- confirm this is intended.
        subtype = 'H1N1' if 'H1' in tag else tag
        kept.append((label, subtype, fields[0]))
    return pd.DataFrame({'name': [row[0] for row in kept],
                         'subtype': [row[1] for row in kept],
                         'accession': [row[2] for row in kept]})

In [98]:
# Parse every '.fasta' file found one directory level below 'ncbi/'.
# Per-file frames are collected in lists and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
ncbi_frames_ha = []
ncbi_frames_na = []
for DIR in os.listdir('ncbi/'):
    NCBI_DIR = 'ncbi/' + DIR + '/'
    if not os.path.isdir(NCBI_DIR):
        continue
    for file in os.listdir(NCBI_DIR):
        FILENAME = os.fsdecode(file)
        if FILENAME.endswith('.fasta'):
            # The same file is parsed once per segment; the segment
            # argument selects which protein keyword the header must contain.
            # HA
            df_ha = parse_fasta_ncbi(NCBI_DIR + FILENAME, 'HA')
            ncbi_frames_ha.append(df_ha)
            # NA
            df_na = parse_fasta_ncbi(NCBI_DIR + FILENAME, 'NA')
            ncbi_frames_na.append(df_na)

# Fall back to an empty frame with the expected columns when no FASTA file
# was found, so the de-duplication below does not fail on a missing column.
ncbi_df_ha = (pd.concat(ncbi_frames_ha, ignore_index=True) if ncbi_frames_ha
              else pd.DataFrame(columns=['name', 'subtype', 'accession']))
ncbi_df_na = (pd.concat(ncbi_frames_na, ignore_index=True) if ncbi_frames_na
              else pd.DataFrame(columns=['name', 'subtype', 'accession']))

# Keep the first occurrence of each accession, then persist the metadata.
ncbi_df_ha = ncbi_df_ha.drop_duplicates(subset=['accession'], ignore_index=True)
ncbi_df_na = ncbi_df_na.drop_duplicates(subset=['accession'], ignore_index=True)
ncbi_df_ha.to_csv('ncbi_metadata_ha.csv')
ncbi_df_na.to_csv('ncbi_metadata_na.csv')

In [99]:
# Number of sequences per subtype (groupby sorts the subtype labels).
ncbi_ha_by_subtype = ncbi_df_ha.groupby(by='subtype')['name'].count()
# Align the NA counts to the HA subtypes by label rather than by position,
# so a subtype missing from one table cannot shift or mismatch the counts.
# Subtypes without NA sequences are reported with a count of 0.
ncbi_na_by_subtype = (ncbi_df_na.groupby(by='subtype')['name'].count()
                      .reindex(ncbi_ha_by_subtype.index, fill_value=0))
ncbi_ha_counts = ncbi_ha_by_subtype.values
ncbi_na_counts = ncbi_na_by_subtype.values
ncbi_counts = pd.DataFrame({'Database':['NCBI']*len(ncbi_ha_counts),
                            'Influenza Subtype':ncbi_ha_by_subtype.index,
                            'No. HA Sequences':ncbi_ha_counts,
                            'No. NA Sequences':ncbi_na_counts})

## All Counts

In [104]:
# Display floating-point cells with thousands separators and no decimals.
pd.options.display.float_format = '{:,.0f}'.format

# Stack the GISAID rows on top of the NCBI rows.  pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
seq_counts = pd.concat([gisaid_counts, ncbi_counts])
seq_counts['Total'] = seq_counts['No. HA Sequences'] + seq_counts['No. NA Sequences']
# Reorder rows by fixed position: first row, second-to-last row, fourth row
# and last row go to the top, followed by rows 1-2 and then the rest.
# NOTE(review): presumably this interleaves the GISAID and NCBI rows for
# H1N1 and H3N2 and assumes NCBI contributes exactly the last two rows --
# confirm whenever the set of subtypes changes.
seq_counts = pd.concat([seq_counts.iloc[[0]], 
                        seq_counts.iloc[[-2]], 
                        seq_counts.iloc[[3]], 
                        seq_counts.iloc[[-1]], 
                        seq_counts.iloc[1:3], 
                        seq_counts.iloc[4:-2]])
# Add a final row with the column sums; the non-numeric columns
# ('Database', 'Influenza Subtype') are left empty (NaN) in that row.
totals_row = seq_counts.sum(numeric_only=True).to_frame().T
seq_counts = pd.concat([seq_counts, totals_row], ignore_index=True)
seq_counts.to_csv('total_sequences.csv', index=False)
seq_counts = seq_counts.set_index('Database')
textable(seq_counts,tabname='total_sequences.tex', FORMAT='%1d')
seq_counts

Unnamed: 0_level_0,Influenza Subtype,No. HA Sequences,No. NA Sequences,Total
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GISAID,H1N1,73905,73920,147825
NCBI,H1N1,18577,16913,35490
GISAID,H3N2,108829,108860,217689
NCBI,H3N2,18840,15249,34089
GISAID,H1N2,1340,1340,2680
GISAID,H1N7,18,18,36
GISAID,H3N8,406,405,811
GISAID,H4N6,68,68,136
GISAID,H5N1,8245,8145,16390
GISAID,H5N2,35,35,70


In [101]:
# Sum the HA counts of the first four rows of seq_counts; this relies on
# those rows being the H1N1 and H3N2 entries, as the printed label states.
ha_h1n1_h3n2_total = seq_counts['No. HA Sequences'].iloc[:4].sum()
print('Total H1N1 HA and H3N2 HA:', ha_h1n1_h3n2_total)

Total H1N1 HA and H3N2 HA: 220151


## Save to Excel

In [102]:
# Write each metadata table to its own sheet of a single workbook, without
# the row index.
with pd.ExcelWriter('seq_metadata.xlsx') as writer:
    sheets = (
        ('GISAID HA Sequences', gisaid_df_ha),
        ('GISAID NA Sequences', gisaid_df_na),
        ('NCBI HA Sequences', ncbi_df_ha),
        ('NCBI NA Sequences', ncbi_df_na),
    )
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)