# Dominant Sequences 2020-2021
- Find 2020-2021 dominant strains using edit distance

In [36]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet

## Data Sources
- NCBI: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f

## Downloading Data
**GISAID (NCBI has few strains for this season):**
1. Download amino acid data from both sources with the following filters:
    - Host: Human
    - Flu Season: 
        - Northern strains from 10/01/2020 - 05/01/2021
        - Southern strains from 04/01/2020 - 10/01/2020
        - Flu season dates from [CDC](https://www.cdc.gov/flu/school-business/travelersfacts.htm)
    - Segment: HA (4) and NA (6)
2. File names for raw data: HEMISPHERE_SEQUENCE_SEGMENT_SEASON
    - HEMISPHERE: "north" or "south"
    - SEQUENCE: "h1n1" or "h3n2"
    - SEGMENT: "ha" or "na"
    - SEASON: year the season begins in (ex. 20 for 2020-2021)

In [37]:
# Directories holding the raw FASTA downloads, one per data source.
NCBI_PATH = 'raw_data/ncbi/'
GISAID_PATH = 'raw_data/gisaid/'

# Raw file stems follow HEMISPHERE_SEQUENCE_SEGMENT_SEASON (season fixed at 20).
# Order: north before south, h1n1 before h3n2, ha before na.
FILES = [
    f'{hemisphere}_{subtype}_{segment}_20'
    for hemisphere in ('north', 'south')
    for subtype in ('h1n1', 'h3n2')
    for segment in ('ha', 'na')
]

# Sequences are cut to these lengths before comparison; shorter ones are dropped.
NA_TRUNC = 469
HA_TRUNC = 566

## Dominant Strain
Levenshtein Centroid: $$\widehat{x}^{dom} = \arg\min_{x \in P^t} \sum_{y \in P^t} \theta(x,y)$$
- Where $P^t$ is the sequence population at time $t$.
- $\theta(x,y)$ is the edit distance between $x$ and $y$.

In [39]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a dataframe of truncated, upper-cased sequences.

    Parameters
    ----------
    file_name : path of the FASTA file to read.
    trunc : number of leading residues to keep from each sequence; records
        whose sequence is shorter than this are skipped entirely.

    Returns
    -------
    pandas.DataFrame with columns 'name' (the record id up to the first '|')
    and 'sequence' (the first `trunc` residues, upper-cased), in file order.
    """
    # Keep only records long enough to be truncated to the common length.
    kept = [rec for rec in SeqIO.parse(file_name, 'fasta') if len(rec.seq) >= trunc]
    names = [rec.id.split('|')[0] for rec in kept]
    sequences = [''.join(rec.seq[:trunc].upper()) for rec in kept]
    return pd.DataFrame({'name': names, 'sequence': sequences})

In [49]:
# Cap on how many sequences per file enter the pairwise comparison, and the
# seed that makes the subsample reproducible.
MAX_SAMPLE = 1000
SAMPLE_SEED = 42

# One entry per input file: file label, dominant strain name, dominant sequence.
dom_files = []
dom_names = []
dom_seqs = []

for FILE in tqdm(FILES):
    # Match the segment as a whole underscore-separated field rather than as a
    # bare substring, so an unrelated "na" elsewhere in a file name cannot
    # select the NA length by accident.
    TRUNC = NA_TRUNC if 'na' in FILE.split('_') else HA_TRUNC

    fasta_path = os.path.join(GISAID_PATH, FILE + ".fasta")
    df = parse_fasta(fasta_path, TRUNC)
    if df.empty:
        # Without this guard, np.argmin below fails on an empty list with a
        # message that does not say which file was the problem.
        raise ValueError(f"no sequences of length >= {TRUNC} found in {fasta_path}")
    seq_df = df.sample(min(MAX_SAMPLE, len(df)), random_state=SAMPLE_SEED)
    seqs = seq_df['sequence'].values

    # find centroid of sequences in P^t
    # Identical sequences have identical summed distances, so compute the sum
    # once per distinct sequence and weight each comparison by how many times
    # the other sequence occurs in the sample. The totals are the same integers
    # the full pairwise double loop would produce.
    seq_counts = Counter(seqs)
    total_dists = {
        seq: sum(count * lev.distance(seq, other) for other, count in seq_counts.items())
        for seq in seq_counts
    }
    edit_dists = [total_dists[seq] for seq in seqs]
    # np.argmin returns the first minimum, i.e. the earliest row in sample order.
    ind_min = np.argmin(edit_dists)
    dom_row = seq_df.iloc[ind_min]
    dom_name = dom_row['name']
    dom_seq = dom_row['sequence']

    # save results
    # Drop the trailing season field; for the 16-character stems in FILES this
    # equals the previous FILE[:13].
    # NOTE(review): the saved output table in this notebook shows the full stem
    # (with "_20"), which this code does not produce -- confirm which label is wanted.
    dom_files.append(FILE.rsplit('_', 1)[0])
    dom_names.append(dom_name)
    dom_seqs.append(dom_seq)

  0%|          | 0/8 [00:00<?, ?it/s]

In [50]:
# Assemble one row per input file from the lists filled by the loop above,
# then display the table as the cell output.
result_columns = {
    'strain': dom_files,
    'name': dom_names,
    'sequence': dom_seqs,
}
dominant_seqs = pd.DataFrame(result_columns)
dominant_seqs

Unnamed: 0,strain,name,sequence
0,north_h1n1_ha_20,A/Togo/905/2020,MKAILVVLLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHS...
1,north_h1n1_na_20,A/Ghana/119/2020,MNPNQKIITIGSICMTIGMANLILQIGNIISIWVSHSIQIGNQSQI...
2,north_h3n2_ha_20,A/India/Pun-NIV300460/2021_Apr,MKTIIALSYILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTIT...
3,north_h3n2_na_20,A/Kenya/122/2021,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...
4,south_h1n1_ha_20,A/Cote_d'Ivoire/951/2020,MKAILVVLLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHS...
5,south_h1n1_na_20,A/Srinagar/AG_659/2020,MNPNQKIITIGSICMTIGMANLILQIGNIISIWVSHSIQIGNQSQI...
6,south_h3n2_ha_20,A/Timor-Leste/2/2020,MKTIIALSYILCLVFAQKIPGNDNSTATLCLGHHAVPNGTIVKTIT...
7,south_h3n2_na_20,A/Bangladesh/3009/2020,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...


In [51]:
# Write the dominant-sequence table to disk without the index column,
# creating the output directory first if it does not exist yet.
RESULTS_DIR = 'results'
RESULTS_CSV = 'results/dominant_sequences_2020_2021.csv'
os.makedirs(RESULTS_DIR, exist_ok=True)
dominant_seqs.to_csv(RESULTS_CSV, index=False)