# Missing Prediction 2019-2020
- The H3N2 NA Northern Hemisphere 2019-2020 prediction is missing; this notebook computes it.
- Remember to use the previous season's (2018-2019) data to compute the prediction.

In [37]:
# basic imports
import os 
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm.notebook import trange, tqdm

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# other
from Bio import SeqIO
from collections import Counter
import Levenshtein as lev

# qnet
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
from quasinet.qseqtools import list_trained_qnets, load_trained_qnet

## Data Sources
- GISAID: https://platform.epicov.org/epi3/cfrontend#586f5f
- Download the H3N2 NA sequences for the 2018-2019 season (collected between 10/1/2018 and 5/1/2019)

In [20]:
# Number of leading positions kept from each NA sequence when parsing, and the
# number of feature names assigned to an NA Qnet.
NA_TRUNC = 469
# Length used by load_influenza_qnet for any protein other than 'na'
# (HA, going by the name).
HA_TRUNC = 566

In [21]:
def parse_fasta(file_name, trunc):
    """Read a FASTA file into a DataFrame of fixed-length sequences.

    Records whose sequence is shorter than `trunc` are skipped. Every kept
    record is cut to its first `trunc` characters and upper-cased.

    Parameters
    ----------
    file_name : FASTA source handed to `SeqIO.parse`.
    trunc : number of leading characters to keep from each sequence.

    Returns
    -------
    pandas.DataFrame with two columns:
        'name'     - the part of the record id before the first '|'
        'sequence' - NumPy array built from the truncated, upper-cased sequence
    """
    # Single pass over the file: filter and transform each record together.
    rows = [
        (record.id.split('|')[0], np.array(record.seq[:trunc].upper()))
        for record in SeqIO.parse(file_name, 'fasta')
        if not len(record.seq) < trunc
    ]
    names = [row_name for row_name, _ in rows]
    sequences = [row_seq for _, row_seq in rows]
    return pd.DataFrame({'name': names, 'sequence': sequences})

def sequence_array(df):
    """Stack the 'sequence' column of `df` into a single NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame with a 'sequence' column whose entries are
        equal-length sequences (e.g. the frame returned by `parse_fasta`).

    Returns
    -------
    numpy.ndarray with one row per entry of `df['sequence']`.
    """
    # Read from the argument, not from a notebook-level variable, so the
    # result depends only on what the caller passes in.
    return np.array(list(df['sequence'].values))

In [25]:
# Load the previous-season NA sequences and draw a reproducible random subset.
N_SAMPLES = 1000  # upper bound on the number of sequences used downstream
all_seq_df = parse_fasta("north_h3n2_na_18.fasta", NA_TRUNC)
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available count.
seq_df = all_seq_df.sample(min(N_SAMPLES, len(all_seq_df)), random_state = 42)
seq_arr = sequence_array(seq_df)

## Loading Past Qnet
- Show possible Qnets with `list_trained_qnets()`
- Load pre-trained qnet for H3N2 NA 2018

In [26]:
def load_influenza_qnet(virus, protein, year):
    """Load a pre-trained influenza Qnet and attach positional feature names.

    Parameters
    ----------
    virus : str, first component of the trained-qnet key (e.g. 'h3n2').
    protein : str, second component of the key; 'na' selects NA_TRUNC
        feature names, anything else selects HA_TRUNC.
    year : third component of the key, converted with `str`.

    Returns
    -------
    The object returned by `load_trained_qnet`, with `feature_names` set to
    'x0', 'x1', ... up to the chosen truncation length.
    """
    qnet_key = ';'.join([virus, protein, str(year)])
    myqnet = load_trained_qnet('influenza', qnet_key)
    # feature count follows the protein's truncation length
    n_features = NA_TRUNC if protein == 'na' else HA_TRUNC
    myqnet.feature_names = [f'x{i}' for i in np.arange(n_features)]
    return myqnet

In [27]:
# Qnet stored under the key 'h3n2;na;2018' -- the season preceding the
# 2019-2020 prediction target.
myqnet = load_influenza_qnet('h3n2', 'na', 2018)

## Predictions
Q-Centroid: $$\widehat{x}^{t+1} = \arg\min_{x\in P} \sum_{y \in P^t} \theta(x,y)$$
- Where $P^t$ is the sequence population at time $t$ and $P = P^t \cup P^{t-1} \cup P^{t-2} \cup \dots \cup P^1$.
- $\theta(x,y)$ is the qdistance between x and y in their respective Qnets

In [28]:
# pairwise qdistances among the sampled sequences (same Qnet on both sides)
seqs_matrix = np.array([seq for seq in seq_df['sequence']])
dist_matrix = qdistance_matrix(seqs_matrix, seqs_matrix, myqnet, myqnet)

# q-centroid: the row whose summed distance to every sequence is smallest
sums = list(dist_matrix.sum(axis=1))
min_ind = np.argmin(sums)
centroid_row = seq_df.iloc[min_ind]
rec_name = centroid_row['name']
rec_seq = centroid_row['sequence']

In [34]:
# one-row table describing the predicted strain
prediction = pd.DataFrame(
    {
        'strain': ['north_h3n2_na'],
        'name': [rec_name],
        'sequence': [''.join(rec_seq)],
    }
)
prediction

Unnamed: 0,strain,name,sequence
0,north_h3n2_na,A/Guangxi-Fangcheng/54/2019,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...


## Modifying Tables

In [71]:
def add_predicted_sequence(df, name, year, pred_df):
    """Copy the Qnet prediction for one strain into the results table.

    The table is modified in place; nothing is returned.

    Parameters
    ----------
    df : results table with 'year',
        'qdistance_recommendation_accession_name' and
        'qdistance_recommendation_sequence' columns.
    name : value looked up in `pred_df['strain']`.
    year : value matched against `df['year']`; every matching row is updated.
    pred_df : predictions table with 'strain', 'name' and 'sequence' columns.
    """
    strain_rows = pred_df.loc[pred_df['strain'] == name]
    # Read both values first so a missing strain/column fails before any write.
    updates = {
        'qdistance_recommendation_accession_name': strain_rows['name'].values[0],
        'qdistance_recommendation_sequence': strain_rows['sequence'].values[0],
    }
    season_rows = df['year'] == year
    for column, value in updates.items():
        df.loc[season_rows, [column]] = value
    
def add_who_qnet_errors(df, name, year):
    """Fill in the edit-distance errors of the WHO and Qnet recommendations.

    For the first row whose 'year' equals `year`, the dominant, WHO and Qnet
    sequences are cut to the length of the shortest of the three and compared
    with `lev.distance`. The two distances are written, in place, to
    'ldistance_WHO' and 'ldistance_Qnet_recommendation' on every row matching
    `year`. Nothing is returned.

    Parameters
    ----------
    df : results table containing the sequence and distance columns above.
    name : strain name; not used by the computation.
    year : value matched against `df['year']`.
    """
    season_rows = df['year'] == year
    season = df.loc[season_rows]
    dominant = season['dominant_strain_sequence'].values[0]
    who = season['WHO_recommendation_sequence'].values[0]
    qnet = season['qdistance_recommendation_sequence'].values[0]

    # compare over the common prefix length only
    common_len = min(len(dominant), len(qnet), len(who))
    reference = dominant[:common_len]
    who_error = lev.distance(reference, who[:common_len])
    qnet_error = lev.distance(reference, qnet[:common_len])

    df.loc[season_rows, ['ldistance_WHO']] = who_error
    df.loc[season_rows, ['ldistance_Qnet_recommendation']] = qnet_error

In [74]:
# Fill in the 2019_2020 row of the results table and write it back to the same file.
table_path = '../tables/north_h3n2_na.csv'
strain, season = 'north_h3n2_na', '2019_2020'
df = pd.read_csv(table_path)
add_predicted_sequence(df, strain, season, prediction)
add_who_qnet_errors(df, strain, season)
df.to_csv(table_path, index=False)
df

Unnamed: 0,year,WHO_recommendation_name,WHO_recommendation_sequence,dominant_strain_accession,dominant_strain_sequence,dominant_strain_accession_name,qdistance_recommendation_accession,qdistance_recommendation_sequence,qdistance_recommendation_accession_name,ldistance_WHO,ldistance_Qnet_recommendation,qnet_sample_size
0,2003_2004,A/Moscow/10/99,MNPNQKIITIGSVSLTIATICFLMQIAILVTTVTLHFKQYECNSPP...,EU103941,MNPNQKIITIGSVSLTIATICFLMQIAILVTTVTLHFKQHECNSPP...,A/Denmark/107/2003,CY000227,MNPNQKIITIGSVSLTIATICFLMQIAILVTTVTLHFKQYECNSPP...,A/New York/100/2002,13,3,169
1,2004_2005,A/Fujian/411/2002,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,AB761208,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Hyogo/36/2004,CY001423,MNPNQKIITIGSVSLTIATICFLMQIAILVTTVTLHFKQHECNSPP...,A/New York/20/2003,3,16,159
2,2005_2006,A/California/7/2004,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,EU103854,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Denmark/203/2005,CY038617,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Hong Kong/HKU20/2004,4,0,212
3,2006_2007,A/Wisconsin/67/2005,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,FJ231854,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Berlin/32/2006,CY100572,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Mexico/InDRE2227/2005,1,1,165
4,2007_2008,A/Wisconsin/67/2005,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,EU879087,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Brazil/80/2007,FJ231845,MNPNQMIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Baden-Wuerttemberg/17/2006,8,7,387
5,2008_2009,A/Brisbane/10/2007,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,FJ686943,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Missouri/05/2008,EU100650,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Washington/01/2007,3,2,269
6,2009_2010,A/Brisbane/10/2007,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,KC535398,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Oklahoma/09/2009,KC535389,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Wisconsin/24/2008,3,1,231
7,2010_2011,A/Perth/16/2009,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,KC883268,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/California/17/2010,KC535454,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/New York/70/2009,2,3,309
8,2011_2012,A/Perth/16/2009,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,KC892944,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Texas/14/2011,KC535424,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/California/14/2010,3,2,565
9,2012_2013,A/Victoria/361/2011,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,KC893181,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/New York/02/2012,KF014482,MNPNQKIITIGSVSLTISTICFFMQIAILITTVTLHFKQYEFNSPP...,A/Singapore/C2011.493/2011,4,1,739
