# Influenza Enet Predictions

Predicting dominant strains using Emergenet.

In [1]:
import os 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from domseq import DomSeq

# Root directory of the per-season prediction CSVs; the aggregation step reads
# PRED_DIR/<dataset>/<dataset>_<season>.csv from here.
PRED_DIR = 'results/enet_predictions/seasonal_predictions/'
# Directory the aggregated prediction tables are written to.
OUT_DIR = 'results/enet_predictions/'

# Dataset identifiers; each one is used both as a sub-directory name and as a
# file-name prefix. Names starting with 'north' use NORTH_YEARS season labels,
# all others use SOUTH_YEARS.
FILES = ['north_h1n1', 'north_h3n2', 'south_h1n1', 'south_h3n2']

# Season labels for the 'north_*' datasets: two consecutive two-digit years
# joined by an underscore, each zero-padded, from '03_04' through '23_24'.
NORTH_YEARS = [f'{year:02d}_{year + 1:02d}' for year in range(3, 24)]
        
# Season labels for the non-'north' datasets: a single zero-padded two-digit
# year, from '03' through '23'.
SOUTH_YEARS = [f'{year:02d}' for year in range(3, 24)]

# Sequence truncation lengths. HA_TRUNC is handed to DomSeq as
# `seq_trunc_length` when predictions are aggregated; NA_TRUNC is not
# referenced by the code visible in this notebook.
NA_TRUNC = 468 # 2 less than official length of 470
HA_TRUNC = 565 # 2 less than official length of 567

## Create Enet Models
- Truncate HA at 565 amino acids; we only need HA models
- Give Enet the same name as the data file
    
### Running Processes

Computations are done in:
- `enet_train.py`
- `run_enet_train.sh`

To run, navigate to terminal and do

`chmod ugo+rwx run_enet_train.sh`

`./run_enet_train.sh`

## Predictions
E-Centroid: $$x_{*}^{t+\delta} = \arg\min_{y\in \bigcup_{\tau\leq t}H^{\tau}} \left ( \sum_{x \in {H^t}} \theta(x,y) - |H^t|A \ln\omega_y \right )$$
- $x_{*}^{t+\delta}$ is the dominant strain in the upcoming flu season at time $t+\delta$
- $H^t$ is the sequence population at time $t$
- $\theta(x,y)$ is the e-distance between $x$ and $y$ in their respective Enets
- $A = \frac{1-\alpha}{\sqrt{8}N^2}$, where $\alpha$ is a fixed significance level and $N$ is the sequence length considered
- $\ln\omega_y$ is the membership degree of sequence $y$
- **Predict dominant strain based on HA data** 
    
Predictions:
- Multi-cluster predictions: take the E-centroid of the two largest clusters
- Single-cluster predictions: find the sequence whose distance to the two-cluster predictions is closest to the ratio of their clusters' respective sizes

### Running Processes

Computations are done in:
- `enet_predictions.py`
- `run_enet_predictions.sh`

To run, navigate to terminal and do

`chmod ugo+rwx run_enet_predictions.sh`

`./run_enet_predictions.sh`

## Aggregate Two-Cluster and Single-Cluster Predictions

In [2]:
# Aggregate the per-season Enet predictions of every dataset into two tables:
#   <OUT_DIR>/<dataset>_predictions.csv                 - the two largest clusters per season
#   <OUT_DIR>/<dataset>_predictions_single_cluster.csv  - one prediction per season
domseq = DomSeq(seq_trunc_length=HA_TRUNC, random_state=42)

# Number of clusters kept per season in the multi-cluster table.
N_CLUSTERS = 2

# (column in the per-season prediction CSV, column name in the aggregate table).
# In the multi-cluster table every output name gets a '_<cluster rank>' suffix.
MULTI_FIELDS = [
    ('name', 'name'),
    ('cluster_count', 'cluster_count'),
    ('cluster_area', 'cluster_area'),
    ('acc', 'ha_acc'),
    ('sequence', 'ha_seq'),
    ('acc_na', 'na_acc'),
    ('sequence_na', 'na_seq'),
]
SINGLE_FIELDS = [
    ('name', 'name'),
    ('acc', 'ha_acc'),
    ('sequence', 'ha_seq'),
    ('acc_na', 'na_acc'),
    ('sequence_na', 'na_seq'),
]

# Fixed column orders, so that an empty season list still yields a CSV with headers.
MULTI_COLUMNS = ['season'] + [
    f'{out}_{rank}' for rank in range(N_CLUSTERS) for _, out in MULTI_FIELDS
]
SINGLE_COLUMNS = ['season'] + [out for _, out in SINGLE_FIELDS]

for FILE in FILES:
    # The single-cluster prediction for a season is computed against the data
    # file of the season before it, so start from the label preceding the first
    # entry of the season list.
    if FILE.startswith('north'):
        YEARS = NORTH_YEARS
        prev_season = '02_03'
    else:
        YEARS = SOUTH_YEARS
        prev_season = '02'

    # Rows are collected as plain dicts and turned into DataFrames once at the
    # end; concatenating onto a growing frame every iteration is quadratic.
    multi_rows = []
    single_rows = []
    for season in YEARS:
        pred_path = os.path.join(PRED_DIR, FILE, f'{FILE}_{season}.csv')
        pred_seqs = pd.read_csv(pred_path)
        # Largest clusters first, so positions 0..N_CLUSTERS-1 are the ones kept.
        pred_seqs = pred_seqs.sort_values(by='cluster_count', ascending=False)
        if len(pred_seqs) < N_CLUSTERS:
            raise IndexError(
                f'{pred_path} has {len(pred_seqs)} cluster(s); '
                f'{N_CLUSTERS} are required'
            )

        # Flatten the top clusters into one wide row
        multi_row = {'season': season}
        for rank in range(N_CLUSTERS):
            for src, out in MULTI_FIELDS:
                multi_row[f'{out}_{rank}'] = pred_seqs[src].values[rank]
        multi_rows.append(multi_row)

        # Get single prediction
        pred_seq_df = pd.read_csv(
            os.path.join('data/merged', FILE, 'pred', f'{FILE}_{prev_season}.csv')
        )
        prev_season = season
        single_pred_seq = domseq.predict_single_domseq(pred_seqs, pred_seq_df)

        single_row = {'season': season}
        for src, out in SINGLE_FIELDS:
            single_row[out] = single_pred_seq[src].values[0]
        single_rows.append(single_row)

    pred_df = pd.DataFrame(multi_rows, columns=MULTI_COLUMNS)
    pred_df_single = pd.DataFrame(single_rows, columns=SINGLE_COLUMNS)

    # Enet recommendation accession, name, sequence
    pred_df.to_csv(os.path.join(OUT_DIR, FILE + '_predictions.csv'), index=False)
    pred_df_single.to_csv(
        os.path.join(OUT_DIR, FILE + '_predictions_single_cluster.csv'), index=False
    )