# Animal Influenza Emergence Risk Predictions
- Uses the Emergenet package to generate emergence risk scores for animal strains
- Scores animal HA and NA strains collected over the last year (September 30, 2021 - September 30, 2022)
- Uses the qnets stored in `qnet_models/current_qnets/`

In [1]:
import os 
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
from tqdm.notebook import trange, tqdm
import warnings
warnings.filterwarnings('ignore')
from Bio import SeqIO

# linear model
import statsmodels.api as sm
import statsmodels.formula.api as smf

from emergenet import Enet, save_model, load_model
# from emergenet.emergenet import Enet, save_model, load_model
from quasinet.qnet import Qnet, qdistance, qdistance_matrix, membership_degree, save_qnet, load_qnet
# from zedstat.textable import textable

In [2]:
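# input: fasta file name, length to truncate each sequence, whether to store each sequence as a numpy array (default: string)
# output: dataframe with columns 'id' and 'sequence'; records shorter than the truncation length are skipped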
def parse_fasta(file_name, trunc, seq_array=False):
    """Load a FASTA file into a two-column dataframe.

    Records whose sequence has fewer than ``trunc`` characters are skipped.
    Every kept sequence is cut to its first ``trunc`` characters and
    upper-cased; its id is the part of the record id before the first '|'.

    Args:
        file_name: path of the FASTA file to read.
        trunc: number of leading characters to keep from each sequence.
        seq_array: when true, each sequence is stored as ``np.array(...)`` of
            the truncated sequence instead of as a ``str``.

    Returns:
        DataFrame with the columns ``id`` and ``sequence``.
    """
    ids, sequences = [], []
    for rec in SeqIO.parse(file_name, 'fasta'):
        if len(rec.seq) >= trunc:
            ids.append(rec.id.split('|')[0])
            truncated = rec.seq[:trunc].upper()
            sequences.append(np.array(truncated) if seq_array else str(truncated))
    return pd.DataFrame({'id': ids, 'sequence': sequences})


# input: dataframe with a 'sequence' column
# output: numpy array of the sequences (one entry per dataframe row)
def sequence_array(seq_df):
    """Return the ``sequence`` column of ``seq_df`` as a numpy array.

    The column values are first copied into a plain list and then handed to
    ``np.array``, so numpy infers the resulting dtype and shape from the
    individual sequences (e.g. a 1-D array of strings for string sequences, a
    2-D array when every sequence is an equal-length array).

    Args:
        seq_df: dataframe with a ``sequence`` column.

    Returns:
        ``np.ndarray`` with one entry (or row) per dataframe row.
    """
    # list(...) replaces the former element-by-element append loop; going
    # through a list (rather than passing the column array directly) is kept
    # on purpose so that numpy re-infers dtype/shape from the elements.
    return np.array(list(seq_df['sequence'].values))


# input: name to call qnet, array of nucleotide lists, number of nucleotides
# output: save qnet as joblib
def train_save_qnet(name, seq_arr, num_nuc):
    """Fit a Qnet on ``seq_arr`` and save it under ``QNET_DIR``.

    Args:
        name: file stem of the saved model; the model is written to
            ``QNET_DIR + name + '.joblib'``.
        seq_arr: training data passed unchanged to ``Qnet.fit``.
        num_nuc: number of features; they are named ``x0``, ``x1``, ... in
            the order produced by ``np.arange(num_nuc)``.
    """
    feature_names = [f'x{idx}' for idx in np.arange(num_nuc)]
    model = Qnet(feature_names=feature_names, n_jobs=1)
    model.fit(seq_arr)
    save_qnet(model, QNET_DIR + name + '.joblib')

## Predictions

In [3]:
# Directories used by the cells below. Paths are built by plain string
# concatenation (DIR + name + suffix), so each one must end with '/'.
DATA_DIR = 'raw_data/gisaid_animal/'  # fasta files of the strains that get scored (<name>_ha.fasta / <name>_na.fasta)
HUMAN_DATA_DIR = 'raw_data/gisaid_current/'  # fasta files passed as seq_df to Enet.emergence_risk
QNET_DIR = 'qnet_models/current_qnets/'  # saved qnets (<name>_ha.joblib / <name>_na.joblib)
OUT_DIR = 'results/animal_predictions/'  # one <name>.csv of results per entry in FILES

# Truncation lengths: passed to parse_fasta as `trunc` and to Enet as
# `seq_trunc_length` for the NA and HA sequences respectively.
NA_TRUNC = 449
HA_TRUNC = 550

# File-name stems processed by the loops below; each stem selects the fasta,
# qnet and output csv files of one subtype.
FILES = ['h1n1', 'h1n2', 'h3n2', 'h5n1', 'h5n2', 'h5n6', 'h5n8', 'h7', 'h9n2']

In [None]:
# Create the output directory once, before any scoring starts, so that an
# unwritable location fails immediately instead of only after the first
# subtype has been fully scored.
os.makedirs(OUT_DIR, exist_ok=True)

for FILE in tqdm(FILES):
    # Qnets for this subtype's HA and NA segments.
    ha_qnet = load_model(QNET_DIR + FILE + '_ha.joblib')
    na_qnet = load_model(QNET_DIR + FILE + '_na.joblib')

    # Strains to score: keep only ids that have both an HA and an NA sequence
    # (inner join on 'id'); sequences are stored as strings.
    ha_df = parse_fasta(DATA_DIR + FILE + '_ha.fasta', HA_TRUNC)
    na_df = parse_fasta(DATA_DIR + FILE + '_na.fasta', NA_TRUNC)
    df = ha_df.merge(na_df, how='inner', on='id').rename(columns={'sequence_x':'ha', 'sequence_y':'na'})

    # Reference sequences handed to emergence_risk as seq_df; parsed with
    # seq_array=True, i.e. each sequence is stored as a numpy array.
    human_ha_df = parse_fasta(HUMAN_DATA_DIR + FILE + '_ha.fasta', HA_TRUNC, seq_array=True)
    human_na_df = parse_fasta(HUMAN_DATA_DIR + FILE + '_na.fasta', NA_TRUNC, seq_array=True)

    # One list per output column; insertion order is the csv column order.
    scores = {
        'ha_risk': [],
        'ha_variance': [],
        'na_risk': [],
        'na_variance': [],
        'geometric_mean_risk': [],
    }

    # Iterate the two sequence columns in lockstep instead of indexing rows
    # with .iloc; `total` keeps the progress bar aware of the row count.
    for ha_seq, na_seq in tqdm(zip(df['ha'], df['na']), total=len(df)):
        enet_ha = Enet(seq=ha_seq, seq_trunc_length=HA_TRUNC, random_state=42)
        enet_na = Enet(seq=na_seq, seq_trunc_length=NA_TRUNC, random_state=42)

        # emergence_risk is unpacked as a (risk score, variance) pair.
        ha_risk, ha_var = enet_ha.emergence_risk(seq_df=human_ha_df, qnet=ha_qnet, sample_size=100)
        na_risk, na_var = enet_na.emergence_risk(seq_df=human_na_df, qnet=na_qnet, sample_size=100)

        scores['ha_risk'].append(ha_risk)
        scores['ha_variance'].append(ha_var)
        scores['na_risk'].append(na_risk)
        scores['na_variance'].append(na_var)
        # Geometric mean of the HA and NA risk scores.
        scores['geometric_mean_risk'].append(math.sqrt(ha_risk * na_risk))

    for column, values in scores.items():
        df[column] = values

    # save dataframe as csv
    df.to_csv(OUT_DIR + FILE + '.csv', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1484 [00:00<?, ?it/s]

## Scale Predictions with GLM

In [6]:
# Load the IRAT table and rename its columns to identifiers without spaces,
# which are the names used by the GLM formulas.
irat_column_names = {
    'Potential Emergence Estimate': 'IRAT_Emergence_Score',
    'Potential Impact Estimate': 'IRAT_Impact_Score',
    'Geometric Mean': 'Geometric_Mean',
    'HA Average Qdistance': 'HA_Avg_Qdist',
    'NA Average Qdistance': 'NA_Avg_Qdist',
}
data = pd.read_csv('results/irat_average_qdistances_filled.csv').rename(columns=irat_column_names)

# Keep only the rows whose HA and NA average q-distances are both strictly positive.
positive_qdist = (data['HA_Avg_Qdist'] > 0) & (data['NA_Avg_Qdist'] > 0)
data = data[positive_qdist]

In [7]:
# Gaussian GLM for the IRAT emergence score: main effects of Geometric_Mean,
# HA_Avg_Qdist and NA_Avg_Qdist plus the HA_Avg_Qdist:NA_Avg_Qdist interaction.
formula1 = 'IRAT_Emergence_Score ~ Geometric_Mean + HA_Avg_Qdist*NA_Avg_Qdist'
# The family is written out explicitly; Gaussian is also what glm uses when
# no family is given.
emergence_glm = smf.glm(formula=formula1, data=data, family=sm.families.Gaussian())
mod1 = emergence_glm.fit()
print(mod1.summary())

                  Generalized Linear Model Regression Results                   
Dep. Variable:     IRAT_Emergence_Score   No. Observations:                   22
Model:                              GLM   Df Residuals:                       17
Model Family:                  Gaussian   Df Model:                            4
Link Function:                 identity   Scale:                         0.69369
Method:                            IRLS   Log-Likelihood:                -24.357
Date:                  Mon, 07 Nov 2022   Deviance:                       11.793
Time:                          23:49:41   Pearson chi2:                     11.8
No. Iterations:                       3   Pseudo R-squ. (CS):             0.7797
Covariance Type:              nonrobust                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
In

In [8]:
# Gaussian GLM for the IRAT impact score: main effects of Geometric_Mean,
# HA_Avg_Qdist and NA_Avg_Qdist plus the HA_Avg_Qdist:NA_Avg_Qdist interaction.
formula2 = 'IRAT_Impact_Score ~ Geometric_Mean + HA_Avg_Qdist*NA_Avg_Qdist'
# The family is written out explicitly; Gaussian is also what glm uses when
# no family is given.
impact_glm = smf.glm(formula=formula2, data=data, family=sm.families.Gaussian())
mod2 = impact_glm.fit()
print(mod2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:      IRAT_Impact_Score   No. Observations:                   22
Model:                            GLM   Df Residuals:                       17
Model Family:                Gaussian   Df Model:                            4
Link Function:               identity   Scale:                         0.77760
Method:                          IRLS   Log-Likelihood:                -25.614
Date:                Mon, 07 Nov 2022   Deviance:                       13.219
Time:                        23:49:42   Pearson chi2:                     13.2
No. Iterations:                     3   Pseudo R-squ. (CS):             0.6744
Covariance Type:            nonrobust                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

In [9]:
# Rename the risk columns to the predictor names used in formula1 / formula2,
# so the fitted GLMs can be applied to the per-subtype results.
glm_column_names = {
    'geometric_mean_risk': 'Geometric_Mean',
    'ha_risk': 'HA_Avg_Qdist',
    'na_risk': 'NA_Avg_Qdist',
}
for FILE in tqdm(FILES):
    csv_path = OUT_DIR + FILE + '.csv'
    df = pd.read_csv(csv_path).rename(columns=glm_column_names)
    df['predicted_emergence_score'] = mod1.predict(df).to_numpy()
    df['predicted_impact_score'] = mod2.predict(df).to_numpy()
    # The csv is overwritten in place, with the renamed columns and the two
    # prediction columns added.
    df.to_csv(csv_path, index=False)

  0%|          | 0/9 [00:00<?, ?it/s]

## Number of Strains Evaluated

In [5]:
# Total number of evaluated strains: the summed row counts of all
# per-subtype result csv files.
strain_counts = []
for FILE in tqdm(FILES):
    df = pd.read_csv(OUT_DIR + FILE + '.csv')
    strain_counts.append(len(df))
num = sum(strain_counts)
num

  0%|          | 0/9 [00:00<?, ?it/s]

6254