# Linear Models

In [116]:
# basic imports
import os 
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# linear model
import statsmodels.api as sm
import statsmodels.formula.api as smf

# textable
from zedstat.textable import textable

In [117]:
def save_print_model(mod, PATH, save=False):
    """Print a fitted model's summary and optionally append it to a file.

    Parameters
    ----------
    mod : object with a ``summary()`` method
        The fitted model whose summary is reported.
    PATH : str
        File name, relative to ``results/glm_models/``, that the summary is
        appended to when ``save`` is true.
    save : bool, default False
        When true, append the summary to the file before printing it.
    """
    # Build the summary once so the saved and the printed text cannot differ.
    summary = mod.summary()
    if save:
        out_path = "results/glm_models/" + PATH
        # Create the target directory on first use instead of failing with
        # FileNotFoundError on a fresh checkout.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # Append mode: repeated runs accumulate summaries in the same file.
        with open(out_path, "a") as f:
            print(summary, file=f)
    print(summary)
    print()

## Linear Models

In [118]:
# Load the averaged q-distance table and rename the columns used in the
# formulas below to identifier-style names.
data = pd.read_csv('results/irat_average_qdistances.csv').rename(
    columns={
        'Potential Emergence Estimate': 'IRAT_Emergence_Score',
        'Potential Impact Estimate': 'IRAT_Impact_Score',
        'Avg. Geometric Mean': 'Geometric_Mean',
        'Avg. HA Qdistance': 'HA_Avg_Qdist',
        'Avg. NA Qdistance': 'NA_Avg_Qdist',
    }
)
# remove Yunnan and Netherlands outliers, and Duck strain:
# keep only rows whose HA and NA averages are both positive, then drop the
# rows labelled 10 and 12.
data = data.loc[(data['HA_Avg_Qdist'] > 0) & (data['NA_Avg_Qdist'] > 0)].drop(index=[10, 12])

# "simple" models use the geometric mean alone; "complex" models add the
# HA x NA interaction term.
formula1 = 'IRAT_Emergence_Score ~ Geometric_Mean'
formula2 = 'IRAT_Emergence_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist'
formula3 = 'IRAT_Impact_Score ~ Geometric_Mean'
formula4 = 'IRAT_Impact_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist'

# simple emergence model
mod1 = smf.glm(formula1, data).fit()
save_print_model(mod1, 'model_emergence_simple.txt')

# complex emergence model
mod2 = smf.glm(formula2, data).fit()
save_print_model(mod2, 'model_emergence_complex.txt')

# simple impact model
mod3 = smf.glm(formula3, data).fit()
save_print_model(mod3, 'model_impact_simple.txt')

# complex impact model
mod4 = smf.glm(formula4, data).fit()
save_print_model(mod4, 'model_impact_complex.txt')

                  Generalized Linear Model Regression Results                   
Dep. Variable:     IRAT_Emergence_Score   No. Observations:                   20
Model:                              GLM   Df Residuals:                       18
Model Family:                  Gaussian   Df Model:                            1
Link Function:                 identity   Scale:                         0.75627
Method:                            IRLS   Log-Likelihood:                -24.532
Date:                  Tue, 14 Mar 2023   Deviance:                       13.613
Time:                          18:59:03   Pearson chi2:                     13.6
No. Iterations:                       3   Pseudo R-squ. (CS):             0.7465
Covariance Type:              nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          6.391

## Predicted IRAT Scores - IRAT at Time of Analysis

In [119]:
def predict_irat(emergence_mod, impact_mod, ha_risk, na_risk, geom_mean,
                 geom_mean_cutoff=0.3, fallback_emergence=3.8, fallback_impact=4.45):
    """Predict the emergence and impact scores for a single strain.

    Parameters
    ----------
    emergence_mod, impact_mod : objects with a ``predict(DataFrame)`` method
        Fitted models for the emergence and the impact score. Each receives a
        one-row frame with the columns ``Geometric_Mean``, ``HA_Avg_Qdist``
        and ``NA_Avg_Qdist``.
    ha_risk, na_risk : float
        HA and NA values placed in ``HA_Avg_Qdist`` / ``NA_Avg_Qdist``.
    geom_mean : float
        Value placed in ``Geometric_Mean``; also compared with the cutoff.
    geom_mean_cutoff : float, default 0.3
        When ``geom_mean`` is strictly greater than this, the models are not
        used and the fallback scores are returned.
    fallback_emergence, fallback_impact : float, default 3.8 and 4.45
        Scores returned above the cutoff.
        NOTE(review): the origin of these defaults is not visible here;
        presumably a floor for strains far from the reference — confirm.

    Returns
    -------
    tuple
        ``(emergence_risk, impact_risk)``.
    """
    # Above the cutoff the model output is never used, so return right away
    # instead of running two predictions whose results would be discarded.
    if geom_mean > geom_mean_cutoff:
        return fallback_emergence, fallback_impact
    df = pd.DataFrame({'Geometric_Mean': [geom_mean],
                       'HA_Avg_Qdist': [ha_risk],
                       'NA_Avg_Qdist': [na_risk]})
    # Each prediction holds one value (one input row); take it out with [0].
    emergence_risk = emergence_mod.predict(df)[0]
    impact_risk = impact_mod.predict(df)[0]
    return emergence_risk, impact_risk

In [120]:
df_irat = pd.read_csv('results/irat_average_qdistances.csv')
# These three columns are stored as text and turned back into Python objects;
# each cell is either the sentinel -1 or an indexable sequence of values.
# NOTE(review): eval() executes whatever the CSV cell contains — only safe for
# trusted files; ast.literal_eval would be the safer parser.
for list_col in ('HA Qdistance', 'NA Qdistance', 'Geometric Mean'):
    df_irat[list_col] = df_irat[list_col].apply(eval)

emergence_risks, emergence_risks_var = [], []
impact_risks, impact_risks_var = [], []

for ha, na, gm in zip(df_irat['HA Qdistance'],
                      df_irat['NA Qdistance'],
                      df_irat['Geometric Mean']):
    if ha == -1:
        # No data for this strain: carry the -1 sentinel into every output column.
        for column_values in (emergence_risks, emergence_risks_var,
                              impact_risks, impact_risks_var):
            column_values.append(-1)
        continue
    # One (emergence, impact) prediction per entry of the strain's lists.
    pairs = [predict_irat(mod2, mod4, ha[k], na[k], gm[k]) for k in range(len(gm))]
    emergence_risk = [pair[0] for pair in pairs]
    impact_risk = [pair[1] for pair in pairs]
    # Summarise the per-entry predictions by their mean and variance.
    emergence_risks.append(np.mean(emergence_risk))
    emergence_risks_var.append(np.var(emergence_risk))
    impact_risks.append(np.mean(impact_risk))
    impact_risks_var.append(np.var(impact_risk))

df_irat['Emergence Risk'] = emergence_risks
df_irat['Emergence Risk Var.'] = emergence_risks_var
df_irat['Impact Risk'] = impact_risks
df_irat['Impact Risk Var.'] = impact_risks_var
df_irat.to_csv('results/irat_predictions.csv', index=False)

In [130]:
# convert to textable
# Re-read the saved predictions (the first CSV column becomes the index) and
# derive the error columns as the square root of the variance columns.
df_irat1 = pd.read_csv('results/irat_predictions.csv', index_col=0)
df_irat1 = df_irat1.assign(**{
    'Emergence Risk Err.': np.sqrt(df_irat1['Emergence Risk Var.']),
    'Impact Risk Err.': np.sqrt(df_irat1['Impact Risk Var.']),
})
# Keep the table columns, in display order, for the first 22 rows only.
df_irat1 = df_irat1.loc[:, [
    'Virus Type', 'Dates of Risk Assessment',
    'Potential Emergence Estimate', 'Potential Impact Estimate',
    'HA Qnet Sample', 'NA Qnet Sample', 'Avg. Geometric Mean',
    'Emergence Risk', 'Emergence Risk Err.',
    'Impact Risk', 'Impact Risk Err.',
]].iloc[:22]
# textable(df_irat1,tabname='figures/irat_predictions.tex', FORMAT='%1.4f')

In [122]:
# Correlation of the published IRAT estimates (rows) with the predicted risks
# (columns).
# numeric_only=True: df_irat also holds text/object columns, and pandas >= 2.0
# raises on them instead of silently skipping them.
# Rows are picked by label rather than by position so the result does not
# depend on column order.
# NOTE(review): rows carrying the -1 "no data" sentinel are part of this
# correlation — confirm that is intended.
df_irat.corr(numeric_only=True).loc[
    ['Potential Emergence Estimate', 'Potential Impact Estimate'],
    ['Emergence Risk', 'Impact Risk'],
]

Unnamed: 0,Emergence Risk,Impact Risk
Potential Emergence Estimate,0.773969,0.726221
Potential Impact Estimate,0.671734,0.658506


In [123]:
# Display the published IRAT estimates beside the predicted risks stored in
# the predictions CSV.
pd.read_csv('results/irat_predictions.csv').loc[
    :,
    ['Influenza Virus', 'Potential Emergence Estimate', 'Potential Impact Estimate',
     'Emergence Risk', 'Impact Risk'],
]

Unnamed: 0,Influenza Virus,Potential Emergence Estimate,Potential Impact Estimate,Emergence Risk,Impact Risk
0,A/swine/Shandong/1207/2016,7.5,6.9,5.970237,6.056767
1,A/Ohio/13/2017,6.6,5.8,6.359501,6.395586
2,A/Hong Kong/125/2017,6.5,7.5,6.585691,6.594033
3,A/Shanghai/02/2013,6.4,7.2,6.765053,6.752073
4,A/Anhui-Lujiang/39/2018,6.2,5.9,5.606866,5.744949
5,A/Indiana/08/2011,6.0,4.5,6.411742,6.441334
6,A/California/62/2018,5.8,5.7,5.390709,5.561788
7,A/Bangladesh/0994/2011,5.6,5.4,4.689532,5.061702
8,A/Sichuan/06681/2021,5.3,6.3,4.621628,4.929231
9,A/Vietnam/1203/2004,5.2,6.6,6.021589,6.101613


## Predicted IRAT Scores - IRAT at Current Time

In [124]:
df_irat_cur = pd.read_csv('results/irat_average_qdistances_current.csv')
# These three columns are stored as text and turned back into Python objects;
# each cell is either the sentinel -1 or an indexable sequence of values.
# NOTE(review): eval() executes whatever the CSV cell contains — only safe for
# trusted files; ast.literal_eval would be the safer parser.
df_irat_cur['HA Qdistance'] = df_irat_cur['HA Qdistance'].apply(eval)
df_irat_cur['NA Qdistance'] = df_irat_cur['NA Qdistance'].apply(eval)
df_irat_cur['Geometric Mean'] = df_irat_cur['Geometric Mean'].apply(eval)

emergence_risks = []
emergence_risks_var = []
impact_risks = []
impact_risks_var = []

for i in range(len(df_irat_cur)):
    ha = df_irat_cur['HA Qdistance'].iloc[i]
    na = df_irat_cur['NA Qdistance'].iloc[i]
    gm = df_irat_cur['Geometric Mean'].iloc[i]
    if ha == -1:
        # No data for this strain: carry the -1 sentinel into every output column.
        emergence_risks.append(-1)
        emergence_risks_var.append(-1)
        impact_risks.append(-1)
        impact_risks_var.append(-1)
        continue
    # One (emergence, impact) prediction per entry of the strain's lists.
    emergence_risk = []
    impact_risk = []
    for j in range(len(gm)):
        er, ir = predict_irat(mod2, mod4, ha[j], na[j], gm[j])
        emergence_risk.append(er)
        impact_risk.append(ir)
    emergence_risks.append(np.mean(emergence_risk))
    # Must be the variance, not the standard deviation: this value is stored
    # in the "Var." column, whose square root is later reported as the error.
    emergence_risks_var.append(np.var(emergence_risk))
    impact_risks.append(np.mean(impact_risk))
    impact_risks_var.append(np.var(impact_risk))
df_irat_cur['Emergence Risk'] = emergence_risks
df_irat_cur['Emergence Risk Var.'] = emergence_risks_var
df_irat_cur['Impact Risk'] = impact_risks
df_irat_cur['Impact Risk Var.'] = impact_risks_var
df_irat_cur.to_csv('results/irat_predictions_current.csv', index=False)

In [133]:
# convert to textable
# Re-read the saved predictions (the first CSV column becomes the index) and
# derive the error columns as the square root of the variance columns.
df_irat_cur1 = pd.read_csv('results/irat_predictions_current.csv', index_col=0)
df_irat_cur1 = df_irat_cur1.assign(**{
    'Emergence Risk Err.': np.sqrt(df_irat_cur1['Emergence Risk Var.']),
    'Impact Risk Err.': np.sqrt(df_irat_cur1['Impact Risk Var.']),
})
# Keep the table columns, in display order, for the first 22 rows only.
df_irat_cur1 = df_irat_cur1.loc[:, [
    'Virus Type', 'Dates of Risk Assessment',
    'Potential Emergence Estimate', 'Potential Impact Estimate',
    'HA Qnet Sample', 'NA Qnet Sample', 'Avg. Geometric Mean',
    'Emergence Risk', 'Emergence Risk Err.',
    'Impact Risk', 'Impact Risk Err.',
]].iloc[:22]
# textable(df_irat_cur1,tabname='figures/irat_predictions_current.tex', FORMAT='%1.4f')

In [126]:
# Correlation of the published IRAT estimates (rows) with the predicted risks
# (columns).
# numeric_only=True: df_irat_cur also holds text/object columns, and
# pandas >= 2.0 raises on them instead of silently skipping them.
# Rows are picked by label rather than by position so the result does not
# depend on column order.
# NOTE(review): rows carrying the -1 "no data" sentinel are part of this
# correlation — confirm that is intended.
df_irat_cur.corr(numeric_only=True).loc[
    ['Potential Emergence Estimate', 'Potential Impact Estimate'],
    ['Emergence Risk', 'Impact Risk'],
]

Unnamed: 0,Emergence Risk,Impact Risk
Potential Emergence Estimate,0.711509,0.677672
Potential Impact Estimate,0.614943,0.607335


In [127]:
# Display the published IRAT estimates beside the predicted risks stored in
# the current-time predictions CSV.
pd.read_csv('results/irat_predictions_current.csv').loc[
    :,
    ['Influenza Virus', 'Potential Emergence Estimate', 'Potential Impact Estimate',
     'Emergence Risk', 'Impact Risk'],
]

Unnamed: 0,Influenza Virus,Potential Emergence Estimate,Potential Impact Estimate,Emergence Risk,Impact Risk
0,A/swine/Shandong/1207/2016,7.5,6.9,5.858453,5.9602
1,A/Ohio/13/2017,6.6,5.8,6.355973,6.392557
2,A/Hong Kong/125/2017,6.5,7.5,6.741911,6.731677
3,A/Shanghai/02/2013,6.4,7.2,6.799528,6.782522
4,A/Anhui-Lujiang/39/2018,6.2,5.9,6.392883,6.424997
5,A/Indiana/08/2011,6.0,4.5,6.488592,6.508719
6,A/California/62/2018,5.8,5.7,5.132289,5.344613
7,A/Bangladesh/0994/2011,5.6,5.4,4.455855,4.799649
8,A/Sichuan/06681/2021,5.3,6.3,4.499385,4.833593
9,A/Vietnam/1203/2004,5.2,6.6,5.488074,5.643334
