# Linear Models

In [357]:
# basic imports
import os 
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# visualization
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 300

# linear model
import statsmodels.api as sm
import statsmodels.formula.api as smf

def save_fig(filename, AXIS=False):
    """Save the current pyplot figure to ``filename`` with no padding around it.

    The subplot area is stretched to the full canvas and the data margins are
    set to zero before saving. The figure is written at 300 dpi with a
    transparent background, a tight bounding box and zero padding.

    Parameters
    ----------
    filename : path passed straight to ``plt.savefig``.
    AXIS : when falsy (the default), the major tick locators of both the x and
        y axis of the current axes are replaced with ``NullLocator``.
    """
    # Let the axes fill the whole figure and remove the automatic data margins.
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0, hspace=0)
    plt.margins(0, 0)

    if not AXIS:
        current_axes = plt.gca()
        for axis in (current_axes.xaxis, current_axes.yaxis):
            axis.set_major_locator(plt.NullLocator())

    plt.savefig(filename, dpi=300, bbox_inches='tight', pad_inches=0, transparent=True)

In [358]:
df = pd.read_csv('results/irat_average_qdistances.csv')

# filter by positive average qdistances
df = df[(df['HA Average Qdistance'] > 0) & (df['NA Average Qdistance'] > 0)]

# Correlate the numeric columns only. The CSV also holds string columns
# (virus name, type, sequences, ...); DataFrame.corr() silently skipped them in
# older pandas but raises on them from pandas 2.0 on, so select them out
# explicitly to get the same table on every version.
df.select_dtypes(include='number').corr()

Unnamed: 0,Potential Emergence Estimate,Potential Impact Estimate,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean
Potential Emergence Estimate,1.0,0.801516,0.117317,0.122822,-0.357059,-0.697325,-0.706604,-0.703703
Potential Impact Estimate,0.801516,1.0,-0.256979,-0.254024,0.004992,-0.5625,-0.451331,-0.437505
HA Qnet Sample,0.117317,-0.256979,1.0,0.999838,-0.465577,-0.158515,-0.315155,-0.33544
NA Qnet Sample,0.122822,-0.254024,0.999838,1.0,-0.470155,-0.165806,-0.322818,-0.343628
HA Average Qdistance,-0.357059,0.004992,-0.465577,-0.470155,1.0,0.289746,0.635994,0.675846
NA Average Qdistance,-0.697325,-0.5625,-0.158515,-0.165806,0.289746,1.0,0.92282,0.860529
Both Average Qdistance,-0.706604,-0.451331,-0.315155,-0.322818,0.635994,0.92282,1.0,0.964927
Geometric Mean,-0.703703,-0.437505,-0.33544,-0.343628,0.675846,0.860529,0.964927,1.0


## Linear Model - Emergence Score

In [360]:
# Build the modelling frame in one pass: load the q-distance table, switch to
# formula-friendly column names, keep strains whose HA and NA average
# q-distances are both positive, then drop index labels 10 and 12.
# (Original note: remove Yunnan and Netherlands outliers, and Duck strain.)
data = (
    pd.read_csv('results/irat_average_qdistances.csv')
    .rename(columns={
        'Potential Emergence Estimate': 'IRAT_Emergence_Score',
        'Potential Impact Estimate': 'IRAT_Impact_Score',
        'Geometric Mean': 'Geometric_Mean',
        'HA Average Qdistance': 'HA_Avg_Qdist',
        'NA Average Qdistance': 'NA_Avg_Qdist',
    })
    .loc[lambda frame: (frame['HA_Avg_Qdist'] > 0) & (frame['NA_Avg_Qdist'] > 0)]
    .drop(labels=[10, 12], axis=0)
)

In [361]:
# Simple emergence model: IRAT_Emergence_Score ~ Geometric_Mean
# No family is passed to smf.glm; the printed summary reports a Gaussian
# family with an identity link.
formula1 = 'IRAT_Emergence_Score ~ Geometric_Mean'
mod1 = smf.glm(formula=formula1, data=data).fit()
print(mod1.summary())

# save to model_emergence_simple.txt
# NOTE: the file would be opened in append mode ("a"), so enabling these lines
# and re-running the cell adds another copy of the summary to the file.
# with open("results/glm_models/model_emergence_simple.txt", "a") as f:
#     print(mod1.summary(), file=f)

                  Generalized Linear Model Regression Results                   
Dep. Variable:     IRAT_Emergence_Score   No. Observations:                   20
Model:                              GLM   Df Residuals:                       18
Model Family:                  Gaussian   Df Model:                            1
Link Function:                 identity   Scale:                         0.76392
Method:                            IRLS   Log-Likelihood:                -24.632
Date:                  Fri, 18 Nov 2022   Deviance:                       13.751
Time:                          21:04:54   Pearson chi2:                     13.8
No. Iterations:                       3   Pseudo R-squ. (CS):             0.7407
Covariance Type:              nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          6.408

In [362]:
# Complex emergence model:
# IRAT_Emergence_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist
# The ':' term adds the HA x NA interaction only (no separate main effects),
# which is why the summary reports Df Model = 2.
# mod2 is the emergence model later passed to predict_irat.
formula2 = 'IRAT_Emergence_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist'
mod2 = smf.glm(formula=formula2, data=data).fit()
print(mod2.summary())

# save to model_emergence_complex.txt
# NOTE: the file would be opened in append mode ("a"), so enabling these lines
# and re-running the cell adds another copy of the summary to the file.
# with open("results/glm_models/model_emergence_complex.txt", "a") as f:
#     print(mod2.summary(), file=f)

                  Generalized Linear Model Regression Results                   
Dep. Variable:     IRAT_Emergence_Score   No. Observations:                   20
Model:                              GLM   Df Residuals:                       17
Model Family:                  Gaussian   Df Model:                            2
Link Function:                 identity   Scale:                         0.74617
Method:                            IRLS   Log-Likelihood:                -23.826
Date:                  Fri, 18 Nov 2022   Deviance:                       12.685
Time:                          21:04:55   Pearson chi2:                     12.7
No. Iterations:                       3   Pseudo R-squ. (CS):             0.7678
Covariance Type:              nonrobust                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
In

## Linear Model - Impact Score

In [363]:
# Simple impact model: IRAT_Impact_Score ~ Geometric_Mean
# (same predictor as the simple emergence model, but the response is the
# impact score)
formula3 = 'IRAT_Impact_Score ~ Geometric_Mean'
mod3 = smf.glm(formula=formula3, data=data).fit()
print(mod3.summary())

# save to model_impact_simple.txt
# NOTE: the file would be opened in append mode ("a"), so enabling these lines
# and re-running the cell adds another copy of the summary to the file.
# with open("results/glm_models/model_impact_simple.txt", "a") as f:
#     print(mod3.summary(), file=f)

                 Generalized Linear Model Regression Results                  
Dep. Variable:      IRAT_Impact_Score   No. Observations:                   20
Model:                            GLM   Df Residuals:                       18
Model Family:                Gaussian   Df Model:                            1
Link Function:               identity   Scale:                          1.0201
Method:                          IRLS   Log-Likelihood:                -27.525
Date:                Fri, 18 Nov 2022   Deviance:                       18.362
Time:                        21:04:59   Pearson chi2:                     18.4
No. Iterations:                     3   Pseudo R-squ. (CS):             0.4382
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          6.3736      0.396     16.

In [364]:
# Complex impact model:
# IRAT_Impact_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist
# The ':' term adds the HA x NA interaction only (no separate main effects).
# mod4 is the impact model later passed to predict_irat.
formula4 = 'IRAT_Impact_Score ~ Geometric_Mean + HA_Avg_Qdist:NA_Avg_Qdist'
mod4 = smf.glm(formula=formula4, data=data).fit()
print(mod4.summary())

# save to model_impact_complex.txt
# NOTE: the file would be opened in append mode ("a"), so enabling these lines
# and re-running the cell adds another copy of the summary to the file.
# with open("results/glm_models/model_impact_complex.txt", "a") as f:
#     print(mod4.summary(), file=f)

                 Generalized Linear Model Regression Results                  
Dep. Variable:      IRAT_Impact_Score   No. Observations:                   20
Model:                            GLM   Df Residuals:                       17
Model Family:                Gaussian   Df Model:                            2
Link Function:               identity   Scale:                          1.0345
Method:                          IRLS   Log-Likelihood:                -27.093
Date:                Fri, 18 Nov 2022   Deviance:                       17.587
Time:                        21:04:59   Pearson chi2:                     17.6
No. Iterations:                     3   Pseudo R-squ. (CS):             0.4584
Covariance Type:            nonrobust                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

## Predicted IRAT Scores - IRAT at Time of Analysis

In [365]:
def predict_irat(df, emergence_mod, impact_mod, geometric_mean_cutoff=0.3,
                 fallback_emergence=3.8, fallback_impact=4.45):
    """Add predicted IRAT emergence and impact scores to ``df``.

    A renamed copy of ``df`` (formula-friendly column names) restricted to rows
    with positive HA and NA average q-distances is passed to both models. The
    predictions are written back into ``df`` as the columns
    ``'Predicted Emergence Score'`` and ``'Predicted Impact Score'``. When the
    models return index-aligned Series (as statsmodels formula results do),
    rows excluded by the filter end up with NaN.

    Rows whose ``'Geometric Mean'`` exceeds ``geometric_mean_cutoff`` (and is
    not exactly 1) have both predictions overwritten by the fixed fallback
    scores.

    Parameters
    ----------
    df : pandas.DataFrame with the columns ``'Potential Emergence Estimate'``,
        ``'Potential Impact Estimate'``, ``'Geometric Mean'``,
        ``'HA Average Qdistance'`` and ``'NA Average Qdistance'``.
        It is modified in place.
    emergence_mod, impact_mod : fitted models exposing ``predict(frame)``.
    geometric_mean_cutoff : geometric mean above which the fallback applies.
    fallback_emergence, fallback_impact : scores assigned to those rows.

    Returns
    -------
    The same ``df`` object, with the two prediction columns added.
    """
    data = df.rename(columns={'Potential Emergence Estimate': 'IRAT_Emergence_Score',
                              'Potential Impact Estimate': 'IRAT_Impact_Score',
                              'Geometric Mean': 'Geometric_Mean',
                              'HA Average Qdistance': 'HA_Avg_Qdist',
                              'NA Average Qdistance': 'NA_Avg_Qdist'})
    data = data[(data['HA_Avg_Qdist'] > 0) & (data['NA_Avg_Qdist'] > 0)]

    df['Predicted Emergence Score'] = emergence_mod.predict(data)
    df['Predicted Impact Score'] = impact_mod.predict(data)

    # Select the override rows with a boolean mask instead of looping over
    # range(len(df)): the positional counter was being used as an index label,
    # which fails (or appends rows) whenever df does not have a 0..n-1 index.
    # NOTE(review): a geometric mean of exactly 1 is exempt from the fallback;
    # presumably a sentinel value -- confirm against the upstream pipeline.
    geometric_mean = df['Geometric Mean']
    use_fallback = (geometric_mean > geometric_mean_cutoff) & (geometric_mean != 1)
    df.loc[use_fallback, 'Predicted Emergence Score'] = fallback_emergence
    df.loc[use_fallback, 'Predicted Impact Score'] = fallback_impact
    return df

In [366]:
# Load the q-distance table and score it with the two complex models
# (mod2 for emergence, mod4 for impact), then persist and display the result.
df_irat = predict_irat(
    pd.read_csv('results/irat_average_qdistances.csv'),
    mod2,
    mod4,
)

df_irat.to_csv('results/irat_predictions.csv')
df_irat

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean,Predicted Emergence Score,Predicted Impact Score
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000.0,1000.0,0.094135,0.02053,0.057332,0.043961,5.990556,6.060932
1,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000.0,1000.0,0.018431,0.030623,0.024527,0.023757,6.338644,6.337715
2,A/Hong Kong/125/2017,H7N9,May 2017,6.5,7.5,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,437.0,437.0,0.029599,0.005775,0.017687,0.013074,6.533111,6.492946
3,A/Shanghai/02/2013,H7N9,Apr 2016,6.4,7.2,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAIIIGAIAVLIGMANLGLNIGLHLKPGCNCSHS...,178.0,178.0,0.005495,0.003556,0.004525,0.00442,6.695908,6.623186
4,A/Anhui-Lujiang/39/2018,H9N2,Jul 2019,6.2,5.9,Moderate,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,31.0,30.0,0.029024,0.16809,0.098557,0.069847,5.5822,5.738396
5,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000.0,1000.0,0.052311,0.009103,0.030707,0.021822,6.373331,6.365375
6,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,55.0,55.0,0.108902,0.060951,0.084926,0.081472,5.412567,5.605279
7,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,-1.0,-1.0,0.2078,0.182338,0.195069,0.194653,4.206547,4.689203
8,A/Sichuan/06681/2021,H5N6,Oct 2021,5.3,6.3,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,45.0,45.0,0.361591,0.051794,0.206692,0.136852,4.721498,5.070941
9,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,258.0,246.0,0.167254,0.011074,0.091342,0.043037,6.005908,6.073107


In [367]:
# Correlation of the two published IRAT estimates with the two predicted scores.
# Non-numeric columns (names, sequences, ...) are dropped first because
# DataFrame.corr() raises on them from pandas 2.0 on, and the rows are picked
# by label rather than by position so the result does not depend on column order.
(
    df_irat
    .select_dtypes(include='number')
    .corr()
    .loc[['Potential Emergence Estimate', 'Potential Impact Estimate'],
         ['Predicted Emergence Score', 'Predicted Impact Score']]
)

Unnamed: 0,Predicted Emergence Score,Predicted Impact Score
Potential Emergence Estimate,0.762561,0.765721
Potential Impact Estimate,0.522076,0.528905


## Predicted IRAT Scores - IRAT at Current Time

In [368]:
# Same scoring as for the first predictions table, but using the "current"
# q-distance file; the predictions are written to their own CSV and displayed.
df_irat_cur = predict_irat(
    pd.read_csv('results/irat_average_qdistances_current.csv'),
    mod2,
    mod4,
)

df_irat_cur.to_csv('results/irat_predictions_current.csv')
df_irat_cur

Unnamed: 0,Influenza Virus,Virus Type,Dates of Risk Assessment,Potential Emergence Estimate,Potential Impact Estimate,Summary Risk Score Category,HA Sequence,NA Sequence,HA Qnet Sample,NA Qnet Sample,HA Average Qdistance,NA Average Qdistance,Both Average Qdistance,Geometric Mean,Predicted Emergence Score,Predicted Impact Score
0,A/swine/Shandong/1207/2016,H1N1,Jul 2020,7.5,6.9,Moderate,MEARLFVLFCAFTTLKADTICVGYHANNSTDTVDTILEKNVTVTHS...,MNPNQKIITIGSICMTIGIASLILQIGNIISIWISHSIQIENQNQS...,1000,1000,0.059891,0.041687,0.050789,0.049967,5.892046,5.982887
1,A/Ohio/13/2017,H3N2,Jul 2019,6.6,5.8,Moderate,MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSP...,1000,1000,0.009133,0.069184,0.039159,0.025137,6.314049,6.318111
2,A/Hong Kong/125/2017,H7N9,May 2017,6.5,7.5,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,1000,1000,0.00919,0.004615,0.006903,0.006513,6.656111,6.591325
3,A/Shanghai/02/2013,H7N9,Apr 2016,6.4,7.2,Moderate-High,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTERGVEVVN...,MNPNQKILCTSATAIIIGAIAVLIGMANLGLNIGLHLKPGCNCSHS...,1000,1000,0.003082,0.004365,0.00374,0.003668,6.710282,6.634697
4,A/Anhui-Lujiang/39/2018,H9N2,Jul 2019,6.2,5.9,Moderate,METVSLITILLVATASNADKICIGYQSTNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTTTMTLHFGQKECSNPS...,58,58,0.015745,0.046728,0.031236,0.027124,6.278837,6.290057
5,A/Indiana/08/2011,H3N2,Dec 2012,6.0,4.5,Moderate,MKTIIAFSCILCLIFAQKLPGSDNSMATLCLGHHAVPNGTLVKTIT...,MNPNQKIITIGSVSLIIATICFLMQIAILVTTVTLHFKQHDYNSPP...,1000,1000,0.017557,0.018402,0.01798,0.017975,6.443003,6.42097
6,A/California/62/2018,H1N2,Jul 2019,5.8,5.7,Moderate,MKVKLMVLLCTFTATYADTICVGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSISLTLAAMCFLMQTAILVTNVTLHFNQCECHYPP...,37,37,0.203827,0.047728,0.125777,0.098632,5.177747,5.422068
7,A/Bangladesh/0994/2011,H9N2,Feb 2014,5.6,5.4,Moderate,METVSLMTILLLVTTSNADKICIGHQSTNSTETVDTLTETNVPVTH...,MNPNQKIIALGSASLTIAIICLLIQIAILATTMTLHFMQNEHTNST...,58,58,0.047339,0.465388,0.256364,0.148429,4.601474,4.980084
8,A/Sichuan/06681/2021,H5N6,Oct 2021,5.3,6.3,Moderate,MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKITCISATGVTLSIVSLLIGITNLGLNIGLHYKVSDSTTIN...,46,46,0.344316,0.059989,0.202152,0.143719,4.649288,5.016181
9,A/Vietnam/1203/2004,H5N1,Nov 2011,5.2,6.6,Moderate,MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,MNPNQKIITIGSICMVTGIVSLMLQIGNMISIWVSHSIHTGNQHQS...,48,45,0.132348,0.041099,0.088164,0.073752,5.524264,5.692867


In [369]:
# Correlation of the two published IRAT estimates with the two predicted scores
# for the current-time q-distances.
# Non-numeric columns (names, sequences, ...) are dropped first because
# DataFrame.corr() raises on them from pandas 2.0 on, and the rows are picked
# by label rather than by position so the result does not depend on column order.
(
    df_irat_cur
    .select_dtypes(include='number')
    .corr()
    .loc[['Potential Emergence Estimate', 'Potential Impact Estimate'],
         ['Predicted Emergence Score', 'Predicted Impact Score']]
)

Unnamed: 0,Predicted Emergence Score,Predicted Impact Score
Potential Emergence Estimate,0.673504,0.676907
Potential Impact Estimate,0.438903,0.439491
