# Example Using emergenet.emergenet
- Compares risk assesment from IRAT and risk assessment using Emergenet
- Analyzes HA and NA segments of A/Ohio/13/2017 (H3N2), which was evaluated by IRAT on July 2019
- Collected Human HA and NA strains within one year prior to IRAT analysis (July 1, 2018 - June 30, 2019)

In [1]:
%%capture
!pip install emergenet --upgrade

In [2]:
from emergenet.emergenet import Enet, save_model, load_model, irat_risk

DATA_DIR = 'example_data/emergenet/'

## HA Emergence Risk Score

In [15]:
# initialize the Enet with A/Ohio/13/2017 HA
#enet_ha = Enet(seq=DATA_DIR+'ha_target_sequence.fasta', seq_trunc_length=550, random_state=42)
enet_ha = Enet(seq='../extras/variants/alberta.fasta', seq_trunc_length=550, random_state=42)

print('Initializing Enet with fasta file\n---------------------------------\n')
print(enet_ha.seq_metadata)
print(enet_ha.seq)
print('Length of target sequence:', len(enet_ha.seq))

Initializing Enet with fasta file
---------------------------------

A/Alberta/01/2020_(H1N2)v|EPI_ISL_683998|A_/_H1N2|P1||6B.1|2020-10-01|Bastien_Nathalie||RV16380/20|2020-12-09|Provincial_Laboratory_of_Public_Health_for_Southern_Alberta|Public_Health_Agency_of_Canada_(PHAC)|HA|EPI1815179|
MKAILLVLLHTFAATSADTICVGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGKAPLYLGKCNIAGWLLGNPECELPLTVSSWSYIVETSDSDNGTCYPGDFTNYEELREQLSSVSSFERFEMFPKESSWPNHETNKSVTAACPYAGASSFYRNLIWLVKKDDSYPMLNISYVNNKGKEVLVLWGIHHPPTEDDQKWLYKNADAYVFVGTSTYSQKFEPEIATRPRVRDQTGRMNYYWTLVKPGDKITFEATGNLVVPRYAFAMNRGSESGIIISDAPVHDCNTICQTPKGALNTSLPFQNVHPVTIGECPKYIKSTRLKMATGLRNTPSIQSRGLFGAIAGFIEGGWTGMVDGWYGYHHQNEQGSGYAADQKSTQRAVDGITNKVNSIIERMNSQFTAVGKEFSNLERRIENLNKKVDDGFLDVWTYNAELLILLENERTLDFHDSNVKNLYERVRNQLRNNAKEIGNGCFEFYHKCDNTCMESVKNGTYDYPKYSEESKLNREEIDGVKLDSTKVYQILAIYSTVASSLVVLVSLGALSFWMCSNGSLQCRICI
Length of target sequence: 566


In [10]:
# can also initialize with sequence rather than fasta file
enet_ha_1 = Enet(seq='MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSFSTGEICNSPYQILDGENCTLIDALLGDPQCDGFQNNKWDLFVERSKAHSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQDGASSSCKRRSSNSFFSRLNWLTHLNFKYPALEVTMPNNEQFDKLYIWGVHHPATDKDQISLYAQAAGRIIVSTKRNQQAVIPNIGSRPRVRDIPSRISIYWTIVRPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSACITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNIPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSDVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRNEALNNRFQIKGVELKSEYKDWILWISFAISCFLLCVALLGFIMWACQKGNIKCNICI',
               seq_trunc_length=550, seq_metadata='A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|', random_state=42)

print('Initializing Enet with nucleotide sequence\n------------------------------------------\n')
print(enet_ha_1.seq_metadata)
print(enet_ha_1.seq)
print('Length of target sequence:', len(enet_ha_1.seq))

Initializing Enet with nucleotide sequence
------------------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|
MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSFSTGEICNSPYQILDGENCTLIDALLGDPQCDGFQNNKWDLFVERSKAHSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQDGASSSCKRRSSNSFFSRLNWLTHLNFKYPALEVTMPNNEQFDKLYIWGVHHPATDKDQISLYAQAAGRIIVSTKRNQQAVIPNIGSRPRVRDIPSRISIYWTIVRPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSACITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNIPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSDVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRNEALNNRFQIKGVELKSEYKDWILWISFAISCFLLCVALLGFIMWACQKGNIKCNICI
Length of target sequence: 566


In [16]:
# load fasta data
df_ha = enet_ha.load_data(filepath=DATA_DIR+'ha_sequences.fasta')

print('Number of sequences:', len(df_ha))
df_ha.head()

Number of sequences: 12389


Unnamed: 0,id,sequence
0,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
1,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
2,A/Taiwan/80749/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
3,A/Taiwan/79429/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
4,A/Myanmar/18M219/2018|A_/_H3N2|$SEGMENT_NAME|2...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."


In [17]:
%%time
# train enet (automatically includes target sequence with df)
enet_ha1 = enet_ha.train(seq_df=df_ha, sample_size=1000, n_jobs=1)
# save enet
save_model(enet=enet_ha1, outfile=DATA_DIR+'ha_enet.joblib')

CPU times: user 10min 21s, sys: 486 ms, total: 10min 21s
Wall time: 10min 26s


In [18]:
# load enet
enet_ha1 = load_model(filepath=DATA_DIR+'ha_enet.joblib')

In [19]:
%%time
# compute risk score
risk_score_ha, variance_ha = enet_ha.emergence_risk(seq_df=df_ha.head(1), enet=enet_ha1, sample_size=1000)

print('Emergenet Risk Score:', risk_score_ha)
print('Variance:', variance_ha)

Emergenet Risk Score: 0.19984013035450224
Variance: 0.03993607770010445
CPU times: user 181 ms, sys: 2.99 ms, total: 184 ms
Wall time: 181 ms


In [11]:
%%time
# compute risk score
risk_score_ha, variance_ha = enet_ha.emergence_risk(seq_df=df_ha.head(1), enet=enet_ha1, sample_size=1000)

print('Emergenet Risk Score:', risk_score_ha)
print('Variance:', variance_ha)

Emergenet Risk Score: 0.012340261003657222
Variance: 0.00015228204163838315
CPU times: user 139 ms, sys: 1.01 ms, total: 140 ms
Wall time: 138 ms


In [15]:
%%time
# compute risk score with qsampling to get upper and lower bounds, see qsampling module
# https://zeroknowledgediscovery.github.io/quasinet/build/html/quasinet.html#quasinet.qsampling.qsample
avg_ha, min_ha, max_ha, var_ha = enet_ha.emergence_risk_qsampling(seq_df=df_ha, enet=enet_ha1, sample_size=1000, qsamples=10, steps=10)

print('Emergenet Risk Score:', avg_ha)
print('Bounds:', [min_ha, max_ha])
print('Variance:', var_ha)

Emergenet Risk Score: 0.020328876194469082
Bounds: [0.02001265632283635, 0.020721550600361946]
Variance: 1.6303481243028225e-05
CPU times: user 6min 55s, sys: 495 ms, total: 6min 55s
Wall time: 6min 55s


## NA Emergence Risk Score

In [19]:
# initialize the Enet with A/Ohio/13/2017 NA
enet_na = Enet(seq=DATA_DIR+'na_target_sequence.fasta', seq_trunc_length=449, random_state=42)

print('Initializing Enet with fasta file\n---------------------------------\n')
print(enet_na.seq_metadata)
print(enet_na.seq)
print('Length of target sequence:', len(enet_na.seq))

Initializing Enet with fasta file
---------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056652|
MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI
Length of target sequence: 469


In [20]:
# can also initialize with sequence rather than fasta file
enet_na_1 = Enet(seq='MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI',
                 seq_trunc_length=449, seq_metadata='A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|', random_state=42)

print('Initializing Enet with nucleotide sequence\n------------------------------------------\n')
print(enet_na_1.seq_metadata)
print(enet_na_1.seq)
print('Length of target sequence:', len(enet_na_1.seq))

Initializing Enet with nucleotide sequence
------------------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|
MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI
Length of target sequence: 469


In [21]:
# load fasta data
df_na = enet_na.load_data(filepath=DATA_DIR+'na_sequences.fasta')

print('Number of sequences:', len(df_na))
df_na.head()

Number of sequences: 12388


Unnamed: 0,id,sequence
0,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
1,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
2,A/Taiwan/80749/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
3,A/Taiwan/79429/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
4,A/Myanmar/18M219/2018|A_/_H3N2|$SEGMENT_NAME|2...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."


In [22]:
%%time
# train enet (automatically includes target sequence with df)
enet_na1 = enet_na.train(seq_df=df_na, sample_size=1000, n_jobs=1)
# save enet
save_model(enet=enet_na1, outfile=DATA_DIR+'na_enet.joblib')

CPU times: user 2min 26s, sys: 682 ms, total: 2min 27s
Wall time: 2min 27s


In [23]:
# load enet
enet_na1 = load_model(filepath=DATA_DIR+'na_enet.joblib')

In [24]:
%%time
# compute risk score
risk_score_na, variance_na = enet_na.emergence_risk(seq_df=df_na, enet=enet_na1, sample_size=1000)

print('Emergenet Risk Score:', risk_score_na)
print('Variance:', variance_na)

Emergenet Risk Score: 0.03050091990424366
Variance: 1.564767759048388e-05
CPU times: user 31.2 s, sys: 11.9 ms, total: 31.2 s
Wall time: 31.2 s


In [26]:
%%time
# compute risk score with qsampling to get upper and lower bounds, see qsampling module
# https://zeroknowledgediscovery.github.io/quasinet/build/html/quasinet.html#quasinet.qsampling.qsample
avg_na, min_na, max_na, var_na = enet_na.emergence_risk_qsampling(seq_df=df_na, enet=enet_na1, sample_size=1000, qsamples=10, steps=10)

print('Emergenet Risk Score:', avg_na)
print('Bounds:', [min_na, max_na])
print('Variance:', var_na)

Emergenet Risk Score: 0.030511130875083347
Bounds: [0.03021676107351499, 0.030865105872841463]
Variance: 1.5402245650034415e-05
CPU times: user 5min 25s, sys: 376 ms, total: 5min 25s
Wall time: 5min 25s


## Average Emergence Risk Score
- Emergence risk scores should match values in SI-Table 16 (with some variation due to sampling differences in training Enet)

In [28]:
# compute geometric mean emergence risk
import numpy as np
geom_mean_risk_score = np.sqrt(risk_score_ha * risk_score_na)
irat_emergence_prediction, irat_impact_prediction = irat_risk(risk_score_ha, risk_score_na)

print('Geometric Mean of HA and NA risk scores:', round(geom_mean_risk_score, 6))
print('Emergenet prediction of IRAT emergence estimate:', round(irat_emergence_prediction, 1))
print('IRAT emergence estimate: 6.6')
print('Emergenet prediction of IRAT impact estimate:', round(irat_impact_prediction, 1))
print('IRAT impact estimate: 5.8')

Geometric Mean of HA and NA risk scores: 0.024893
Emergenet prediction of IRAT emergence estimate: 6.3
IRAT emergence estimate: 6.6
Emergenet prediction of IRAT impact estimate: 6.4
IRAT impact estimate: 5.8
