# Example Using emergenet.emergenet
- Compares the risk assessment from IRAT with the risk assessment from Emergenet
- Analyzes HA and NA segments of A/Ohio/13/2017 (H3N2), which was evaluated by IRAT in July 2019
- Collected Human HA and NA strains within one year prior to IRAT analysis (July 1, 2018 - June 30, 2019)

In [17]:
%%capture
!pip install emergenet --upgrade

In [1]:
from emergenet.emergenet import Enet, save_model, load_model

# directory with the example FASTA inputs; trained Enet models are saved here as well
DATA_DIR = 'example_data/emergenet/'

## HA Emergence Risk Score

In [2]:
# build the HA Enet from a fasta file holding a target sequence that is not currently circulating
enet_ha = Enet(
    seq=DATA_DIR + 'ha_target_sequence.fasta',
    seq_trunc_length=550,
    random_state=42,
)

# echo the metadata and sequence that were read from the fasta file
print('Initializing Enet with fasta file\n---------------------------------\n')
print(enet_ha.seq_metadata, enet_ha.seq, sep='\n')
print(f'Length of target sequence: {len(enet_ha.seq)}')

Initializing Enet with fasta file
---------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|
MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSFSTGEICNSPYQILDGENCTLIDALLGDPQCDGFQNNKWDLFVERSKAHSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQDGASSSCKRRSSNSFFSRLNWLTHLNFKYPALEVTMPNNEQFDKLYIWGVHHPATDKDQISLYAQAAGRIIVSTKRNQQAVIPNIGSRPRVRDIPSRISIYWTIVRPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSACITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNIPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSDVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRNEALNNRFQIKGVELKSEYKDWILWISFAISCFLLCVALLGFIMWACQKGNIKCNICI
Length of target sequence: 566


In [3]:
# can also initialize with the amino acid sequence itself rather than a fasta file;
# in that case the metadata string is passed explicitly through seq_metadata
enet_ha_1 = Enet(seq='MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSFSTGEICNSPYQILDGENCTLIDALLGDPQCDGFQNNKWDLFVERSKAHSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQDGASSSCKRRSSNSFFSRLNWLTHLNFKYPALEVTMPNNEQFDKLYIWGVHHPATDKDQISLYAQAAGRIIVSTKRNQQAVIPNIGSRPRVRDIPSRISIYWTIVRPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSACITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNIPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSDVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRNEALNNRFQIKGVELKSEYKDWILWISFAISCFLLCVALLGFIMWACQKGNIKCNICI',
               seq_trunc_length=550, seq_metadata='A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|', random_state=42)

# the target is a protein (amino acid) sequence, so the banner labels it as such rather than as nucleotides
print('Initializing Enet with amino acid sequence\n------------------------------------------\n')
print(enet_ha_1.seq_metadata)
print(enet_ha_1.seq)
print('Length of target sequence:', len(enet_ha_1.seq))

Initializing Enet with nucleotide sequence
------------------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|
MKTIIALSHILCLVFAQKLPGNDNNMATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSFSTGEICNSPYQILDGENCTLIDALLGDPQCDGFQNNKWDLFVERSKAHSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQDGASSSCKRRSSNSFFSRLNWLTHLNFKYPALEVTMPNNEQFDKLYIWGVHHPATDKDQISLYAQAAGRIIVSTKRNQQAVIPNIGSRPRVRDIPSRISIYWTIVRPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSACITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNIPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSDVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRNEALNNRFQIKGVELKSEYKDWILWISFAISCFLLCVALLGFIMWACQKGNIKCNICI
Length of target sequence: 566


In [4]:
# read the HA strains from the fasta file into a dataframe
df_ha = enet_ha.load_data(filepath=DATA_DIR + 'ha_sequences.fasta')

print(f'Number of sequences: {len(df_ha)}')
df_ha.head()

Number of sequences: 12389


Unnamed: 0,id,sequence
0,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
1,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
2,A/Taiwan/80749/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
3,A/Taiwan/79429/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."
4,A/Myanmar/18M219/2018|A_/_H3N2|$SEGMENT_NAME|2...,"[M, K, T, I, I, A, L, S, Y, I, L, C, L, V, F, ..."


In [5]:
%%time
# fit the HA enet (the target sequence is automatically included with the df sequences)
enet_ha1 = enet_ha.train(
    seq_df=df_ha,
    sample_size=1000,
    n_jobs=1,
)
# write the trained enet to disk
save_model(enet=enet_ha1, outfile=DATA_DIR + 'ha_enet.joblib')

CPU times: user 3min 25s, sys: 1.56 s, total: 3min 27s
Wall time: 3min 40s


In [6]:
# read the saved HA enet back from the joblib file written by the training cell
enet_ha1 = load_model(filepath=DATA_DIR + 'ha_enet.joblib')

In [7]:
%%time
# emergence risk score (and its variance) of the HA target against the loaded strains
emergence_risk_score_ha, variance_ha = enet_ha.emergence_risk(
    seq_df=df_ha,
    enet=enet_ha1,
    sample_size=1000,
)

print('Emergence Risk Score:', emergence_risk_score_ha)
print('Variance:', variance_ha)

Emergence Risk Score: 0.02031589645039965
Variance: 1.673019568927244e-05
CPU times: user 33.3 s, sys: 123 ms, total: 33.4 s
Wall time: 33.4 s


In [8]:
%%time
# emergence risk score with qsampling, which also yields lower and upper bounds; see the qsampling module:
# https://zeroknowledgediscovery.github.io/quasinet/build/html/quasinet.html#quasinet.qsampling.qsample
avg_ha, min_ha, max_ha, var_ha = enet_ha.emergence_risk_qsampling(
    seq_df=df_ha,
    enet=enet_ha1,
    sample_size=1000,
    qsamples=10,
    steps=10,
)

print('Emergence Risk Score:', avg_ha)
print('Bounds:', [min_ha, max_ha])
print('Variance:', var_ha)

Emergence Risk Score: 0.02032809780571675
Bounds: [0.02000448302209821, 0.02072229924271158]
Variance: 1.630070886429651e-05
CPU times: user 5min 32s, sys: 316 ms, total: 5min 33s
Wall time: 5min 33s


## NA Emergence Risk Score

In [9]:
# build the NA Enet from a fasta file holding a target sequence that is not currently circulating
enet_na = Enet(
    seq=DATA_DIR + 'na_target_sequence.fasta',
    seq_trunc_length=449,
    random_state=42,
)

# echo the metadata and sequence that were read from the fasta file
print('Initializing Enet with fasta file\n---------------------------------\n')
print(enet_na.seq_metadata, enet_na.seq, sep='\n')
print(f'Length of target sequence: {len(enet_na.seq)}')

Initializing Enet with fasta file
---------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056652|
MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI
Length of target sequence: 469


In [10]:
# can also initialize with the amino acid sequence itself rather than a fasta file;
# in that case the metadata string is passed explicitly through seq_metadata.
# The accession is EPI1056652, matching the header of the NA target fasta file
# (EPI1056653 is the accession of the HA segment).
enet_na_1 = Enet(seq='MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI',
                 seq_trunc_length=449, seq_metadata='A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056652|', random_state=42)

# the target is a protein (amino acid) sequence, so the banner labels it as such rather than as nucleotides
print('Initializing Enet with amino acid sequence\n------------------------------------------\n')
print(enet_na_1.seq_metadata)
print(enet_na_1.seq)
print('Length of target sequence:', len(enet_na_1.seq))

Initializing Enet with nucleotide sequence
------------------------------------------

A/Ohio/13/2017|A_/_H3N2|$SEGMENT_NAME|2017-07-14|EPI1056653|
MNPNQKIITIGSVSLIIATICFLMQIAILVTTITLHFKQHNCDSSPNNHVMFCEPTIIERNKTEIVYLTNTTVEKEICPKPTEYRNWSKPQCNITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDQDKCYQFALGQGTTLNNGHSNDTVHDRTPYRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASLIYNGRLVDSIGSWSKNILRTQESECVCINGTCTVVMTDGSASGKADTKILFIEEGKIIHISTLSGSAQHVEECSCYPRYSGVRCVCRDNWKGSNRPIVDINVKDYSTVSSYICSGLVGDTPRKNDSFSSSNCLDPNNEEGGHGVKGWAFDDGNDLWMGRTISEKSRLGYETFKVVKGWSEPNSKLQSNRQVIVDRGNRSGYSGIFSIEGKNCINRCFYVELIRGRKEETKVLWTSNSIVVFCGTSGTYGTGSWPDGADINLMPI
Length of target sequence: 469


In [11]:
# read the NA strains from the fasta file into a dataframe
df_na = enet_na.load_data(filepath=DATA_DIR + 'na_sequences.fasta')

print(f'Number of sequences: {len(df_na)}')
df_na.head()

Number of sequences: 12388


Unnamed: 0,id,sequence
0,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
1,A/Taiwan/79440/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
2,A/Taiwan/80749/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
3,A/Taiwan/79429/2018|A_/_H3N2|$SEGMENT_NAME|201...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."
4,A/Myanmar/18M219/2018|A_/_H3N2|$SEGMENT_NAME|2...,"[M, N, P, N, Q, K, I, I, T, I, G, S, V, S, L, ..."


In [12]:
%%time
# fit the NA enet (the target sequence is automatically included with the df sequences)
enet_na1 = enet_na.train(
    seq_df=df_na,
    sample_size=1000,
    n_jobs=1,
)
# write the trained enet to disk
save_model(enet=enet_na1, outfile=DATA_DIR + 'na_enet.joblib')

CPU times: user 2min 3s, sys: 366 ms, total: 2min 4s
Wall time: 2min 4s


In [13]:
# read the saved NA enet back from the joblib file written by the training cell
enet_na1 = load_model(filepath=DATA_DIR + 'na_enet.joblib')

In [14]:
%%time
# emergence risk score (and its variance) of the NA target against the loaded strains
emergence_risk_score_na, variance_na = enet_na.emergence_risk(
    seq_df=df_na,
    enet=enet_na1,
    sample_size=1000,
)

print('Emergence Risk Score:', emergence_risk_score_na)
print('Variance:', variance_na)

Emergence Risk Score: 0.030500919904243758
Variance: 1.5647677590483994e-05
CPU times: user 25.1 s, sys: 9 ms, total: 25.1 s
Wall time: 25.1 s


In [15]:
%%time
# emergence risk score with qsampling, which also yields lower and upper bounds; see the qsampling module:
# https://zeroknowledgediscovery.github.io/quasinet/build/html/quasinet.html#quasinet.qsampling.qsample
avg_na, min_na, max_na, var_na = enet_na.emergence_risk_qsampling(
    seq_df=df_na,
    enet=enet_na1,
    sample_size=1000,
    qsamples=10,
    steps=10,
)

print('Emergence Risk Score:', avg_na)
print('Bounds:', [min_na, max_na])
print('Variance:', var_na)

Emergence Risk Score: 0.03050774244062029
Bounds: [0.030194376467103805, 0.030851319190176453]
Variance: 1.541896855842682e-05
CPU times: user 4min 21s, sys: 45.6 ms, total: 4min 21s
Wall time: 4min 21s


## Average Emergence Risk Score
- Emergence risk scores should match values in SI-Table 16 (with some variation due to sampling differences in training Enet)

In [16]:
# compute average emergence risk (recall we used 1000 samples in each)
# the "average" is the geometric mean of the HA and NA scores: sqrt(ha * na)
import numpy as np
emergence_risk_score_avg = np.sqrt(emergence_risk_score_ha * emergence_risk_score_na)

# the Emergenet estimate is rounded to 6 decimals for display;
# the IRAT figure below is a hard-coded literal, not something computed in this notebook
print('Emergenet potential emergence estimate:', round(emergence_risk_score_avg, 6))
print('IRAT potential emergence estimate: 6.6')

Emergenet potential emergence estimate: 0.024893
IRAT potential emergence estimate: 6.6
