In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
from Bio import SeqIO

from emergenet.emergenet import Enet, save_model, load_model

# Directory holding the example input/output files used throughout this notebook.
# Keep the trailing slash: file paths are built by plain string concatenation
# (DATA_DIR + 'name.ext'), not with os.path.join / pathlib.
DATA_DIR = 'example_data/'

In [2]:
# Build the Enet around a target sequence that is not currently circulating.
# The example target is A/Vietnam/1203/2004, an H5N1 sequence previously analyzed by IRAT.
enet = Enet(
    seq=DATA_DIR + 'target_sequence.fasta',
    seq_trunc_length=550,
    random_state=42,
)

# Show the parsed metadata and the sequence on separate lines, then its length.
print(enet.seq_metadata, enet.seq, sep='\n')
print('Length of target sequence:', len(enet.seq))

AAT73274.1 |hemagglutinin, partial [Influenza A virus (A/Viet Nam/1203/2004(H5N1))]|Viet Nam||hemagglutinin|
MEKIVLLFAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKKHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPVNDLCYPGDFNDYEELKHLLSRINHFEKIQIIPKSSWSSHEASLGVSSACPYQGKSSFFRNVVWLIKKNSTYPTIKRSYNNTNQEDLLVLWGIHHPNDAAEQTKLYQNPTTYISVGTSTLNQRLVPRIATRSKVNGQSGRMEFFWTILKPNDAINFESNGNFIAPEYAYKIVKKGDSTIMKSELEYGNCNTKCQTPMGAINSSMPFHNIHPLTIGECPKYVKSNRLVLATGLRNSPQRERRRKKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGIYQILSIYSTVASSLALAIMVAGLSLWMCSNGSLQCR
Length of target sequence: 565


In [3]:
# Load the comparison sequences from FASTA.
# Typical choice: human sequences of the same subtype collected within
# one year before the target sequence.
df = enet.load_data(
    filepath=DATA_DIR + 'sequences.fasta',
    outfile=DATA_DIR + 'sequences.csv',
)
print('Number of sequences:', len(df))

# Preview the first rows as the cell's displayed result.
df.head()

Number of sequences: 257


Unnamed: 0,id,sequence
0,A/Hunan/2/2009|A_/_H5N1|$SEGMENT_NAME|2009-01-...,"[M, E, K, I, V, L, L, L, A, I, V, S, L, V, R, ..."
1,A/Hunan/1/2009|A_/_H5N1|$SEGMENT_NAME|2009-01-...,"[M, E, K, I, V, L, L, L, A, I, V, S, L, V, K, ..."
2,A/Beijing/1/2009|A_/_H5N1|$SEGMENT_NAME|2008-1...,"[M, E, K, I, V, L, L, L, A, I, V, S, L, V, K, ..."
3,A/Jiangsu/2/2007|A_/_H5N1|$SEGMENT_NAME|2007-1...,"[M, E, K, I, V, L, L, L, A, I, V, S, L, V, K, ..."
4,A/Jiangsu/1/2007|A_/_H5N1|$SEGMENT_NAME|2007-1...,"[M, E, K, I, V, L, L, L, A, I, V, S, L, V, K, ..."


In [4]:
%%time
# train qnet (automatically includes target sequence with df sequences)
myqnet = enet.train(seq_df=df, n_jobs=1)

CPU times: user 7min, sys: 815 ms, total: 7min 1s
Wall time: 7min 1s


In [10]:
# Persist the trained qnet to disk so it can be reused without retraining.
save_model(
    qnet=myqnet,
    outfile=DATA_DIR + 'qnet.joblib',
)

In [11]:
# Read the saved qnet back from disk, rebinding myqnet to the loaded model.
# NOTE(review): the .joblib file is presumably pickle-backed — only load
# model files from a trusted source; confirm against load_model's implementation.
myqnet = load_model(
    filepath=DATA_DIR + 'qnet.joblib',
)

In [12]:
%%time
# compute emergence risk score
emergence_risk_score = enet.emergence_risk(seq_df=df, qnet=myqnet)
print(emergence_risk_score)

0.1684691325423223
CPU times: user 20.6 s, sys: 14 ms, total: 20.6 s
Wall time: 20.6 s
