# Test the prefix tree and suffix tree for time and memory usage

In [1]:
from suffix_tree import Tree
from datrie import Trie, BaseTrie
import string


In [2]:
# load a protein database
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.file_io import fasta
import time

fasta_file = '../../testing framework/data/databases/1000prots.fasta'
database = fasta.read(fasta_file, True)

database = {x['name']: x for x in database}
dblen = len(database)

In [3]:
# test the speed of building and the resulting size of the suffix tree
st = time.time()
su_t = Tree()
i = 0
for key, value in database.items():
    i += 1
    print(f'{i}/{dblen}\r', end='')
    su_t.add(key, value['sequence'])
print(f'Done. Time to build suffix tree: {time.time() - st}')
print(f'space used: {sys.getsizeof(su_t)}B')


Done. Time to build suffix tree: 527.5714259147644
space used: 64B


In [18]:
# test speed of building and resulting size of the prefix tree
min_len = 3
max_len = 35
st = time.time()
pt = Trie(string.ascii_uppercase)
i = 0
for key, value in database.items():
    i += 1
    print(f'{i}/{dblen}\r', end='')
    for j in range(len(value['sequence'])-min_len):
        subseqlen = max_len if j + max_len < len(value['sequence']) -1 else len(value['sequence']) - j
        pt[value['sequence'][j:j+subseqlen]] = key
print(f'Done. Time to build prefix tree: {time.time() - st}')
print(f'space used: {sys.getsizeof(pt)}B')


Done. Time to build prefix tree: 100.74237513542175
space used: 80B


In [12]:
# test speed of building and resulting size of the base prefix tree
st = time.time()
bpt = BaseTrie(string.ascii_uppercase)
savep = {}
i = 0
for key, value in database.items():
    i += 1 
    print(f'{i}/{dblen}\r', end='')
    bpt[value['sequence']] = i
    savep[i] = key
print(f'Done. Time to build prefix tree: {time.time() - st}')
print(f'space used: {sys.getsizeof(bpt)}B for trie and {sys.getsizeof(savep)}B for protein dict')


1/10002/10003/10004/10005/10006/10007/10008/10009/100010/100011/100012/100013/100014/100015/100016/100017/100018/100019/100020/100021/100022/100023/100024/100025/100026/100027/100028/100029/100030/100031/100032/100033/100034/100035/100036/100037/100038/100039/100040/100041/100042/100043/100044/100045/100046/100047/100048/100049/100050/100051/100052/100053/100054/100055/100056/100057/100058/100059/100060/100061/100062/100063/100064/100065/100066/100067/100068/100069/100070/100071/100072/100073/100074/100075/100076/100077/100078/100079/100080/100081/100082/100083/100084/100085/100086/100087/100088/100089/100090/100091/100092/100093/100094/100095/100096/100097/100098/100099/1000100/1000101/1000102/1000103/1000104/1000105/1000106/1000107/1000108/1000109/1000110/1000111/1000112/1000113/1000114/1000115/1000116/1000117/1000118/1000119/1000120/1000121/1000122/1000123/10001

## Make sure that we get the same results

In [32]:
print(su_t.find_all('MALWA'))
print(pt.has_keys_with_prefix('MALWA'))
print(pt.values('MALWA'))
print(database[list(database.keys())[0]])


[('AGAL_MOUSE', <suffix_tree.util.Path object at 0x7fcb52f34f50>)]
True
['AGAL_MOUSE']
{'name': 'TECR_MOUSE', 'sequence': 'MKHYEVEIRDAKTREKLCFLDKVEPQATISEIKTLFTKTHPQWYPARQSLRLDPKGKSLKDEDVLQKLPVGTTATLYFRDLGAQISWVTVFLTEYAGPLFIYLLFYFRVPFIYGRKYDFTSSRHTVVHLACMCHSFHYIKRLLETLFVHRFSHGTMPLRNIFKNCTYYWGFAAWMAYYINHPLYTPPTYGVQQVKLALAVFVICQLGNFSIHMALRDLRPAGSKTRKIPYPTKNPFTWLFLLVSCPNYTYEVGSWIGFAILTQCVPVALFSLVGFTQMTIWAKGKHRSYLKEFRDYPPLRMPIIPFLL', 'identifier': 'Q9CY27', 'human_readable_name': 'Very-long-chain enoyl-CoA reductase'}


## Results
Don't use the base trie. Trash for trying to save proteins since we have to keep the dictionary
