# Test the prefix tree and suffix tree for time and memory usage

In [1]:
from suffix_tree import Tree
from datrie import Trie, BaseTrie
import string


In [2]:
# Load the protein database.
import os
import sys

# Add the parent and grandparent directories to the import path, presumably so
# that the `src` package imported below can be found — confirm against the
# repository layout.
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.file_io import fasta
import time

# Location of the FASTA file. Set the NOD2_FASTA_FILE environment variable to
# run the notebook on another machine; without it the original local path is
# used, so existing runs behave exactly as before.
fasta_file = os.environ.get(
    'NOD2_FASTA_FILE',
    '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
)
# NOTE(review): the meaning of the second argument (True) is defined by
# fasta.read, which is not visible here — confirm.
entries = fasta.read(fasta_file, True)

# Index the entries by protein name so a protein can be looked up directly.
# Entries that share a name collapse into one, so dblen counts unique names.
database = {entry['name']: entry for entry in entries}
dblen = len(database)

In [3]:
# Benchmark the suffix tree: wall-clock build time and reported object size.
build_start = time.time()
su_t = Tree()
for count, (name, entry) in enumerate(database.items(), start=1):
    # '\r' returns to the start of the line so the counter overwrites itself
    progress = f'{count}/{dblen}\r'
    print(progress, end='')
    su_t.add(name, entry['sequence'])
elapsed = time.time() - build_start
print(f'Done. Time to build suffix tree: {elapsed}')
# sys.getsizeof only accounts for the Tree object itself, not for the objects
# it refers to, so this figure does not reflect the memory held by the tree.
tree_bytes = sys.getsizeof(su_t)
print(f'space used: {tree_bytes}B')


Done. Time to build suffix tree: 34.500792026519775
space used: 64B


In [4]:
# Test the speed of building, and the resulting size of, the prefix tree.
#
# Every start position in every protein sequence contributes one key: the
# subsequence that begins there, capped at max_len characters. Start positions
# that leave fewer than min_len characters are skipped.
min_len = 3
max_len = 35
st = time.time()
pt = Trie(string.ascii_uppercase)
i = 0
for key, value in database.items():
    i += 1
    print(f'{i}/{dblen}\r', end='')
    sequence = value['sequence']
    # "+ 1" includes the last start position, whose suffix is exactly min_len
    # long; without it that final subsequence was never inserted.
    for j in range(len(sequence) - min_len + 1):
        # Slicing stops at the end of the string, so no explicit length
        # calculation is needed. The previous calculation produced a key of
        # max_len + 1 characters when j + max_len == len(sequence) - 1.
        # NOTE(review): assignment replaces any earlier value, so a subsequence
        # shared by several proteins ends up mapped only to the last protein
        # inserted — confirm this is acceptable.
        pt[sequence[j:j + max_len]] = key
print(f'Done. Time to build prefix tree: {time.time() - st}')
# sys.getsizeof only accounts for the Trie object itself, not for the data it
# refers to, so this figure does not reflect the memory held by the trie.
print(f'space used: {sys.getsizeof(pt)}B')


Done. Time to build prefix tree: 5.182687044143677
space used: 80B


In [5]:
# Benchmark the base prefix tree: wall-clock build time and reported sizes.
# Each key is given an integer value, and savep maps that integer back to the
# protein name.
# NOTE(review): only the full sequence of each protein is used as a key here,
# so this timing is not directly comparable with a build that inserts every
# subsequence.
build_start = time.time()
bpt = BaseTrie(string.ascii_uppercase)
savep = {}
for number, (name, entry) in enumerate(database.items(), start=1):
    print(f'{number}/{dblen}\r', end='')
    bpt[entry['sequence']] = number
    savep[number] = name
elapsed = time.time() - build_start
print(f'Done. Time to build prefix tree: {elapsed}')
# sys.getsizeof only accounts for each object itself, not for what it refers
# to, so neither figure reflects the total memory in use.
trie_bytes = sys.getsizeof(bpt)
lookup_bytes = sys.getsizeof(savep)
print(f'space used: {trie_bytes}B for trie and {lookup_bytes}B for protein dict')


Done. Time to build prefix tree: 0.03750896453857422
space used: 72B for trie and 9328B for protein dict


## Make sure that we get the same results

In [7]:
# Run the same query against the suffix tree and the prefix tree, then show
# the first database entry for reference.
query = 'VAFE'

suffix_hits = su_t.find_all(query)
print(suffix_hits)

has_prefix = pt.has_keys_with_prefix(query)
print(has_prefix)

prefix_values = pt.values(query)
print(prefix_values)

first_name = list(database)[0]
print(database[first_name])


[('LEG1_MOUSE', <suffix_tree.util.Path object at 0x7fdc0fcbf950>)]
True
['LEG1_MOUSE']
{'name': 'LEG1_MOUSE', 'sequence': 'MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHFNPRFNAHGDANTIVCNTKEDGTWGTEHREPAFPFQPGSITEVCITFDQADLTIKLPDGHEFKFPNRLNMEAINYMAADGDFKIKCVAFE', 'identifier': 'P16045', 'human_readable_name': 'Galectin-1'}


## Results
Don't use the base trie. It is a poor fit for storing proteins, since we still have to keep a separate dictionary (`savep`) to map its integer values back to protein names.
