# Compare the sizes of the different table implementation
1. KmerMasses approach of old
2. massive table
3. list of tables

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.types.objects import Spectrum
from src.spectra.gen_spectra import gen_spectrum
from src.utils import ppm_to_da
from pyteomics import fasta

import math
import sys
import time

from collections import defaultdict

In [2]:
fastafile = '/Users/zacharymcgrath/Documents/Layer_Research/hypedsearch/testing framework/data/databases/6000prots.fasta'


In [3]:
st = time.time()

bigtable = defaultdict(list)
# big table version
for i, entry in enumerate(fasta.read(fastafile)):
    seq = entry.sequence
    p_name = entry.description.split('|')[-1].split(' ')[0]
    
    print(f'On protein {i}/{6000}\r', end='')
    
    for i in range(3, 26):
        kmer_len = 25 if i + 25 < len(seq) else len(seq) - i
        kmer = seq[i:i+kmer_len]
        
        kmer_spec_b_s = gen_spectrum(kmer, ion='b', charge=1)['spectrum']
        kmer_spec_b_d = gen_spectrum(kmer, ion='b', charge=2)['spectrum']
        kmer_spec_y_s = gen_spectrum(kmer, ion='y', charge=1)['spectrum']
        kmer_spec_y_d = gen_spectrum(kmer, ion='y', charge=2)['spectrum']
        
        # add each m/z peak in the spectrum to the table
        for j in range(3, kmer_len + 1):

            # take the sequence left to right for b, and right to left for y
            subseq_b = kmer[:j]
            subseq_y = kmer[kmer_len-j:]

            # add singly and doubly entry for this sequence to the table respectively
            bigtable[math.floor(kmer_spec_b_s[j-1])] \
                .append((kmer_spec_b_s[j-1], 'bs', subseq_b, p_name))

            bigtable[math.floor(kmer_spec_b_d[j-1])] \
                .append((kmer_spec_b_d[j-1], 'bd', subseq_b, p_name))

            # add singly and doubly entry for this sequence to the table respectively
            bigtable[math.floor(kmer_spec_b_s[j-1])] \
                .append((kmer_spec_y_s[j-1], 'ys', subseq_y, p_name))

            bigtable[math.floor(kmer_spec_y_d[j-1])] \
                .append((kmer_spec_y_d[j-1], 'yd', subseq_y, p_name))
     
    
print(f'Size of big table is: {sys.getsizeof(bigtable)/1000}KB')
print(f'Time to build: {time.time() - st}')

Size of big table is: 147.576KB
Time to build: 17.727277994155884


In [4]:
st = time.time()

kmermasses = (defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list))
# big table version
for i, entry in enumerate(fasta.read(fastafile)):
    seq = entry.sequence
    p_name = entry.description.split('|')[-1].split(' ')[0]
    
    print(f'On protein {i}/{6000}\r', end='')
    
    for i in range(3, 26):
        kmer_len = 25 if i + 25 < len(seq) else len(seq) - i
        kmer = seq[i:i+kmer_len]
        
        kmer_spec_b_s = gen_spectrum(kmer, ion='b', charge=1)['spectrum']
        kmer_spec_b_d = gen_spectrum(kmer, ion='b', charge=2)['spectrum']
        kmer_spec_y_s = gen_spectrum(kmer, ion='y', charge=1)['spectrum']
        kmer_spec_y_d = gen_spectrum(kmer, ion='y', charge=2)['spectrum']
        
        # add each m/z peak in the spectrum to the table
        for j in range(3, kmer_len + 1):

            # take the sequence left to right for b, and right to left for y
            subseq_b = kmer[:j]
            subseq_y = kmer[kmer_len-j:]

            # add singly and doubly entry for this sequence to the table respectively
            kmermasses[0][math.floor(kmer_spec_b_s[j-1])] \
                .append((kmer_spec_b_s[j-1], 'bs', subseq_b, p_name))

            kmermasses[1][math.floor(kmer_spec_b_d[j-1])] \
                .append((kmer_spec_b_d[j-1], 'bd', subseq_b, p_name))

            # add singly and doubly entry for this sequence to the table respectively
            kmermasses[2][math.floor(kmer_spec_b_s[j-1])] \
                .append((kmer_spec_y_s[j-1], 'ys', subseq_y, p_name))

            kmermasses[3][math.floor(kmer_spec_y_d[j-1])] \
                .append((kmer_spec_y_d[j-1], 'yd', subseq_y, p_name))
     
    
print(f'Size of big table is: {sys.getsizeof(kmermasses)/1000}KB')
print(f'Time to build: {time.time() - st}')

Size of big table is: 0.088KB
Time to build: 26.988785982131958
