# Looking at a single realistic spectrum in hypedsearch
We want to see how changes to the input spectrum affects the results of hypedsearch and see what is causing issues

## 1. Make some data

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.file_io import fasta
from src.spectra.gen_spectra import gen_spectrum
from modules.sequence_generation import realistic_spectra
import matplotlib.pyplot as plt
import numpy as np


fastafile = '../../testing framework/data/databases/4prots.fasta'

database = fasta.read(fastafile, True)

database = {x['name']: x for x in database}


In [2]:
import random
peptideprotein = list(database.keys())[random.randint(0, len(database)-1)]
peptidelen = random.randint(6, 15)
peptidestart = random.randint(0, len(database[peptideprotein]['sequence']) - peptidelen)
peptideseq = database[peptideprotein]['sequence'][peptidestart:peptidestart+peptidelen]

print(peptideseq)

ADTFEHVIEELLD


In [3]:
peptidetheoreticalspectrum = gen_spectrum(peptideseq)
peptiderealisticspectrum = realistic_spectra.gen_realistic_spectra([peptideseq], DEBUG=True)[0]


Dropout rate: 76
Number of peaks remaining: 19/52
Count of remaining ions by type:
b+: 3 	 b++: 7 	 y+: 6 	 y++: 3
Remaing ion types by position:
b+: [0, 5, 8]
b++: [0, 1, 4, 6, 9, 11, 12]
y+: [0, 5, 6, 7, 9, 10]
y++: [5, 8, 9]


In [4]:

heights = np.mean(peptiderealisticspectrum.abundance)
filterheight = sorted(peptiderealisticspectrum.abundance, reverse=True)[24]
filteredindices = [i for i in range(len(peptiderealisticspectrum.abundance)) if peptiderealisticspectrum.abundance[i] >= filterheight]
filteredspectra = [peptiderealisticspectrum.spectrum[i] for i in filteredindices]
filteredabundances = [peptiderealisticspectrum.abundance[i] for i in filteredindices]


In [5]:
# plt.figure(figsize=(18, 8))
# bs = gen_spectrum(peptideseq, ion='b', charge=1)['spectrum']
# overlappedbs = []
# bremove = []
# fremove = []
# for b in bs:
#     for f in filteredspectra:
#         if b - .005 <= f <= b + .005:
#             overlappedbs.append(b)
#             bremove.append(b)
#             fremove.append(f)
            
# bs = [x for x in bs if x not in bremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped b+ ions: {len(overlappedbs)}')

# plt.bar(bs, [heights for _ in range(len(bs))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedbs, [heights for _ in range(len(overlappedbs))], color='g', width=4)

In [6]:
# plt.figure(figsize=(18, 8))
# bd = gen_spectrum(peptideseq, ion='b', charge=2)['spectrum']
# overlappedbd = []
# bremove = []
# fremove = []
# for b in bd:
#     for f in filteredspectra:
#         if b - .005 <= f <= b + .005:
#             overlappedbd.append(b)
#             bremove.append(b)
#             fremove.append(f)
            
# bd = [x for x in bd if x not in bremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped b++ ions: {len(overlappedbd)}')

# plt.bar(bd, [heights for _ in range(len(bd))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedbd, [heights for _ in range(len(overlappedbd))], color='g', width=4)

In [7]:
# plt.figure(figsize=(18, 8))
# ys = gen_spectrum(peptideseq, ion='y', charge=1)['spectrum']
# overlappedys = []
# yremove = []
# fremove = []
# for y in ys:
#     for f in filteredspectra:
#         if y - .005 <= f <= y + .005:
#             overlappedys.append(y)
#             yremove.append(y)
#             fremove.append(f)
            
# ys = [x for x in ys if x not in yremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped y+ ions: {len(overlappedys)}')

# plt.bar(ys, [heights for _ in range(len(ys))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedys, [heights for _ in range(len(overlappedys))], color='g', width=4)

In [8]:
# plt.figure(figsize=(18, 8))
# yd = gen_spectrum(peptideseq, ion='y', charge=2)['spectrum']
# overlappedyd = []
# yremove = []
# fremove = []
# for y in yd:
#     for f in filteredspectra:
#         if y - .005 <= f <= y + .005:
#             overlappedyd.append(y)
#             yremove.append(y)
#             fremove.append(f)
            
# yd = [x for x in yd if x not in yremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped y++ ions: {len(overlappedyd)}')

# plt.bar(yd, [heights for _ in range(len(yd))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedyd, [heights for _ in range(len(overlappedyd))], color='g', width=4)

## Generate an mzml file

In [9]:
from modules.sequence_generation import write_spectra

write_spectra.write_mzml('../data/spectra/singleSpectraAnalysis', [dict(peptiderealisticspectrum._asdict())])

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


1


'./../data/spectra/singleSpectraAnalysis.mzML'

## Run hypedsearch

In [10]:
from src import runner

runner.run({
    'spectra_folder': '../data/spectra/',
    'database_file': fastafile, 
    'output_dir': '../data/testing_output/singleSpectraAnalysis/', 
    'min_peptide_len': 1, 
    'max_peptide_len': 16,
    'verbose': True
})

Loading database...
Adding protein 1/4 to tree
Done.
Building hashes for kmers...
Indexing database for k=16...
1748 unique kmers
Done
Looking at kmer 1748/1748
Done.
Analyzing spectra file 1/1[0%]

Analyzing spectrum 1/1[0%]
Finished search. Writting results to ../data/testing_output/singleSpectraAnalysis/...


## Load the summary json

In [13]:
import json

summ = json.load(open('../data/testing_output/singleSpectraAnalysis/summary.json', 'r'))
print(summ)

{'../data/spectra/singleSpectraAnalysis.mzML_0': {'spectrum': {'spectrum': [15.886632872006444, 16.23639204788584, 19.453084590002558, 29.856061209480036, 33.86341595851735, 36.53137895481262, 72.04006140486453, 90.80004505675907, 94.03512790418299, 105.94154063820359, 134.04538402908895, 140.32041630502974, 160.87112735278225, 164.17201353434965, 171.51557064151996, 186.5389906365378, 190.7146832237549, 200.01878567863326, 242.48405601993827, 251.2178768577833, 269.1036246210805, 271.9980015285035, 279.7629506790956, 282.6175560516119, 285.6748958977916, 305.7105822206752, 352.3919718816437, 366.1931764301051, 380.77604074198496, 386.8957274148725, 391.841782821657, 400.67810676228754, 408.7436509807204, 454.8011164066068, 465.9857838343182, 490.5413799216976, 496.759520300429, 510.9372027732763, 532.7737641294797, 544.2186715521628, 548.7765155342721, 586.2667617267181, 614.8285342755448, 615.3766002562344, 622.3142311551458, 633.68938590504, 635.4935765732547, 661.9693669694699, 667

In [12]:
print(summ['../data/spectra/singleSpectraAnalysis.mzML_0']['alignments'][0]['sequence'] == peptideseq)

IndexError: list index out of range