# Looking at a single realistic spectrum in hypedsearch
We want to see how changes to the input spectrum affect the results of hypedsearch, and identify what is causing issues.

## 1. Make some data

In [1]:
import os
import sys
# Make both the parent and the grandparent directory importable so that the
# `src.*` and `modules.*` packages imported below can be resolved.
# Each directory is appended at most once, so re-running the cell is harmless.
for relative_root in ('..', '../..'):
    module_path = os.path.abspath(relative_root)
    if module_path not in sys.path:
        sys.path.append(module_path)
    
from src.file_io import fasta
from src.spectra.gen_spectra import gen_spectrum
from modules.sequence_generation import realistic_spectra
import matplotlib.pyplot as plt
import numpy as np


# Path to the FASTA database; it is also handed to the hypedsearch runner later
# in the notebook.
fastafile = '../../testing framework/data/databases/4prots.fasta'

# Read the entries and index them by their 'name' field so a protein can be
# looked up directly. The second positional argument (True) is forwarded to
# fasta.read unchanged -- NOTE(review): its meaning is defined in
# src.file_io.fasta, confirm there.
database = {entry['name']: entry for entry in fasta.read(fastafile, True)}


In [2]:
import random
# Set to an integer to make the peptide draw repeatable across kernel restarts.
# None (the default) leaves the global `random` state untouched, i.e. a
# different peptide is drawn on every run, exactly as before.
PEPTIDE_SEED = None
if PEPTIDE_SEED is not None:
    random.seed(PEPTIDE_SEED)

# Pick one protein (by name) uniformly at random from the database.
proteinnames = list(database.keys())
peptideprotein = proteinnames[random.randint(0, len(proteinnames) - 1)]
proteinsequence = database[peptideprotein]['sequence']

# Draw a peptide length in 6..15 (inclusive). Clamp it to the protein length:
# without the clamp, a protein shorter than the drawn length makes the start
# position draw below call randint(0, <negative>) and raise ValueError.
peptidelen = min(random.randint(6, 15), len(proteinsequence))

# Choose a start offset such that the whole peptide fits inside the protein.
peptidestart = random.randint(0, len(proteinsequence) - peptidelen)
peptideseq = proteinsequence[peptidestart:peptidestart + peptidelen]

print(peptideseq)

GQQRSPSESSCSSLT


In [3]:
# Theoretical spectrum of the chosen peptide, using gen_spectrum's defaults.
peptidetheoreticalspectrum = gen_spectrum(peptideseq)

# gen_realistic_spectra is given a one-element list of sequences; keep the
# first (only) entry of what it returns. Later cells read `.spectrum`,
# `.abundance` and `._asdict()` from this object.
# NOTE(review): DEBUG=True presumably triggers the dropout / remaining-ion
# report shown in this cell's output -- confirm in realistic_spectra.
realisticspectra = realistic_spectra.gen_realistic_spectra([peptideseq], DEBUG=True)
peptiderealisticspectrum = realisticspectra[0]


Dropout rate: 60
Number of peaks remaining: 29/60
Count of remaining ions by type:
b+: 6 	 b++: 6 	 y+: 10 	 y++: 7
Remaing ion types by position:
b+: [4, 6, 7, 8, 12, 13]
b++: [3, 4, 5, 9, 12, 13]
y+: [1, 2, 4, 5, 7, 8, 9, 11, 12, 14]
y++: [1, 4, 5, 7, 10, 13, 14]


In [4]:

# Number of most-abundant peaks used to derive the abundance cutoff.
TOP_N_PEAKS = 25

abundances = peptiderealisticspectrum.abundance
masses = peptiderealisticspectrum.spectrum

# Mean abundance; the (currently commented-out) plotting cells below use it as
# a uniform bar height.
heights = np.mean(abundances)

# Cutoff = abundance of the TOP_N_PEAKS-th tallest peak. The rank is clamped to
# the number of peaks available so a spectrum with fewer than TOP_N_PEAKS peaks
# keeps every peak instead of raising IndexError (the previous hard-coded [24]).
rankedabundances = sorted(abundances, reverse=True)
cutoffrank = min(TOP_N_PEAKS, len(rankedabundances))
filterheight = rankedabundances[cutoffrank - 1] if cutoffrank else 0

# Keep every peak at least as tall as the cutoff. Ties at the cutoff are all
# kept, so more than TOP_N_PEAKS peaks can survive.
filteredindices = [i for i, abundance in enumerate(abundances) if abundance >= filterheight]
filteredspectra = [masses[i] for i in filteredindices]
filteredabundances = [abundances[i] for i in filteredindices]


In [5]:
# plt.figure(figsize=(18, 8))
# bs = gen_spectrum(peptideseq, ion='b', charge=1)['spectrum']
# overlappedbs = []
# bremove = []
# fremove = []
# for b in bs:
#     for f in filteredspectra:
#         if b - .005 <= f <= b + .005:
#             overlappedbs.append(b)
#             bremove.append(b)
#             fremove.append(f)
            
# bs = [x for x in bs if x not in bremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped b+ ions: {len(overlappedbs)}')

# plt.bar(bs, [heights for _ in range(len(bs))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedbs, [heights for _ in range(len(overlappedbs))], color='g', width=4)

In [6]:
# plt.figure(figsize=(18, 8))
# bd = gen_spectrum(peptideseq, ion='b', charge=2)['spectrum']
# overlappedbd = []
# bremove = []
# fremove = []
# for b in bd:
#     for f in filteredspectra:
#         if b - .005 <= f <= b + .005:
#             overlappedbd.append(b)
#             bremove.append(b)
#             fremove.append(f)
            
# bd = [x for x in bd if x not in bremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped b++ ions: {len(overlappedbd)}')

# plt.bar(bd, [heights for _ in range(len(bd))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedbd, [heights for _ in range(len(overlappedbd))], color='g', width=4)

In [7]:
# plt.figure(figsize=(18, 8))
# ys = gen_spectrum(peptideseq, ion='y', charge=1)['spectrum']
# overlappedys = []
# yremove = []
# fremove = []
# for y in ys:
#     for f in filteredspectra:
#         if y - .005 <= f <= y + .005:
#             overlappedys.append(y)
#             yremove.append(y)
#             fremove.append(f)
            
# ys = [x for x in ys if x not in yremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped y+ ions: {len(overlappedys)}')

# plt.bar(ys, [heights for _ in range(len(ys))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedys, [heights for _ in range(len(overlappedys))], color='g', width=4)

In [8]:
# plt.figure(figsize=(18, 8))
# yd = gen_spectrum(peptideseq, ion='y', charge=2)['spectrum']
# overlappedyd = []
# yremove = []
# fremove = []
# for y in yd:
#     for f in filteredspectra:
#         if y - .005 <= f <= y + .005:
#             overlappedyd.append(y)
#             yremove.append(y)
#             fremove.append(f)
            
# yd = [x for x in yd if x not in yremove]
# filtered = [x for x in filteredspectra if x not in fremove]

# print(f'overlapped y++ ions: {len(overlappedyd)}')

# plt.bar(yd, [heights for _ in range(len(yd))], width=4)
# plt.bar(filtered, [heights for _ in range(len(filtered))], color='r', width=4)
# plt.bar(overlappedyd, [heights for _ in range(len(overlappedyd))], color='g', width=4)

## Generate an mzML file

In [9]:
from modules.sequence_generation import write_spectra

# Convert the spectrum object to a plain dict via its _asdict() method;
# write_mzml is handed a list of such dicts plus an output path without an
# extension.
spectrumrecord = dict(peptiderealisticspectrum._asdict())

# Kept as the last expression so the value returned by write_mzml is still
# displayed as the cell output.
write_spectra.write_mzml('../data/spectra/singleSpectraAnalysis', [spectrumrecord])

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


1


'./../data/spectra/singleSpectraAnalysis.mzML'

## Run hypedsearch

In [10]:
from src import runner

# Arguments for the hypedsearch run. The spectra folder is the directory the
# mzML file was written to above, and the database is the same FASTA file that
# the peptide was sampled from.
# NOTE(review): the unit of 'tolerance' is not visible from this notebook --
# confirm in src.runner.
runargs = {
    'spectra_folder': '../data/spectra/',
    'database_file': fastafile,
    'output_dir': '../data/testing_output/singleSpectraAnalysis/',
    'min_peptide_len': 3,
    'max_peptide_len': 16,
    'tolerance': 20,
    'verbose': True
}

runner.run(runargs)

Loading database...
Adding protein 1/4 to tree
Done.
Building hashes for kmers...
Indexing database for k=16...
1748 unique kmers
Done
Looking at kmer 1747/1748
Done.
Analyzing spectra file 1/1[0%]

sequences with in +/- 0.05 of 100.77106719077926
[]
sequences with in +/- 0.05 of 117.0829590530235
[]
sequences with in +/- 0.05 of 143.67297237626792
[]
sequences with in +/- 0.05 of 143.67297237626792
[]
sequences with in +/- 0.05 of 148.2269003538997
[]
sequences with in +/- 0.05 of 148.2269003538997
[]
sequences with in +/- 0.05 of 149.65753648399993
[MassSequence(mass=149.609897435, sequence='LLA'), MassSequence(mass=149.609897435, sequence='LAL'), MassSequence(mass=149.609897435, sequence='ALL'), MassSequence(mass=149.609897435, sequence='ILA')]
sequences with in +/- 0.05 of 149.65753648399993
[]
sequences with in +/- 0.05 of 153.59049728835575
[MassSequence(mass=153.54135443500002, sequence='CDS'), MassSequence(mass=153.550668435, sequence='CVC'), MassSequence(mass=153.559546935, se

## Load the summary json

In [11]:
import json

# Load the summary written by the hypedsearch run above. The `with` block
# closes the file handle as soon as parsing finishes; the previous
# json.load(open(...)) left it open until garbage collection.
summarypath = '../data/testing_output/singleSpectraAnalysis/summary.json'
with open(summarypath, 'r') as summaryfile:
    summ = json.load(summaryfile)
print(summ)

{'../data/spectra/singleSpectraAnalysis.mzML_0': {'spectrum': {'spectrum': [10.138212808836052, 11.272310808789543, 35.29567327513134, 80.84814967535274, 100.77106719077926, 117.0829590530235, 143.67297237626792, 148.2269003538997, 149.65753648399993, 153.59049728835575, 162.02674243255564, 205.5140109943617, 212.17149437279463, 216.88025169895624, 223.0439024209438, 233.14284409148956, 235.62708079647777, 255.61393374919527, 266.3289847634731, 267.7536801707255, 279.1395694299211, 280.60166730605596, 284.0609434522285, 292.10554605267316, 294.32505976979616, 299.13306007921824, 320.1812336058002, 327.6716717328088, 367.13641576254975, 407.16852159933103, 454.0366031000489, 461.53918416780124, 479.65835419816125, 481.07293496468685, 496.58565383607987, 499.12085662274995, 510.22297675972493, 520.8090209648036, 522.7375567687342, 542.7246242696325, 557.2768142841138, 566.0645696633608, 574.4825114241106, 597.259266889555, 601.4809593752067, 626.5847904562452, 661.2754761684816, 697.7736

In [12]:
# Compare the sequence of the first alignment reported for the spectrum with
# the peptide the spectrum was generated from.
spectrumkey = '../data/spectra/singleSpectraAnalysis.mzML_0'
firstalignment = summ[spectrumkey]['alignments'][0]
print(firstalignment['sequence'])
print(peptideseq)

PNSGQQRSPSESSCSSLT
GQQRSPSESSCSSLT


In [15]:
# Doubly charged b ions for a fixed four-residue sequence.
# NOTE(review): 'GQQR' equals the first four residues of the peptide printed in
# the recorded run of this notebook; because the peptide is drawn at random,
# this literal will not track `peptideseq` on a fresh run.
prefixseq = 'GQQR'
gen_spectrum(prefixseq, ion='b', charge=2)

{'spectrum': [29.518008435000002,
  93.547297435,
  157.57658643500002,
  235.62714193500003],
 'precursor_mass': 244.63242425}