# Make realistic spectra
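Make our generated spectra look more like real experimental data by randomly dropping peaks, perturbing peak masses, and assigning random abundances.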

In [1]:
import os
import sys
# Make the parent and grandparent directories importable, parent first,
# without adding a path that is already present on sys.path.
for module_path in map(os.path.abspath, ('..', '../..')):
    if module_path not in sys.path:
        sys.path.append(module_path)
    
from src.spectra.gen_spectra import gen_spectrum
from random import randint
from collections import namedtuple
import numpy as np

In [6]:
# One noisy spectrum: the surviving (perturbed) peaks, one abundance per
# surviving peak, and the precursor mass reported by gen_spectrum.
RealisticSpectrum = namedtuple('RealisticSpectrum', ['spectrum', 'abundance', 'precursor_mass'])

def gen_realistic_spectra(sequences: list) -> list:
    '''
    Generate a noisy spectrum for every sequence in `sequences`.

    Each sequence is passed to gen_spectrum, and the resulting peak list is
    degraded in three steps:
        1. every peak is dropped independently with a per-spectrum
           probability drawn uniformly from 60% to 85%
        2. every surviving peak is shifted up or down (equal odds) by a
           small amount drawn from np.random.pareto(600)
        3. every surviving peak is given an abundance drawn from
           np.random.pareto(1) * 2000

    Inputs:
        sequences:  (list) sequences to generate spectra for
    Outputs:
        (list) one RealisticSpectrum per input sequence, in input order.
               `spectrum` is a list of perturbed peak masses (possibly empty
               if every peak was dropped), `abundance` is a numpy array of
               the same length, and `precursor_mass` is the unmodified value
               returned by gen_spectrum
    '''
    realistic_spectra = []
    for seq in sequences:
        spec_props = gen_spectrum(seq)
        spec = spec_props['spectrum']
        precursor = spec_props['precursor_mass']

        # 1. Drop out peaks
        # randint is inclusive on both ends: drawing from 0..99 makes the
        # chance of dropping a peak exactly dropout_rate percent.
        dropout_rate = randint(60, 85)
        leftover_peaks = [peak for peak in spec if randint(0, 99) >= dropout_rate]

        # 2. introduce mass errors
        # randint(0, 1) is a fair coin, so positive and negative errors are
        # equally likely. The pareto shape of 600 was found from experiments.
        leftover_peaks = [
            peak + (1 if randint(0, 1) else -1) * np.random.pareto(600)
            for peak in leftover_peaks
        ]

        # 3. pick the abundance (one value per surviving peak)
        abundances = np.random.pareto(1, len(leftover_peaks)) * 2000

        realistic_spectra.append(RealisticSpectrum(leftover_peaks, abundances, precursor))

    return realistic_spectra

## Make sequences

In [7]:
import json

from src.file_io import fasta
from modules.sequence_generation import proteins, peptides

# Input database and output locations
fasta_file = '../../testing framework/data/databases/100prots.fasta'
test_directory = '../data/testing_output/'
experiment_info_file_name = 'experiment_info.json'

# Generation parameters
num_hybs = 5
min_length = 5
max_length = 20
num_peptides = 100
min_cont = 3  # min contribution for each side of a hybrid

# Load the database and index its entries by name
database = fasta.read(fasta_file, True)
database = {entry['name']: entry for entry in database}

# make hybrid proteins
# NOTE(review): min_contribution is max_length here, not min_cont -- confirm
# this is intended.
hyb_prots = proteins.generate_hybrids(
    list(database.values()),
    num_hybs,
    min_contribution=max_length,
)

# create peptides
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)

# create hybrid peptides
hyb_peps = peptides.gen_peptides(
    hyb_prots,
    num_hybs,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    min_contribution=min_cont,
    hybrid_list=True,
)

all_proteins_raw = list(database.values()) + hyb_prots
all_peptides_raw = non_hybrid_peps + hyb_peps

# Number the peptides in generation order and tag each one with its scan
# number. The tag is written onto the same dict objects held by
# all_peptides_raw.
# NOTE: from here on `peptides` is this dict, no longer the imported module.
peptides = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peptides[i] = pep

# Record which database and peptides make up this experiment
exp = {'database': fasta_file, 'peptides': peptides}
with open(test_directory + experiment_info_file_name, 'w') as o:
    json.dump(exp, o)

Generating hybrid protein 0/5[0%]Generating hybrid protein 1/5[20%]Generating hybrid protein 2/5[40%]Generating hybrid protein 3/5[60%]Generating hybrid protein 4/5[80%]
Finished generating hybrid proteins


In [8]:
from modules.sequence_generation import write_spectra

# One noisy spectrum per generated peptide sequence (non-hybrid and hybrid).
spectra = gen_realistic_spectra([pep_entry['sequence'] for pep_entry in all_peptides_raw])

# The writer is given plain dicts rather than namedtuples. The call is left as
# the cell's final expression so the notebook displays its return value.
write_spectra.write_mzml(
    'realisticSpectra',
    [spec._asdict() for spec in spectra],
    output_dir=test_directory,
)


OrderedDict([('spectrum', [357.2256498775485, 44.520513505549914, 293.1700303856042, 341.69761393056916, 628.3776794578276, 741.4610881266184, 236.6435904886326, 414.74785371744997]), ('abundance', array([3540.46265055, 1389.32465648, 5535.55425355, 1142.16033593,
       5781.09010783,  802.9783693 ,  626.45684955, 4024.82585276])), ('precursor_mass', 414.75052725000006)])
OrderedDict([('spectrum', [865.4826648232279, 79.05744663768904, 164.11165660978418, 317.6921468366993, 433.24993550186673, 500.2648996785691, 442.25040246654754]), ('abundance', array([  4688.57463888, 129528.58731226,    741.50250775,  10538.6294523 ,
         5227.31840581,   2370.33902375,   2687.30810016])), ('precursor_mass', 442.25017225000005)])
OrderedDict([('spectrum', [314.20223181217347, 90.05630853091401, 397.1198417676071, 609.2768953693503, 97.0344734511997, 305.1352733975159]), ('abundance', array([3703.97926093, 1149.04628358, 4672.49426808, 2779.47100458,
       4276.63535847, 2366.43391687])), ('pr

'../data/testing_output/realisticSpectra.mzML'