# Run a test of hypedsearch with generated data
The following steps describe how the test works:
1. Load a fasta database
2. Generate
    1. Hybrid proteins
    2. Peptides
    3. Hybrid peptides from the hybrid proteins
3. Generate spectra for all the peptides created
4. Run hypedsearch with the .fasta file (no hybrid proteins included) and the spectra files
5. Load the summary.json file created
6. Determine how many of the alignments were correct

## 1. Load fasta database

In [1]:
import os
import sys

# Put the parent and grandparent directories on sys.path (parent first) so
# the project-local imports below can be resolved from this notebook.
for module_path in map(os.path.abspath, ('..', '../..')):
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.file_io import fasta

fasta_file = '../../testing framework/data/databases/4prots.fasta'

# Read the fasta file and index every returned entry by its 'name' field.
database = {
    entry['name']: entry
    for entry in fasta.read(fasta_file, True)
}

## 2. Generate the peptides, hybrid proteins, and hybrid peptides

In [2]:
# Annotated hybrid peptide; the '-' presumably marks the junction between the
# two parent sequences (insulin / bdnf) -- confirm against the generator.
hybrid_peptide = 'LCGSHL-KAGSR' #insulin-bdnf
# Plain residue string used for spectrum generation. Derived from the
# annotated form so the two spellings cannot drift apart.
hybpep = hybrid_peptide.replace('-', '')

## 3. Generate spectra

In [3]:
from src.spectra import gen_spectra
from src.utils import utils
from modules.sequence_generation import write_spectra

test_directory = '../../testing framework/data/testing_output/'
utils.make_dir(test_directory)

# Generate the spectrum for the hybrid peptide, then repackage the two fields
# that are handed to write_mzml as a one-element list of spectra.
cont = gen_spectra.gen_spectrum(hybpep)
spec, pm = cont['spectrum'], cont['precursor_mass']
spectra = [
    {'spectrum': spec, 'precursor_mass': pm},
]

# Kept as the cell's final expression so the notebook displays its result.
write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


'../../testing framework/data/testing_output/testSpectraFile.mzML'

## 4. Run hypedsearch

In [4]:
import os
import sys

# Put the parent and grandparent directories on sys.path (parent first);
# entries that are already present are left alone.
for module_path in map(os.path.abspath, ('..', '../..')):
    if module_path not in sys.path:
        sys.path.append(module_path)

from src import runner
from time import time

fasta_file = '../../testing framework/data/databases/4prots.fasta'
test_directory = '../../testing framework/data/testing_output/'

# The same directory is used for both the input spectra and the output.
args = {
    'spectra_folder': test_directory,
    'database_file': fasta_file,
    'output_dir': test_directory,
    'min_peptide_len': 3,
    'max_peptide_len': 35,
}

# Wall-clock timing of the whole run.
st = time()
runner.run(args)
print('\nTotal runtime: {} seconds'.format(time() - st))

Loading database...
Done. Indexing database...
1426 unique kmers
Done.
Number of 3-mers found in the database: 1426
Analyzing spectra file 0/1[0%]

Analyzing spectrum 0/1[0%]
Total runtime: 0.2743360996246338 seconds


## 5. Load the summary.json file

In [5]:
import json
test_directory = '../../testing framework/data/testing_output/'

# Open the file in a context manager so the handle is closed as soon as the
# JSON has been parsed, instead of being left open for the kernel's lifetime.
with open(test_directory + 'summary.json', 'r') as summary_file:
    summary = json.load(summary_file)


## 6. For each missed hybrid alignment, see what was chosen instead

In [6]:
# For every entry in the summary, print the expected hybrid sequence directly
# above each reported alignment sequence so the two can be compared by eye.
for container in summary.values():
    for alignment in container['alignments']:
        print('\n'.join((hybpep, alignment['sequence'])))
        print('\n\n')

    

LCGSHLKAGSR
LCGSHLKAGSR



LCGSHLKAGSR
LCGSHLQSR



LCGSHLKAGSR
LCGSHLGSR



