# Run a test of hypedsearch with generated data
The following steps describe how the test works
1. Load a fasta database
2. Generate
    1. Hybrid proteins
    2. Peptides
    3. Hybrid peptides from the hybrid proteins
3. Generate spectra for all the peptides created
4. Run hypedsearch with the .fasta file (no hybrid proteins included) and the spectra files
5. Load the summary.json file created
6. Determine how many of the alignments were correct

## 1. Load fasta database

In [1]:
import os
import sys

# Add the parent and grandparent directories to the import path
# (presumably so the project-local `src` package resolves from here).
for relative_root in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_root))
    if module_path not in sys.path:
        sys.path.append(module_path)

from src.file_io import fasta

fasta_file = '../data/databases/4prots.fasta'
# Key every entry returned by fasta.read by its 'name' field.
database = {entry['name']: entry for entry in fasta.read(fasta_file)}

## 2. Generate the peptides, hybrid proteins and hybrid peptides

In [2]:
from sequence_generation import proteins, peptides

# Generation settings
num_hybs = 5
min_length = 5
max_length = 35
num_peptides = 50
min_cont = 3  # min contribution for each side of a hybrid

# Hybrid proteins built from the database entries.
# NOTE(review): min_contribution is set to max_length here, not min_cont -- confirm this is intended.
hyb_prots = proteins.generate_hybrids(
    list(database.values()),
    num_hybs,
    min_contribution=max_length,
)

# Non-hybrid peptides sampled from the database proteins.
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)

# Hybrid peptides sampled from the hybrid proteins.
hyb_peps = peptides.gen_peptides(
    hyb_prots,
    num_hybs,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    min_contribution=min_cont,
    hybrid_list=True,
)

all_proteins_raw = list(database.values()) + hyb_prots
all_peptides_raw = non_hybrid_peps + hyb_peps

# Index every generated peptide by its position in all_peptides_raw.
# NOTE: this rebinds `peptides` from the imported module to a dict; later cells use the dict.
peptides = dict(enumerate(all_peptides_raw))

Generating hybrid protein 0/5[0%]Generating hybrid protein 1/5[20%]Generating hybrid protein 2/5[40%]Generating hybrid protein 3/5[60%]Generating hybrid protein 4/5[80%]
Finished generating hybrid proteins


## 3. Generate spectra

In [3]:
from src.spectra import gen_spectra
from src.utils import utils
from sequence_generation import write_spectra

test_directory = '../data/testing_output/'
utils.make_dir(test_directory)

# Walk the peptides in ascending key order so spectra are written in peptide-index order.
sorted_keys = sorted(int(key) for key in peptides)

# One generated spectrum per peptide; keep only the fields passed on to the writer.
generated = (gen_spectra.gen_spectrum(peptides[key]['sequence']) for key in sorted_keys)
spectra = [
    {'spectrum': gen['spectrum'], 'precursor_mass': gen['precursor_mass']}
    for gen in generated
]
write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


'../data/testing_output/testSpectraFile.mzML'

## 4. Run hypedsearch

In [4]:
from src import runner
from time import time

# Spectra are read from, and results written to, the same testing directory.
args = dict(
    spectra_folder=test_directory,
    database_file=fasta_file,
    output_dir=test_directory,
)

st = time()
runner.run(args)
elapsed = time() - st
print('\nTotal runtime: {} seconds'.format(elapsed))

Analyzing spectrum 294/295[99%]
Total runtime: 39.797826051712036 seconds


## 5. Load the summary json

In [5]:
import json

# Read summary.json from the test output directory. The context manager
# closes the file handle as soon as parsing finishes, even if it fails.
with open(test_directory + 'summary.json', 'r') as summary_file:
    summary = json.load(summary_file)

In [6]:
# Show the first hybrid peptide and the first non-hybrid peptide to compare their record layouts.
for peptide_group in (hyb_peps, non_hybrid_peps):
    print(peptide_group[0])

{'sequence': 'MAEHLC', 'left_parent_starting_position': 299, 'left_parent_ending_position': 301, 'right_parent_starting_position': 28, 'right_parent_ending_position': 30, 'left_parent_name': 'caprin-1', 'right_parent_name': 'insulin', 'peptide_name': 'HYBRID_PEPTIDE000'}
{'peptide_name': 'peptide_00', 'sequence': 'EALYLVCGERGFFYTPK', 'parent_name': 'insulin', 'parent_sequence': 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN', 'starting_position': 36, 'ending_position': 52}


#### summary format
Each spectrum entry has 3 candidate alignments, each of the form
```python
{
    "starting_position": 64,
    "ending_position": 73,
    "length": 10,
    "b_score": 4.0,
    "y_score": 4.0,
    "confidence": 100.0,
    "protein_name": "insulin",
    "spectrum": [],
    "sequence": "QVELGGGPGA"
}
```

## 6. Determine how many alignments were correct
This needs to be broken down into hybrid and non-hybrid peptides to get some stats on how well it's doing

In [7]:
# Number of top-ranked alignments inspected per spectrum.
n = 3


def _blank_tally():
    """Return a fresh dict of zeroed correctness counters."""
    return {
        'correct': 0,
        'correct_parent': 0,
        'correct_length': 0,
        'correct_start': 0,
        'correct_end': 0,
    }


# Non-hybrid stats: one tally per alignment rank (0 .. n-1) plus a peptide count.
non_hyb_stats = {}
for rank in range(n):
    non_hyb_stats[rank] = _blank_tally()
non_hyb_stats['count'] = 0

# Hybrid stats: a peptide count plus one tally for each side of the hybrid.
hyb_stats = {'count': 0}
for side in ('left', 'right'):
    hyb_stats[side] = _blank_tally()


In [8]:
def hyb_calc(result, real_pep):
    """Score one hybrid peptide's alignments against its true parents and update ``hyb_stats``.

    Up to the first ``n`` entries of ``result`` are inspected (fewer if ``result``
    is shorter). For each side ('left', 'right') an alignment is considered only
    when its 'protein_name' equals that side's parent name. The per-field counters
    (start / end / length) are incremented when *any* considered alignment matches
    that field. The 'correct' counter is incremented only when a *single*
    alignment matches start, end and length together, so partial matches spread
    over different alignments no longer count as a correct side alignment.

    Args:
        result: indexable collection of alignment dicts with the keys
            'protein_name', 'starting_position', 'ending_position' and 'length'.
        real_pep: dict describing the generated hybrid peptide, with the keys
            '<side>_parent_name', '<side>_parent_starting_position' and
            '<side>_parent_ending_position' for side in ('left', 'right').

    Returns:
        None. The module-level ``hyb_stats`` dict is updated in place.
    """
    sides = ('left', 'right')
    found = {
        side: dict.fromkeys(('parent', 'start', 'end', 'length', 'all'), False)
        for side in sides
    }

    # Do not index past the end when fewer than n alignments were produced.
    for i in range(min(n, len(result))):
        alignment = result[i]
        for side in sides:
            if alignment['protein_name'] != real_pep[side + '_parent_name']:
                continue
            true_start = real_pep[side + '_parent_starting_position']
            true_end = real_pep[side + '_parent_ending_position']

            start_ok = alignment['starting_position'] == true_start
            end_ok = alignment['ending_position'] == true_end
            # Positions are inclusive, hence the +1.
            length_ok = alignment['length'] == true_end - true_start + 1

            flags = found[side]
            flags['parent'] = True
            flags['start'] = flags['start'] or start_ok
            flags['end'] = flags['end'] or end_ok
            flags['length'] = flags['length'] or length_ok
            # A fully correct side needs every field right in the SAME alignment.
            flags['all'] = flags['all'] or (start_ok and end_ok and length_ok)

    hyb_stats['count'] += 1
    for side in sides:
        flags = found[side]
        tally = hyb_stats[side]
        tally['correct_parent'] += int(flags['parent'])
        tally['correct_length'] += int(flags['length'])
        tally['correct_start'] += int(flags['start'])
        tally['correct_end'] += int(flags['end'])
        tally['correct'] += int(flags['all'])
    

In [9]:
def non_hyb_calc(result, real_pep):
    """Score one non-hybrid peptide's alignments and update ``non_hyb_stats``.

    The peptide is always counted. The first ``n`` alignments are then scanned
    in rank order; the first one whose 'protein_name' equals the peptide's
    'parent_name' is compared on sequence length, start and end position, the
    tally for that rank is updated, and scanning stops. If no alignment names
    the parent, only the count changes.

    Args:
        result: indexable collection of at least ``n`` alignment dicts with the keys
            'protein_name', 'sequence', 'starting_position' and 'ending_position'.
        real_pep: dict with 'parent_name', 'sequence', 'starting_position'
            and 'ending_position' for the generated peptide.

    Returns:
        None. The module-level ``non_hyb_stats`` dict is updated in place.
    """
    non_hyb_stats['count'] += 1

    rank = 0
    while rank < n:
        candidate = result[rank]
        if candidate['protein_name'] == real_pep['parent_name']:
            same_length = len(candidate['sequence']) == len(real_pep['sequence'])
            same_start = candidate['starting_position'] == real_pep['starting_position']
            same_end = candidate['ending_position'] == real_pep['ending_position']

            tally = non_hyb_stats[rank]
            tally['correct_parent'] += 1
            tally['correct_length'] += int(same_length)
            tally['correct_start'] += int(same_start)
            tally['correct_end'] += int(same_end)
            tally['correct'] += int(same_end and same_length and same_start)
            # Only the first alignment naming the parent is scored.
            return
        rank += 1


In [10]:
# Re-key the summary so alignments can be looked up by scan number.
scan_no_keyed_results = {
    entry['scan_no']: entry['alignments'] for entry in summary.values()
}

# Score every generated peptide with the calculator matching its kind,
# decided by whether 'hybrid' appears in the peptide's name.
for k in sorted_keys:
    pep = peptides[k]
    scorer = hyb_calc if 'hybrid' in pep['peptide_name'].lower() else non_hyb_calc
    scorer(scan_no_keyed_results[k], pep)

In [11]:
def percent(a, b):
    """Return the integer (floored) percentage of ``a`` out of ``b``.

    Returns 0 when ``b`` is 0 so that an empty category (e.g. no hybrid
    peptides generated) reports 0% instead of raising ZeroDivisionError.
    """
    return (a * 100 // b) if b else 0


def printstat(name, stat):
    """Format one report line: ``name`` then ``stat`` right-aligned with dots to 60 columns."""
    return '{}{}\n'.format(name, str(stat).rjust(60 - len(name), '.'))


def _stat_block(stats, total, headline):
    """Render the five count/percentage line pairs for one tally dict.

    ``headline`` is the label used for the overall 'correct' counter; the
    remaining labels are shared by every section of the report.
    """
    rows = (
        (headline, 'correct'),
        ('correct protein', 'correct_parent'),
        ('correct starting position', 'correct_start'),
        ('correct ending position', 'correct_end'),
        ('correct length', 'correct_length'),
    )
    return ''.join(
        printstat(label, stats[key]) + printstat('%', percent(stats[key], total))
        for label, key in rows
    )


secbreak = '=' * 60
headbreak = '-' * 60
nhcount = non_hyb_stats['count']
topalign = non_hyb_stats[0]
# Sum the tallies of every rank below the top one (ranks 1 .. n-1), following n
# instead of hard-coding ranks 1 and 2.
otheralign = {
    stat: sum(non_hyb_stats[rank][stat] for rank in range(1, n))
    for stat in topalign
}

######################## NON HYBRID PRETTY PRINTING ############################

nonhybsum = ''.join([
    'NON HYBRID STATS\n' + headbreak + '\n',
    printstat('number of peptides', nhcount),
    'Top alignment\n\n',
    _stat_block(topalign, nhcount, 'correct alignment'),
    '\n2nd or 3rd alignment\n\n',
    printstat('number of peptides', nhcount),
    _stat_block(otheralign, nhcount, 'correct alignment'),
    '\n' + secbreak + '\n\n',
])

############################ HYBRID PRETTY PRINTING ##############################
hcount = hyb_stats['count']
lalign = hyb_stats['left']
ralign = hyb_stats['right']

hybsum = ''.join([
    'HYBRID STATS (if the top {} alignments contained either left or right it appears)\n'.format(n) + headbreak + '\n',
    printstat('number of peptides', hcount),
    '\nLeft \n\n',
    _stat_block(lalign, hcount, 'correct side alignment'),
    '\nRight \n\n',
    _stat_block(ralign, hcount, 'correct side alignment'),
])

print(nonhybsum + hybsum)

NON HYBRID STATS
------------------------------------------------------------
number of peptides........................................50
Top alignment

correct alignment.........................................50
%........................................................100
correct protein...........................................50
%........................................................100
correct starting position.................................50
%........................................................100
correct ending position...................................50
%........................................................100
correct length............................................50
%........................................................100

2nd or 3rd alignment

number of peptides........................................50
correct alignment..........................................0
%..........................................................0
correct protein...............

In [14]:
# Inspect one generated peptide by its key. The key is only valid when enough
# peptides were generated, so guard the lookup to keep the cell from raising
# KeyError on a smaller run.
inspect_key = 173
if inspect_key in peptides:
    print(peptides[inspect_key])
else:
    print('No peptide with key {} ({} peptides were generated)'.format(inspect_key, len(peptides)))

{'sequence': 'CGSHLVEALVNIPKRLE', 'left_parent_starting_position': 30, 'left_parent_ending_position': 38, 'right_parent_starting_position': 615, 'right_parent_ending_position': 622, 'left_parent_name': 'insulin', 'right_parent_name': 'dixin', 'peptide_name': 'HYBRID_PEPTIDE123'}
