# Run a test of hypedsearch with generated data
The following steps describe how the test works
1. Load a fasta database
2. Generate
    1. Hybrid proteins
    2. Peptides
    3. Hybrid peptides from the hybrid proteins
3. Generate spectra for all the peptides created
4. Run hypedsearch with the .fasta file (no hybrid proteins included) and the spectra files
5. Load the summary.json file created
6. Determine what number of alignments were correct

## 1. Load fasta database

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.file_io import fasta

fasta_file = '../data/databases/4prots.fasta'
database = fasta.read(fasta_file, True)

database = {x['name']: x for x in database}

## 2.  Generate the peptides, hybrid proteins and peptides

In [2]:
from sequence_generation import proteins, peptides
test_directory = '../data/testing_output/'

num_hybs = 5
min_length= 5
max_length = 35
num_peptides = 100
min_cont = 3 #min contribution for each side of a hybrid

# make hybrid proteins
hyb_prots = proteins.generate_hybrids([x for _, x in database.items()], num_hybs, min_contribution=max_length)
# create peptides
non_hybrid_peps = peptides.gen_peptides([x for _, x in database.items()], num_peptides, min_length=min_length, max_length=max_length, digest='random', dist='beta')
# create hybrid peptides
hyb_peps = peptides.gen_peptides(hyb_prots, num_hybs, min_length=min_length, max_length=max_length, digest='random', min_contribution=min_cont, hybrid_list=True)

all_proteins_raw = [x for _,x in database.items()] + hyb_prots
all_peptides_raw = non_hybrid_peps + hyb_peps

peptides = {}
for i, pep in enumerate(all_peptides_raw):
    peptides[i] = pep
    peptides[i]['scan_no'] = i

Generating hybrid protein 0/5[0%]Generating hybrid protein 1/5[20%]Generating hybrid protein 2/5[40%]Generating hybrid protein 3/5[60%]Generating hybrid protein 4/5[80%]
Finished generating hybrid proteins


## 2.1 Save this info so that I can analyze it later from Neo-Fusion

In [3]:
import json
experiment_info_file_name = 'experiment_info.json'

exp = {'database': fasta_file, 'peptides': peptides}
with open(test_directory + experiment_info_file_name, 'w') as o:
    json.dump(exp, o)


## 2.2 Load data if available instead of creating it

In [4]:
# import json

# expfile = '../data/testing_output/experiment_info.json'
# exp = json.load(open(expfile, 'r'))
# peptides = exp['peptides']

## 3. Generate spectra

In [5]:
from src.spectra import gen_spectra
from src.utils import utils
from sequence_generation import write_spectra

utils.make_dir(test_directory)

spectra = []
sorted_keys = [int(c) for c in peptides.keys()]
sorted_keys.sort()
for k in sorted_keys:
    pep = peptides[k]
    cont = gen_spectra.gen_spectrum(pep['sequence'])
    spec = cont['spectrum']
    pm = cont['precursor_mass']
    spectra.append({'spectrum': spec, 'precursor_mass': pm})
write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


'../data/testing_output/testSpectraFile.mzML'

## 4. Run hypedsearch

In [6]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src import runner
from time import time

test_directory = '../data/testing_output/'
fasta_file = '../data/databases/4prots.fasta'

args = {
    'spectra_folder': test_directory,
    'database_file': fasta_file,
    'output_dir': test_directory,
    'min_peptide_len': 3,
    'max_peptide_len': 35,
}
st = time()
runner.run(args)
print('\nTotal runtime: {} seconds'.format(time() - st))

Loading database...
Done. Indexing database...
1426 unique kmers
Done.
Number of 3-mers found in the database: 1426
Analyzing spectra file 0/1[0%]

Analyzing spectrum 344/345[99%]
Total runtime: 16.389214992523193 seconds


## 5. Load the summary json

In [7]:
import json
test_directory = '../data/testing_output/'

summary = json.load(open(test_directory + 'summary.json', 'r'))
print(test_directory)

../data/testing_output/


#### summary format
Each entry is of the form
```python
{
    '<file-name>_<scan_number>':{
        'spectrum': {...},
        'alignments': [{...}],
        'b_scores': [{...}],
        'y_score': [{...}]
    }
}
```
The only attribute we're really interested in is the alignments attribute which has the form
```python
{
    'alignments': [{
        'b_alignment': {...},
        'y_alignment': {...}, 
        'spectrum': [...],
        'protein': str,
        'alignment_score': float,
        'sequence': str,
        'hybrid': bool,
        'hybrid_sequence': str
    }, ...
    ]
}
```

## 6. Determine which number of alignments were correct
This needs to be broken down into hybrid and non hybrid peptides to get some stats on how well its doing

In [9]:
n = 5
non_hyb_stats = {i: {
        'correct': 0,
        'correct_parent': 0,
        'correct_length': 0,
        'correct_start': 0,
        'correct_end': 0
    } for i in range(n)}
non_hyb_stats['count'] = 0

hyb_stats ={i: {
        'left_correct_parent': 0,
        'right_correct_parent': 0, 
        'correct_start': 0,
        'correct_end': 0,
        'correct_length': 0,
        'correct': 0
    } for i in range(n)}
hyb_stats['count'] = 0


In [28]:
def hyb_calc(result, real_pep):
    hyb_stats['count'] += 1
    for i in range(min(n, len(result))):
        res = result[i]
        result_hybrid = 'hybrid' in res['protein_name']
        if not result_hybrid: 
            continue
        
        left_corrparent = real_pep['left_parent_name'] == res['protein'].split('-')[0]
        right_corrparent = real_pep['right_parent_name'] == res['protein'].split('-')[1]
        corr_start = real_pep['left_parent_starting_position'] == res['b_alignment']['kmer']['start_position']
        corr_end = real_pep['right_parent_ending_position'] == res['y_alignment']['kmer']['end_position']
        corr_len = len(real_pep['sequence']) == res['length']
        
        if not left_corrparent or not right_corrparent:
            continue
        
        hyb_stats[i]['left_correct_parent'] += 1 if left_corrparent else 0
        hyb_stats[i]['right_correct_parent'] += 1 if right_corrparent else 0
        hyb_stats[i]['correct_start'] += 1 if corr_start else 0
        hyb_stats[i]['correct_end'] += 1 if corr_end else 0
        hyb_stats[i]['correct_length'] += 1 if corr_len else 0
        hyb_stats[i]['correct'] += 1 if left_corrparent and right_corrparent and corr_start and corr_end and corr_len else 0
        

In [29]:
def non_hyb_calc(result, real_pep):
    non_hyb_stats['count'] += 1
    iterrange = min(n, len(result))
    for i in range(iterrange):
        if result[i]['protein'] != real_pep['parent_name']:
            continue
        corrlen = len(result[i]['sequence']) == len(real_pep['sequence'])
        resstartpos = min(result[i]['b_alignment']['kmer']['start_position'], result[i]['y_alignment']['kmer']['start_position'])
        resendpos = min(result[i]['b_alignment']['kmer']['end_position'], result[i]['y_alignment']['kmer']['end_position'])
        corrstart = resstartpos == real_pep['starting_position']
        corrend = resendpos == real_pep['ending_position']
        
        non_hyb_stats[i]['correct_parent'] += 1 
        non_hyb_stats[i]['correct_length'] += 1 if corrlen else 0
        non_hyb_stats[i]['correct_start'] += 1 if corrstart else 0
        non_hyb_stats[i]['correct_end'] += 1 if corrend else 0
        non_hyb_stats[i]['correct'] += 1 if (corrend and corrlen and corrstart) else 0
        return


In [30]:
expfile = '../data/testing_output/experiment_info.json'
exp = json.load(open(expfile, 'r'))

scan_no_keyed_results = {x['spectrum']['scan_number']: x for _, x in summary.items()}
sorted_keys = [int(c) for c in exp['peptides'].keys()]

for k in sorted_keys:
    pep = exp['peptides'][str(k)]
    if 'hybrid' in pep['peptide_name'].lower():
        hyb_calc(scan_no_keyed_results[k]['alignments'], pep)
    else:
        non_hyb_calc(scan_no_keyed_results[k]['alignments'], pep)

KeyError: 'protein_name'

In [31]:
percent = lambda a, b: (a* 100 // b)

printstat = lambda name, stat: '{}{}\n'.format(name, str(stat).rjust(60-len(name), '.'))

secbreak = ''.join(['=' for _ in range(60)])
headbreak = ''.join(['-' for _ in range(60)])
nhcount = non_hyb_stats['count']
topalign = non_hyb_stats[0]
otheralign = {}
for i in range(1, n):
    for stat in topalign.keys():
        if stat not in otheralign:
            otheralign[stat] = 0
        otheralign[stat] += non_hyb_stats[i][stat]

######################## NON HYBRID PRETTY PRINTING ############################

nonhybsum = 'NON HYBRID STATS\n' + headbreak + '\n'
nonhybsum += printstat('number of peptides', nhcount) 
nonhybsum += 'Top alignment\n\n'
nonhybsum += printstat('correct alignment', topalign['correct'])
nonhybsum += printstat('%', percent(topalign['correct'], nhcount)) 
nonhybsum += printstat('correct protein', topalign['correct_parent']) 
nonhybsum += printstat('%', percent(topalign['correct_parent'], nhcount))
nonhybsum += printstat('correct starting position', topalign['correct_start'])
nonhybsum += printstat('%', percent(topalign['correct_start'], nhcount))
nonhybsum += printstat('correct ending position', topalign['correct_end'])
nonhybsum += printstat('%', percent(topalign['correct_end'], nhcount))
nonhybsum += printstat('correct length', topalign['correct_length'])
nonhybsum += printstat('%', percent(topalign['correct_length'], nhcount))

nonhybsum += '\n2 to {} alignment\n\n'.format(n)
nonhybsum += printstat('number of peptides', nhcount) 
nonhybsum += printstat('correct alignment', otheralign['correct'])
nonhybsum += printstat('%', percent(otheralign['correct'], nhcount)) 
nonhybsum += printstat('correct protein', otheralign['correct_parent']) 
nonhybsum += printstat('%', percent(otheralign['correct_parent'], nhcount))
nonhybsum += printstat('correct starting position', otheralign['correct_start'])
nonhybsum += printstat('%', percent(otheralign['correct_start'], nhcount))
nonhybsum += printstat('correct ending position', otheralign['correct_end'])
nonhybsum += printstat('%', percent(otheralign['correct_end'], nhcount))
nonhybsum += printstat('correct length', otheralign['correct_length'])
nonhybsum += printstat('%', percent(otheralign['correct_length'], nhcount))
nonhybsum += '\n' + secbreak + '\n\n'

############################ HYBRID PRETYY PRINTING ##############################
hcount = hyb_stats['count']
topalignh = hyb_stats[0]
otheralignh = {}
for i in range(1, n):
    for stat in topalignh.keys():
        if stat not in otheralignh:
            otheralignh[stat] = 0
        otheralignh[stat] += hyb_stats[i][stat]

hybsum = 'HYBRID STATS (if the top {} alignments contained either left or right it appears)\n'.format(n) + headbreak + '\n'
hybsum += printstat('number of peptides', hcount)
hybsum += 'Top alignment\n\n'
hybsum += printstat('correct alignment', topalignh['correct'])
hybsum += printstat('%', percent(topalignh['correct'], hcount))
hybsum += printstat('correct left parent', topalignh['left_correct_parent'])
hybsum += printstat('%', percent(topalignh['left_correct_parent'], hcount))
hybsum += printstat('correct right parent', topalignh['right_correct_parent'])
hybsum += printstat('%', percent(topalignh['right_correct_parent'], hcount))
hybsum += printstat('correct starting position', topalignh['correct_start'])
hybsum += printstat('%', percent(topalignh['correct_start'], hcount))
hybsum += printstat('correct ending position', topalignh['correct_end'])
hybsum += printstat('%', percent(topalignh['correct_end'], hcount))
hybsum += printstat('correct length', topalign['correct_length'])
hybsum += printstat('%', percent(topalignh['correct_length'], hcount))

hybsum += '\n2 to {} alignment\n\n'.format(n)
hybsum += printstat('correct alignment', otheralignh['correct'])
hybsum += printstat('%', percent(otheralignh['correct'], hcount))
hybsum += printstat('correct left parent', otheralignh['left_correct_parent'])
hybsum += printstat('%', percent(otheralignh['left_correct_parent'], hcount))
hybsum += printstat('correct right parent', otheralignh['right_correct_parent'])
hybsum += printstat('%', percent(otheralignh['right_correct_parent'], hcount))
hybsum += printstat('correct starting position', otheralignh['correct_start'])
hybsum += printstat('%', percent(otheralignh['correct_start'], hcount))
hybsum += printstat('correct ending position', otheralignh['correct_end'])
hybsum += printstat('%', percent(otheralignh['correct_end'], hcount))
hybsum += printstat('correct length', otheralign['correct_length'])
hybsum += printstat('%', percent(otheralignh['correct_length'], hcount))
print(nonhybsum + hybsum)

NON HYBRID STATS
------------------------------------------------------------
number of peptides.......................................700
Top alignment

correct alignment..........................................0
%..........................................................0
correct protein..........................................266
%.........................................................38
correct starting position................................245
%.........................................................35
correct ending position....................................0
%..........................................................0
correct length............................................21
%..........................................................3

2 to 5 alignment

number of peptides.......................................700
correct alignment..........................................0
%..........................................................0
correct protein...................

In [12]:
print(sorted_keys)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [11]:
def printres(x):
    for a in x:
        print(a)
printres(scan_no_keyed_results[10])
print(exp['peptides']['10'])

{'starting_position': 451, 'ending_position': 465, 'length': 30, 'sequence': 'SHATEQRPQKEPIDQSHATEQRPQKEPIDQ', 'b_score': 0.9, 'y_score': 0.8666666666666667, 'confidence': 1.7666666666666666, 'protein_name': 'caprin-1', 'spectrum': [44.523290435, 74.041847785, 88.03930443499999, 113.05274643499999, 131.555319285, 147.076419135, 148.571303435, 188.097351285, 199.095142935, 225.09821643499998, 236.62373328500001, 262.103362135, 263.616439435, 296.135330435, 301.145029785, 327.645728435, 365.192511285, 375.187426135, 397.18300943500003, 405.696283935, 429.22180028500003, 454.222665935, 472.24019013500003, 477.74818228500004, 518.251954935, 526.225602435, 555.7987377850001, 582.299436435, 601.282783135, 619.828026785, 646.820732935, 654.284180435, 684.3493232850001, 695.347114935, 729.377746135, 734.8731627850001, 751.889146935, 770.3917197850001, 809.402618435, 810.385291435, 838.9211757850001, 857.4363241350001, 873.4319074350001, 882.4371897850001, 907.438055435, 954.4890881350001, 1035

In [6]:
times = [0.04854393005371094,
0.03188514709472656,
0.07952308654785156,
0.009988069534301758,
0.032685041427612305,
0.03956890106201172,
0.04300117492675781,
0.08851480484008789,
0.045289039611816406,
0.02736687660217285,
0.02323770523071289,
0.018954992294311523,
0.034613847732543945,
0.11969780921936035,
0.04238271713256836,
0.028224706649780273,
0.0736851692199707,
0.041391849517822266,
0.035143136978149414,
0.027536869049072266,
0.13624906539916992,
0.06007981300354004,
0.050293922424316406,
0.06956601142883301,
0.024131059646606445,
0.029160022735595703,
0.10336899757385254,
0.037492990493774414,
0.030508041381835938,
0.024775266647338867,
0.10678887367248535]

import statistics 

print(statistics.mean(times))

0.05044028835911905
