# Comparison of results
We need to take in the results from both (1) hypedsearch (or whatever it's called) and (2) Neo-Fusion to see how they compare. The generated data is saved in the `experiment.json` files, so we can load those to know the true values and compare them to what each tool reports. The steps we should take are the following:
1. Load the `experiment.json` file
2. Load the `json` output from hypedsearch
3. Load the results of Neo-Fusion
4. See how well hypedsearch did at identifying both hybrid and non-hybrid sequences
5. See how well Neo-Fusion did at identifying both hybrid and non-hybrid sequences
6. Compare overlapping and non-overlapping results

__NOTE__: all the data we are testing here (1) are IDEAL theoretical spectra, and (2) contain only cis-spliced hybrids, as Neo-Fusion cannot search for trans-spliced peptides.

### Constants and imports

In [1]:
import json

# Directory with the generated cis-spliced test data, and the ground-truth
# description file that lives inside it.
expjsondir = '../../sandbox/data/testing_output/cis_spliced/'
experiment_json_file = ''.join((expjsondir, 'experiment_info.json'))

# Folder the hypedsearch output is read from.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
hypedsearch_output = '/Users/zacharymcgrath/Desktop/Experiment output/09062020/'


## 1. Load the `experiment.json` file

In [2]:
# Parse the experiment description (the ground truth used for scoring) into
# `exp`; the context manager closes the file as soon as parsing is done.
with open(experiment_json_file, 'r') as experiment_handle:
    exp = json.load(experiment_handle)
    

## 2. Load `json` from hypedsearch

In [3]:
# Parse the hypedsearch summary. Opening the file in a `with` block makes sure
# the handle is closed, which a bare json.load(open(...)) does not guarantee.
with open(hypedsearch_output + 'summary.json', 'r') as summary_file:
    summary = json.load(summary_file)


## 3. Load Neo-Fusion results

## 4. Check the correctness of hypedsearch

In [4]:
# Only the first n ranked alignments of each spectrum are scored.
n = 5

# Tallies for non-hybrid peptides, one dict per alignment rank (0 .. n-1).
# The extra 'count' entry is the number of non-hybrid peptides evaluated.
non_hyb_stats = {}
for rank in range(n):
    non_hyb_stats[rank] = {
        'correct': 0,
        'correct_parent': 0,
        'correct_sequence': 0,
    }
non_hyb_stats['count'] = 0

# Same layout for hybrid peptides, with the parent tally split into the left
# and right parent.
hyb_stats = {}
for rank in range(n):
    hyb_stats[rank] = {
        'left_correct_parent': 0,
        'right_correct_parent': 0,
        'correct_sequence': 0,
        'correct': 0,
    }
hyb_stats['count'] = 0

# (alignments, real peptide) pairs collected for later inspection.
wrong_hybrid_alignemnts = []
wrong_nonhybrid_alignments = []

In [5]:
def hyb_calc(result, real_pep):
    """Score the ranked alignments of one hybrid peptide.

    Walks the first ``n`` entries of ``result`` and, for every entry that has
    a ``'hybrid_sequence'`` key, updates the per-rank tallies in the global
    ``hyb_stats``. Scanning stops at the first entry whose sequence equals the
    real one. If no such entry is found, a message is printed and the pair is
    stored in ``wrong_hybrid_alignemnts``.

    Args:
        result: ranked alignments (dicts) reported for the spectrum.
        real_pep: ground-truth peptide dict; ``'sequence'``,
            ``'left_parent_name'`` and ``'right_parent_name'`` are read.
    """
    hyb_stats['count'] += 1

    for rank, alignment in zip(range(n), result):
        # Non-hybrid alignments cannot match a hybrid peptide; skip them.
        if 'hybrid_sequence' not in alignment:
            continue

        left_ok = real_pep['left_parent_name'] in alignment['left_proteins']
        right_ok = real_pep['right_parent_name'] in alignment['right_proteins']
        seq_ok = real_pep['sequence'] == alignment['sequence']

        tally = hyb_stats[rank]
        tally['left_correct_parent'] += int(left_ok)
        tally['right_correct_parent'] += int(right_ok)
        tally['correct_sequence'] += int(seq_ok)
        tally['correct'] += int(left_ok and right_ok and seq_ok)

        # A matching sequence ends the search, whatever the parents were.
        if seq_ok:
            break
    else:
        # Reached only when the loop never hit `break`, i.e. no inspected
        # alignment reproduced the real sequence.
        print('appending {} to bad for real pep {}'.format([x['sequence'] for x in result], real_pep['sequence']))
        wrong_hybrid_alignemnts.append((result, real_pep))

In [6]:
def non_hyb_calc(result, real_pep):
    """Score the ranked alignments of one non-hybrid peptide.

    Looks through the first ``n`` entries of ``result`` for the first one
    whose ``'proteins'`` list contains the real parent protein, updates the
    tallies of that rank in the global ``non_hyb_stats``, and stops there.

    A peptide is stored in ``wrong_nonhybrid_alignments`` whenever no fully
    correct alignment was identified: either no inspected alignment names the
    real parent, or the first one that does has the wrong sequence. The
    previous version skipped the no-match case and wrong top-ranked
    alignments, so most failures were never recorded.

    Args:
        result: ranked alignments (dicts) reported for the spectrum.
        real_pep: ground-truth peptide dict; ``'sequence'`` and
            ``'parent_name'`` are read.
    """
    non_hyb_stats['count'] += 1
    parent = real_pep['parent_name']
    fully_correct = False

    for i, alignment in zip(range(n), result):
        # Hybrid alignments have no 'proteins' entry; skip them together with
        # alignments that point at other proteins.
        if 'proteins' not in alignment or parent not in alignment['proteins']:
            continue

        corrseq = alignment['sequence'] == real_pep['sequence']

        # The parent is known to be correct here because of the guard above.
        non_hyb_stats[i]['correct_parent'] += 1
        non_hyb_stats[i]['correct_sequence'] += int(corrseq)
        non_hyb_stats[i]['correct'] += int(corrseq)

        fully_correct = corrseq
        # Only the first alignment with the right parent is scored.
        break

    if not fully_correct:
        wrong_nonhybrid_alignments.append((result, real_pep))


In [7]:
# Reload the ground truth; the `with` block closes the file handle, which the
# bare json.load(open(...)) form left open.
with open(experiment_json_file, 'r') as exp_file:
    exp = json.load(exp_file)

# Index the hypedsearch results by scan number (the summary's own keys are not
# needed) and visit the peptides in ascending numeric key order.
scan_no_keyed_results = {x['spectrum']['scan_number']: x for x in summary.values()}
sorted_keys = sorted(int(c) for c in exp['peptides'])

for k in sorted_keys:
    pep = exp['peptides'][str(k)]
    # Peptides for which hypedsearch reported nothing are left out of the stats.
    if k not in scan_no_keyed_results:
        continue
    alignments = scan_no_keyed_results[k]['alignments']
    # The peptide name tells whether the generated peptide is a hybrid.
    if 'hybrid' in pep['peptide_name'].lower():
        hyb_calc(alignments, pep)
    else:
        non_hyb_calc(alignments, pep)

appending ['KALKTSTKCRGL', 'KAIKTSTKCRGL', 'KAIKTLGTAS'] to bad for real pep SPAYSRHSSSHSSSISPVRLPLN
appending ['PQDLQLLVAI', 'PQDLQGLLVAI', 'PQDLQLGLLL'] to bad for real pep LKGQFNFDHPDAITEGKTVQIPV
appending ['AVAVKDQRKAIKTAKPAP', 'AVAVKDQRKAIKTKISDPLDLSTIEKQILIG', 'AVAVKDQRKAIKTFKEFY'] to bad for real pep RSPAYSRHSSSHSSSISPVRLPLNS
appending ['IPALLNAIALSA', 'IALLPNAIALSA', 'LLPAINAIALSA'] to bad for real pep CSGRALPSSSQQTILMLKE
appending ['TVLSVSPVV', 'VTSLVSPVV', 'VTSLVSPVV'] to bad for real pep KALKGQFNFDHPDAITEGKTVQIPVYD
appending ['TQPPGYGFLFTGG', 'HGSKADLYHQS', 'QGPTPHPGETN'] to bad for real pep TSLLEALEELRMRIAKIILETQELK
appending ['EKEEFLAWQHDEEAAP', 'EKEEFLAWQHDKAYPVMNDAEEDDEEEAAP', 'EKEEFLAWQHDESPSP'] to bad for real pep QFNFDHPDAITEGKTVQ
appending ['PEKEEFLAWQHDLEANDKAPEYKSS', 'PEKEEFLAWQHDLEANDKAPEPPAA', 'PEKEEFLAWQHDLEANDKAPVSWIH'] to bad for real pep CSGRALPSSSQQTILMLKE
appending ['DGSPSGGGKEVYL', 'GGEEAGGGKEVYL', 'GGEEAWTGGGKEVYL'] to bad for real pep VVLVILVKAPRQI
appen

In [8]:
def percent(a, b):
    """Return ``a`` as a whole-number percentage of ``b``, rounded down.

    Returns 0 when ``b`` is 0 so that a report over an empty peptide class
    does not fail with ZeroDivisionError.
    """
    if not b:
        return 0
    return a * 100 // b


def printstat(name, stat):
    """Return one report line: ``name``, dot padding, ``stat``, newline.

    The value is right-justified so that name and value together fill 60
    columns.
    """
    return '{}{}\n'.format(name, str(stat).rjust(60 - len(name), '.'))


# Horizontal rules used between and inside the report sections.
secbreak = '=' * 60
headbreak = '-' * 60
# Non-hybrid tallies: rank 0 is reported on its own, ranks 1 .. n-1 are summed.
nhcount = non_hyb_stats['count']
topalign = non_hyb_stats[0]
# Every statistic gets an entry (0 when n == 1), so the lookups below cannot
# raise KeyError when there are no lower-ranked alignments to add up.
otheralign = {
    stat: sum(non_hyb_stats[i][stat] for i in range(1, n))
    for stat in topalign
}

######################## NON HYBRID PRETTY PRINTING ############################

# Assemble the whole section in one join instead of a long chain of `+=`.
nonhybsum = ''.join([
    'NON HYBRID STATS\n' + headbreak + '\n',
    printstat('number of peptides', nhcount),
    'Top alignment\n\n',
    printstat('correct alignment', topalign['correct']),
    printstat('%', percent(topalign['correct'], nhcount)),
    printstat('correct protein', topalign['correct_parent']),
    printstat('%', percent(topalign['correct_parent'], nhcount)),
    printstat('correct sequence', topalign['correct_sequence']),
    printstat('%', percent(topalign['correct_sequence'], nhcount)),
    '\n2 to {} alignment\n\n'.format(n),
    printstat('number of peptides', nhcount),
    printstat('correct alignment', otheralign['correct']),
    printstat('%', percent(otheralign['correct'], nhcount)),
    printstat('correct protein', otheralign['correct_parent']),
    printstat('%', percent(otheralign['correct_parent'], nhcount)),
    printstat('correct sequence', otheralign['correct_sequence']),
    printstat('%', percent(otheralign['correct_sequence'], nhcount)),
    '\n' + secbreak + '\n\n',
])

############################ HYBRID PRETTY PRINTING ##############################
# Hybrid tallies: rank 0 is reported on its own, ranks 1 .. n-1 are summed.
hcount = hyb_stats['count']
topalignh = hyb_stats[0]
# Every statistic gets an entry (0 when n == 1), so the lookups below cannot
# raise KeyError when there are no lower-ranked alignments to add up.
otheralignh = {
    stat: sum(hyb_stats[i][stat] for i in range(1, n))
    for stat in topalignh
}

# Assemble the whole section in one join instead of a long chain of `+=`.
hybsum = ''.join([
    'HYBRID STATS\n' + headbreak + '\n',
    printstat('number of peptides', hcount),
    'Top alignment\n\n',
    printstat('correct alignment', topalignh['correct']),
    printstat('%', percent(topalignh['correct'], hcount)),
    printstat('correct sequence', topalignh['correct_sequence']),
    printstat('%', percent(topalignh['correct_sequence'], hcount)),
    printstat('correct left parent', topalignh['left_correct_parent']),
    printstat('%', percent(topalignh['left_correct_parent'], hcount)),
    printstat('correct right parent', topalignh['right_correct_parent']),
    printstat('%', percent(topalignh['right_correct_parent'], hcount)),
    '\n2 to {} alignment\n\n'.format(n),
    printstat('correct alignment', otheralignh['correct']),
    printstat('%', percent(otheralignh['correct'], hcount)),
    printstat('correct sequence', otheralignh['correct_sequence']),
    printstat('%', percent(otheralignh['correct_sequence'], hcount)),
    printstat('correct left parent', otheralignh['left_correct_parent']),
    printstat('%', percent(otheralignh['left_correct_parent'], hcount)),
    printstat('correct right parent', otheralignh['right_correct_parent']),
    printstat('%', percent(otheralignh['right_correct_parent'], hcount)),
])

# Emit both sections as one report.
print(nonhybsum + hybsum)

NON HYBRID STATS
------------------------------------------------------------
number of peptides.......................................100
Top alignment

correct alignment..........................................0
%..........................................................0
correct protein...........................................25
%.........................................................25
correct sequence...........................................0
%..........................................................0

2 to 5 alignment

number of peptides.......................................100
correct alignment..........................................0
%..........................................................0
correct protein............................................2
%..........................................................2
correct sequence...........................................0
%..........................................................0


HYBRID STATS
-------------------