## Imports

In [1]:
import pandas as pd
import json

# 1. Hacky False Positive Calculation
To keep things simple, we are going to compute false positives in a slightly unusual way. We have a dataset of 280 proteins that we know to be the actual source. We will run hypedsearch with both this set and a set of 300 proteins that are NOT in this set, and then compare the results.
1. Load in the truth set
2. Create another set of proteins that are exclusive of the truth set
3. Run hypedsearch on both sets
4. Count the number of spectra for which the non-truth set produced a better result than the truth set

## Constants

In [2]:
# NOTE: absolute, machine-specific output directory; edit to match your setup
results_dir = '/Users/zacharymcgrath/Desktop/Experiment output/FP/'
truth_results_json = results_dir + 'truth/summary.json'
non_truth_results_json = results_dir + 'not_truth/summary.json'

# read each summary inside a context manager so the file handle is always closed
with open(truth_results_json, 'r') as truth_file:
    truth_results = json.load(truth_file)

with open(non_truth_results_json, 'r') as non_truth_file:
    non_truth_results = json.load(non_truth_file)

# save them in a dictionary by their id (spectrum id minus the '.pkl' suffix)
# so that the two runs can be compared spectrum by spectrum
ided_truth = {v['spectrum']['id'].replace('.pkl', ''): v for v in truth_results.values()}
ided_non_truth = {v['spectrum']['id'].replace('.pkl', ''): v for v in non_truth_results.values()}


## Compare and count

In [11]:
# Both dictionaries are keyed by spectrum id.
#   truth_greater:     spectra where the truth-set run is considered the winner
#   non_truth_greater: spectra where the non-truth-set run is considered the winner
truth_greater = {}
non_truth_greater = {}

for k, v in ided_truth.items():

    # the spectrum only has a result in the truth run
    if k not in ided_non_truth:
        truth_greater[k] = v
        continue

    nt_v = ided_non_truth[k]
    truth_alignments = v['alignments']
    non_truth_alignments = nt_v['alignments']

    # neither run produced an alignment, so there is nothing to compare
    if len(truth_alignments) == 0 and len(non_truth_alignments) == 0:
        continue

    # only the non-truth run produced alignments: record the non-truth result
    # (the truth result has nothing in it worth keeping)
    if len(truth_alignments) == 0:
        non_truth_greater[k] = nt_v
        continue

    # only the truth run produced alignments
    if len(non_truth_alignments) == 0:
        truth_greater[k] = v
        continue

    # see which score was greater; max() avoids sorting just to read one value
    best_truth = max(a['total_score'] for a in truth_alignments)
    best_non_truth = max(a['total_score'] for a in non_truth_alignments)

    # get all sequences that reach the best score in each run
    best_truth_seqs = {a['sequence'] for a in truth_alignments if a['total_score'] >= best_truth}
    best_non_truth_seqs = {a['sequence'] for a in non_truth_alignments if a['total_score'] >= best_non_truth}

    entry = (v, nt_v)

    # the truth run wins on a strictly higher score, or when both runs agree on
    # at least one top-scoring sequence
    if best_truth > best_non_truth or not best_truth_seqs.isdisjoint(best_non_truth_seqs):
        truth_greater[k] = entry
    else:
        non_truth_greater[k] = entry

# spectra that only have a result in the non-truth run
for k, v in ided_non_truth.items():
    if k not in ided_truth:
        non_truth_greater[k] = v

In [12]:
# number of spectra in each bucket, one count per line (truth first)
print(len(truth_greater), len(non_truth_greater), sep='\n')

751
323


# 2. Get overlap with SpectrumMill results

## 1. Load the `ssv` file

In [1]:

# SpectrumMill results export; fields are separated by semicolons
specmilresultsfile = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
specmilresults = pd.read_csv(filepath_or_buffer=specmilresultsfile, delimiter=';')


In [2]:
# show the SpectrumMill row(s) reported for one example spectrum
specmilresults[specmilresults['filename'].eq('NOD2_E3.7065.7065.2')]

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
6,7,NOD2_E3.7065.7065.2,2,12.55,6.28,7.14,81.3,282000.0,(G),DLPVNSPMTKG,(D),26.08,15.0,579.7953,MOUSE,HYBRID: mouse ins2C EVEDPQVAQLELGGGPGAGD-LPVNS...


### 1.a Get the hybrids out of the dataframe

In [3]:
# rows whose entry name contains 'HYBRID'; na=False treats a missing entry_name
# as "not a hybrid" instead of producing an invalid (NaN-containing) mask
specmilhybs = specmilresults.loc[specmilresults['entry_name'].str.contains('HYBRID', na=False)]
# filename -> sequence; if a filename repeats, the last row wins
filenameindexedhybs = dict(zip(specmilhybs['filename'], specmilhybs['sequence']))

### 1.b Get the non hybrids out of the dataframe

In [4]:
# rows whose entry name does NOT contain 'HYBRID'; na=False keeps the mask
# strictly boolean so that `~` works even when entry_name has missing values
# (such rows are then counted as non hybrids)
specmilnonhybs = specmilresults.loc[~specmilresults['entry_name'].str.contains('HYBRID', na=False)]
# filename -> sequence; if a filename repeats, the last row wins
filenameindexednonhybs = dict(zip(specmilnonhybs['filename'], specmilnonhybs['sequence']))

## 2. Check to see the overlap in results

In [6]:
# All four dictionaries are keyed by spectrum id.
#   found / missed:         whether any alignment matched the SpectrumMill sequence
#   hyb_hits / nonhyb_hits: the found spectra, split by SpectrumMill hybrid / non hybrid
missed = {}
found = {}
hyb_hits = {}
nonhyb_hits = {}

# iterate the parsed truth results (truth_results_json is only the path string)
for k, v in truth_results.items():
    _id = v['spectrum']['id'].replace('.pkl', '')

    if _id in filenameindexedhybs:

        # replace all Is and Ls with Bs to allow for slight misses in the hybrid case
        spec_mil_res = filenameindexedhybs[_id].replace('I', 'B').replace('L', 'B')

        # any() stops at the first alignment whose sequence matches
        f = any(
            a['sequence'].replace('I', 'B').replace('L', 'B') == spec_mil_res
            for a in v['alignments']
        )
        if f:
            hyb_hits[_id] = v

    else:

        # .get() so a spectrum absent from the SpectrumMill non hybrids is
        # counted as missed instead of raising a KeyError
        spec_mil_res = filenameindexednonhybs.get(_id)
        f = spec_mil_res is not None and any(
            a['sequence'] == spec_mil_res for a in v['alignments']
        )
        if f:
            nonhyb_hits[_id] = v

    if f:
        found[_id] = v
    else:
        missed[_id] = v

print(f'Found {len(hyb_hits)}/{len(filenameindexedhybs)} hybrids')
print(f'Found {len(nonhyb_hits)}/{len(filenameindexednonhybs)} non hybrids')
    

Found 5/8 hybrids
Found 528/1078 non hybrids


# 3. Calculate the results overlapped between the FP and the hybrid 