## Imports

In [1]:
import pandas as pd
import json
from pyteomics import fasta
import random

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src import runner

# Hacky False Positive Calculation
For the sake of making it easier, we are going to do the FP in a bit of a weird way. We have a dataset with 280 proteins we know to be the acutal source. We will run hypedsearch with both this set and a set of 300 prots that are NOT in this set. Then we will compare results. 
1. Load in the truth set
2. Create another set of proteins that are exclusive of the truth set
3. Run hypedsearch on both sets
4. Count the number of results that the non truth set had over the truth set

## Constants

In [2]:
specPath = '/Users/zacharymcgrath/Desktop/nod2 data/filteredSpec/'

truth_set = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
not_truth_set = '/Users/zacharymcgrath/Desktop/nod2 data/not_truth_subset.fasta'
whole_db = '/Users/zacharymcgrath/Desktop/nod2 data/all data/NOD2_mouse_database.fasta'

outputDir = '/Users/zacharymcgrath/Desktop/Experiment output/FP/'
minPep = 3
maxPep = 30
tolerance = 20
relative_abundance_filter = 0.0
precursor_tolerance = 3
peak_filter = 25
verbose = True

truth_run_params = {
    'spectra_folder': specPath,
    'database_file': truth_set,
    'output_dir': outputDir + 'truth/',
    'min_peptide_len': minPep,
    'max_peptide_len': maxPep,
    'tolerance': tolerance,
    'precursor_tolerance': precursor_tolerance,
    'peak_filter': peak_filter, 
    'relative_abundance_filter': relative_abundance_filter,
    'digest': 'trypsin', 
    'missed_cleavages': 2,
    'verbose': verbose,
    'DEBUG': False
}

non_truth_run_params = {
    'spectra_folder': specPath,
    'database_file': not_truth_set,
    'output_dir': outputDir + 'not_truth/',
    'min_peptide_len': minPep,
    'max_peptide_len': maxPep,
    'tolerance': tolerance,
    'precursor_tolerance': precursor_tolerance,
    'peak_filter': peak_filter, 
    'relative_abundance_filter': relative_abundance_filter,
    'digest': 'trypsin', 
    'missed_cleavages': 2,
    'verbose': verbose, 
    'DEBUG': False
}

## 1. Load in truth set

In [3]:
get_name = lambda x: x.split('|')[-1].split(' ')[0]

ts = {get_name(entry.description): None for entry in fasta.read(truth_set)}

## 2. Create the other set of 280 proteins

In [4]:
num_keep = 280
all_prots = {get_name(entry.description): entry for entry in fasta.read(whole_db)}
all_prot_keys = list(all_prots.keys())

saving = {}
# get num_keep
for i in range(num_keep):
    print(f'\rchoosing prot {i+1}/{num_keep}', end='')
    
    while not False:
        
        # pick a protein 
        selected = all_prot_keys[random.randint(0, len(all_prot_keys) - 1)]
        
        # see if its in saving or ts
        if selected in ts or selected in saving:
            continue
            
            
        # flip it and add "reverse" to the description
        entry = all_prots[selected]
        
        rev_seq = entry.sequence[::-1]
        split_desc = entry.description.split('|')
        split_desc[-1] = 'REVERSE_' + split_desc[-1]
        new_desc = '|'.join(split_desc)
        
        saving[selected] = entry._replace(description=new_desc, sequence=rev_seq)
        break
            
# write to file
fasta.write([v for _, v in saving.items()], open(not_truth_set, 'w'))


choosing prot 1/280choosing prot 2/280choosing prot 3/280choosing prot 4/280choosing prot 5/280choosing prot 6/280choosing prot 7/280choosing prot 8/280choosing prot 9/280choosing prot 10/280choosing prot 11/280choosing prot 12/280choosing prot 13/280choosing prot 14/280choosing prot 15/280choosing prot 16/280choosing prot 17/280choosing prot 18/280choosing prot 19/280choosing prot 20/280choosing prot 21/280choosing prot 22/280choosing prot 23/280choosing prot 24/280choosing prot 25/280choosing prot 26/280choosing prot 27/280choosing prot 28/280choosing prot 29/280choosing prot 30/280choosing prot 31/280choosing prot 32/280choosing prot 33/280choosing prot 34/280choosing prot 35/280choosing prot 36/280choosing prot 37/280choosing prot 38/280choosing prot 39/280choosing prot 40/280choosing prot 41/280choosing prot 42/280choosing prot 43/280choosing prot 44/280choosing prot 45/280choosing prot 46/280choosing prot 47/280choosing prot 48/280

<_io.TextIOWrapper name='/Users/zacharymcgrath/Desktop/nod2 data/not_truth_subset.fasta' mode='w' encoding='UTF-8'>

## 3. Run hyped search on both

In [5]:
%%time
runner.run(truth_run_params)

Loading database...
Done
Loading spectra...
Done
On batch 1/1
On protein 280/280 [100%]
Sorting the set of protein masses...
Initializing other processors...
Done.
Creating an alignment for 1085/1086 [100%]
Finished search. Writting results to /Users/zacharymcgrath/Desktop/Experiment output/FP/truth/...
Could not make an alignment for 260/1086 spectra (23%)
CPU times: user 4min 4s, sys: 17.4 s, total: 4min 21s
Wall time: 16min 31s


In [6]:
%%time
runner.run(non_truth_run_params)

Loading database...
Done
Loading spectra...
Done
On batch 1/1
On protein 274/274 [100%]
Sorting the set of protein masses...
Initializing other processors...
Done.
Creating an alignment for 1085/1086 [100%]
Finished search. Writting results to /Users/zacharymcgrath/Desktop/Experiment output/FP/not_truth/...
Could not make an alignment for 312/1086 spectra (28%)
CPU times: user 6min 4s, sys: 32.1 s, total: 6min 36s
Wall time: 18min


## 4. Compare and count

In [7]:
truth_results_json = outputDir + 'truth/summary.json'
non_truth_results_json = outputDir + 'not_truth/summary.json'

truth_results = json.load(open(truth_results_json, 'r'))
non_truth_results = json.load(open(non_truth_results_json, 'r'))

# save them in a dictionary by their id so that we can compare
ided_truth = {v['spectrum']['id'].replace('.pkl', ''): v for _, v in truth_results.items()}
ided_non_truth = {v['spectrum']['id'].replace('.pkl', ''): v for _, v in non_truth_results.items()}

In [8]:
truth_greater = {}
non_truth_greater = {}

for k, v in ided_truth.items():
    
    if k not in ided_non_truth:
        truth_greater[k] = v
        
    else:
        nt_v = ided_non_truth[k]
        
        # see if we have anything in our alignments
        if len(v['alignments']) == 0 and len(nt_v['alignments']) == 0:
            continue
            
        elif len(v['alignments']) == 0 and len(nt_v['alignments']) > 0:
            non_truth_greater[k] = v
            continue
            
        elif len(v['alignments']) >0 and len(nt_v['alignments']) == 0:
            truth_greater[k] = v 
            continue

        # see which score was greater
        best_truth = sorted(v['alignments'], key=lambda x: x['total_score'], reverse=True)[0]['total_score']
        best_non_truth = sorted(nt_v['alignments'], key=lambda x: x['total_score'], reverse=True)[0]['total_score']
        
        # get all sequences with this score
        best_truth_seqs = [x['sequence'] for x in v['alignments'] if x['total_score'] >= best_truth]
        best_non_truth_seqs = [x['sequence'] for x in nt_v['alignments'] if x['total_score'] >= best_non_truth]
        
        entry = (v, nt_v)
        
        if best_truth > best_non_truth:
            truth_greater[k] = entry
            
        elif any([x == y for x in best_truth_seqs for y in best_non_truth_seqs]):
            truth_greater[k] = entry
            
        else:
            non_truth_greater[k] = entry
            
for k, v in ided_non_truth.items():
    if k not in ided_truth and k not in truth_greater and k not in non_truth_greater:
        non_truth_greater[k] = v

In [9]:
print(len(truth_greater))
print(len(non_truth_greater))

582
299


In [10]:
for k, v in non_truth_greater.items():
    if isinstance(v, dict):
        continue
    t_seqs = [x['sequence'] for x in v[0]['alignments']]
    nt_seqs = [x['sequence'] for x in v[1]['alignments']]
    print(f'Truth: {t_seqs} \t non truth: {nt_seqs}')

Truth: ['DLQTLALIL', 'DLQTLALII', 'DLQTLSPLL', 'DLQTLALEV', 'DLQTLANLL'] 	 non truth: ['DIQTLPSLL', 'DIQTLSPLL', 'DLQDGAIPGL', 'DIQDGAIPGL', 'DLQDQLVAP']
Truth: ['SLSLMYEAKAPLAGQVNTMVTTSTTTTVAK', 'SLSIEEFMRPLAGQVNTMVTTSTTTTVAK', 'ISSIDEFCALLGAPFAPLHGGGSLHYSLSR', 'SLLSSAQKIISSSAGQVNTMVTTSTTTTVAK'] 	 non truth: ['TVLSELTWFNALIPAWQYGLFDFVQR', 'SLLSEPLTSFNALIPAWQYGLFDFVQR', 'SISLTFPWFNALIPAWQYGLFDFVQR', 'SLLSAGYHTFNALIPAWQYGLFDFVQR', 'SLVTEYHTFNALIPAWQYGLFDFVQR']
Truth: ['SSYRNITTYK', 'SSGDFNITTYK', 'SSNGFNITTYK', 'SSMSTNITTYK', 'SSTDCNITTYK'] 	 non truth: ['SSIPNPPQTYK', 'SSPINPPQTYK', 'SSPARPPQTYK', 'SSAPRPPQTYK', 'SSKPVPPQTYK']
Truth: ['DIQIEQLGGSPG', 'DIQIEQLNSGP', 'DIQIEQLNSPG', 'DIQIEQISGGPG', 'DIQIEQLSGGPG'] 	 non truth: ['DVAAQLELNSGP', 'DVAAQLELNGSP', 'DVAAQLEVQSPG', 'TASGGIQSHAER', 'ATSNIQSHAER']
Truth: ['DPQTVAGAFSYEESPG', 'DPQTVAGAFSYSATSPG', 'DPQTVAGAFSYGEASGP'] 	 non truth: ['RNCLELQQSSANYK', 'ATSQDLLQQSSANYK', 'DTKAAIEQSPMHGPAT', 'DTKAAIEQLQESSFS', 'DTKAAIEQVTNGEFM']
Truth: 