## Imports 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import copy

from pyteomics import fasta
from collections import defaultdict

import os
import sys
# Make the parent and grandparent directories importable so that the
# project's `src` package can be found regardless of which of the two
# levels this notebook lives at.
for relative_dir in ('..', '../..'):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)
    
from src import runner
from src import gen_spectra
from src.postprocessing import review
from src import utils

# Multiple datasets post analysis
Run hypedSearch on multiple datasets, then post-process the results to see how well it performs overall.

### Define the data

In [2]:
# Each dataset is a 4-tuple of paths:
#   (mzML spectra file, spectrumMill results ssv, fasta database, dataset directory)
raw_prefix = '/Users/zacharymcgrath/Desktop/raw inputs/'

# both datasets are searched against the same mouse database
mouse_database = raw_prefix + 'mouse_database.fasta'

nod2_dir = raw_prefix + 'NOD2_E3/'
NOD2_data = (
    nod2_dir + 'NOD2_E3.mzML',
    nod2_dir + 'NOD2_E3_results.ssv',
    mouse_database,
    nod2_dir,
)

balb3_dir = raw_prefix + 'BALB3_E3/'
BALB3_data = (
    balb3_dir + 'BALB3_E3.mzML',
    balb3_dir + 'BALB3_E3.ssv',
    mouse_database,
    balb3_dir,
)

# the full list of datasets processed by the rest of the notebook
datasets = [NOD2_data, BALB3_data]


### Filter the data
We don't care about spectra that spectrumMill could not match, because we need a known answer to compare against. Additionally, we create a filtered database containing only the proteins that appear in the spectrumMill results.

In [3]:
def db_filter(db_file: str, results_file: str, output_fasta: str) -> None:
    '''
    Create the subset of proteins needed for the database search

    Every protein in db_file is indexed by a lower-cased name parsed from its
    fasta description. Each 'entry_name' in the results file is then looked up
    in that index, and the matching fasta entries are written (each once, in
    order of first appearance in the results) to output_fasta.

    Inputs:
        db_file:        (str)  the original fasta file
        results_file:   (str)  the results ssv file from spectrumMill
                               (';' separated, must have an 'entry_name' column)
        output_fasta:   (str)  the fasta file to write to

    Raises:
        IndexError:  a fasta description has fewer than three '|' separated fields
        ValueError:  a fasta description has no 'OS=' tag
        KeyError:    the results file has no 'entry_name' column
    '''

    # load all proteins into a dictionary keyed by lower-cased protein name
    # NOTE(review): the parsing below presumably targets UniProt style headers
    # ("db|ACCESSION|ID Protein name OS=organism ...") -- confirm
    db = {}
    with fasta.read(db_file) as entries:
        for entry in entries:
            # third '|' field, cut just before the ' OS=' tag
            name = entry.description.split('|')[2]
            name = name[:name.index('OS=')-1]
            # drop the leading identifier token, keeping only the name words
            name = ' '.join(name.split(' ')[1:])
            # later entries with the same name overwrite earlier ones
            db[name.lower()] = entry

    # load the results ssv into a dataframe
    res_df = pd.read_csv(results_file, sep=';')

    print(f'Number of results: {len(res_df.index)}')

    # rows without an entry name cannot be matched against the database, so
    # drop them rather than failing on a NaN that has no .lower()
    wanted_names = res_df['entry_name'].dropna().astype(str).str.lower()

    # keep the proteins that were identified; dict.fromkeys de-duplicates while
    # preserving first-seen order so the output file is identical run to run
    # (a set would shuffle the records between runs)
    filtered = list(dict.fromkeys(db[key] for key in wanted_names if key in db))

    print(f'Number of proteins in database was reduced from {len(db)} to {len(filtered)}')

    fasta.write(filtered, output_fasta, file_mode='w')

In [4]:
# Build a filtered fasta database for every dataset and append its path to the
# dataset tuple: (mzml, results ssv, database, prefix dir, filtered database)
updated_datasets = []

for dataset in datasets:

    # make a file name for the output
    # the prefix dir is read by position (index 3) instead of dataset[-1] so
    # that re-running this cell after `datasets` was already extended does not
    # mistake the filtered database path for the directory
    # NOTE(review): the name is derived from the results file, so the fasta
    # ends up with the results file's extension -- confirm this is intended
    output_fasta = dataset[3] + 'filtered_' + os.path.basename(dataset[1])

    db_filter(dataset[2], dataset[1], output_fasta)

    # only carry over the four original fields so re-runs do not keep growing
    # the tuple
    updated_datasets.append((*dataset[:4], output_fasta))

datasets = updated_datasets

Number of results: 1086
Number of proteins in database was reduced from 17028 to 279
Number of results: 971
Number of proteins in database was reduced from 17028 to 228


### Now run hyped search

In [5]:
# search settings applied to every dataset
min_pep = 3
max_pep = 30
tolerance = 20
precursor_tolerance = 10
peak_filter = 25
relative_abundance_filter = 0

# parameters that do not depend on the dataset being searched
shared_params = {
    'min_peptide_len': min_pep,
    'max_peptide_len': max_pep,
    'tolerance': tolerance,
    'precursor_tolerance': precursor_tolerance,
    'peak_filter': peak_filter,
    'relative_abundance_filter': relative_abundance_filter,
    'digest': '',
    'missed_cleavages': 0,
    'verbose': True,
    'DEBUG': False,
    'cores': 16,
    'truth_set': '',
    'n': 100
}

for dataset in datasets:

    # index 3 is the dataset directory, the last element is the database
    # file handed to the search
    spectra_dir = dataset[3]

    # dataset specific locations first, then the shared settings
    run_params = {
        'spectra_folder': spectra_dir,
        'database_file': dataset[-1],
        'output_dir': spectra_dir + 'output/',
        **shared_params
    }

    runner.run(run_params)

    # blank line between the logs of consecutive datasets
    print()

Loading database...
Done
Loading spectra...
Done
On batch 1/1
On protein 279/279 [100%]
Sorting the set of protein masses...
Done
Initializing other processors...
Done.
Creating an alignment for 1085/1086 [100%]
Finished search. Writting results to /Users/zacharymcgrath/Desktop/raw inputs/NOD2_E3/output/...
Could not make an alignment for 1/1086 spectra (0%)
Loading database...
Done
Loading spectra...
Done
On batch 1/1
On protein 228/228 [100%]
Sorting the set of protein masses...
Done
Initializing other processors...
Done.
Creating an alignment for 12165/12166 [100%]
Finished search. Writting results to /Users/zacharymcgrath/Desktop/raw inputs/BALB3_E3/output/...
Could not make an alignment for 1090/12166 spectra (8%)


### Keep track of the output data location

In [6]:
# one summary.json path per dataset, inside the 'output/' folder of the
# dataset directory (index 3 of each dataset tuple)
summary_suffix = 'output/summary.json'
output_data = [dataset_entry[3] + summary_suffix for dataset_entry in datasets]

### For each of the datasets, find out where the correct answer fell (no ties)

In [None]:
def dictify_table(df: pd.DataFrame, key: str) -> dict:
    '''
    Turn a pandas dataframe into a dictionary where the indices are the key specified

    Inputs:
        df:     (pd.DataFrame) the table to convert. It is not modified
        key:    (str)          the column whose values become the dictionary keys
    Outputs:
        (dict) one entry per row of the form
               {value of key column: {other column name: value, ...}}

    Raises:
        KeyError:   key is not a column of df
        ValueError: the values in the key column are not unique
    '''
    # set_index returns a new frame (it does not work in place), so the result
    # has to be used directly; orient='index' yields the row-wise mapping that
    # transposing and then calling to_dict() was meant to produce
    return df.set_index(key).to_dict(orient='index')

In [None]:
# tally of where the correct answer fell; missing keys start at 0
results = defaultdict(int)

for dataset, output in zip(datasets, output_data):

    # first we need to index the spectrumMill results by id
    # the results file is ';' separated, the same as when it is read in db_filter
    specMill_results = dictify_table(pd.read_csv(dataset[1], sep=';'), 'filename')

    # now we need to load the output data for this dataset only (`output`),
    # not the whole `output_data` list; the with block closes the file
    with open(output, 'r') as summary_file:
        hypedSearch_results = json.load(summary_file)

    # now go through every result in specMill, find the corresponding one in hypedSearch, and add to results
    