# Convert SpectrumMill ssv results to experiment json

### Load ssv file into pandas df (because it's easy)

In [1]:
import pandas as pd

# Semicolon-separated SpectrumMill results export to convert.
results_file = '/Users/zacharymcgrath/Downloads/NOD2_E3_results.ssv'
# The separator has to be passed by keyword: pandas 2.x accepts only the path
# positionally, so `pd.read_csv(results_file, ';')` raises a TypeError there.
df = pd.read_csv(results_file, sep=';')
df.head(2)

Unnamed: 0,number,filename,parent_charge,score,deltaForwardReverseScore,deltaRank1Rank2Score,percent_scored_peak_intensity,totalIntensity,previous_aa,sequence,next_aa,retentionTimeMin,chromatographicPeakWidthSec,parent_m_over_z,species,entry_name
0,1,NOD2_E3.13446.13477.2,2,10.1,10.1,9.91,84.5,183000.0,(E),DPQVEQLEL,(-),48.35,26.0,535.7725,MOUSE,ins1C18
1,2,NOD2_E3.18005.18246.2,2,12.84,11.07,12.84,97.8,40000000.0,(G),DLQTLALEVA,(-),65.78,29.0,536.8007,MOUSE,ins1C3


In [2]:
# Column of the results table that holds the peptide sequence.
SEQUENCE_COL_NAME = 'sequence'
# Column of the results table that holds the parent protein's name; its values
# are looked up as keys of the protein database dictionary later on.
PARENT_PROTEIN_NAME = 'entry_name'

In [3]:
import sys
sys.path.append('/Users/zacharymcgrath/Documents/Layer_Research/Proteomics_Experiments/Database_Experiments/src')
from file_io import fasta

# The database is indicated by the "species" column of the results (MOUSE), so
# the matching proteome FASTA was downloaded manually from UniProt.
database_file_name = '/Users/zacharymcgrath/Downloads/uniprot-proteome_UP000000589.fasta'
# Parse the FASTA with the project's reader. As the printed sample shows, each
# entry is a dict with 'name', 'sequence', 'identifier' and
# 'human_readable_name' keys.
db_list = fasta.read(database_file_name, is_uniprot=True)
print(db_list[1])

{'name': 'SDHL_MOUSE', 'sequence': 'MAAQESLHVKTPLRDSMALSKLAGTSVFLKMDSSQPSGSFKIRGIGHLCKMKAKQGCRHFVCSSAGNAGMATAYAARRLGIPATIVVPNTTPALTIERLKNEGATVEVVGEMLDEAIQVAKALEKNNPGWVYISPFDDPLIWEGHTSLVKELKETLSAKPGAIVLSVGGGGLLCGVVQGLREVGWEDVPIIAMETFGAHSFHAAIKEGKLVTLPKITSVAKALGVNTVGAQTLKLFYEHPIFSEVISDQEAVSALEKFVDDEKILVEPACGAALAAVYSRVVCRLQDEGRLQTPLASLVVIVCGGSNISLAQLQALKVQLGLNGLPE', 'identifier': 'Q8VBT2', 'human_readable_name': 'L-serine dehydratase/L-threonine deaminase'}


#### switch list representation of database to dictionary keyed on the 'human_readable_name' field

In [4]:
from collections import defaultdict
# Index the database by human-readable protein name. Several database entries
# can share one such name, so every key maps to a list of candidate records.
db_dict = defaultdict(list)

for entry in db_list:
    record = {
        'sequence': entry['sequence'],
        'id': entry['identifier'],
        'name': entry['name'],
    }
    db_dict[entry['human_readable_name']].append(record)

In [5]:
# Spot-check one human-readable name; as the output shows, a single name can
# map to several database entries.
db_dict['Peroxiredoxin-6']

[{'sequence': 'MPGGLLLGDEAPNFEANTTIGRIRFHDFLGDSWGILFSHPRDFTPVCTTELGRAAKLAPEFAKRNVKLIALSIDSVEDHLAWSKDINAYNGETPTEKLPFPIIDDKGRDLAILLGMLDPVEKDDNNMPVTARVVFIFGPDKKLKLSILYPATTGRNFDEILRVVDSLQLTGTKPVATPVDWKKGESVMVVPTLSEEEAKQCFPKGVFTKELPSGKKYLRYTPQP',
  'id': 'O08709',
  'name': 'PRDX6_MOUSE'},
 {'sequence': 'MKWHLKMRWGILFSHPRDFTPVCTTELGRAAKLAPEFAKRNVKLIALSIDSVEDHLAWSKDINAYNGETPTEKLPFPIIDDKGRDLAILLGMLDPVEKDANNMPVTARVVFIFGPDKKLKLSILYPATTGRNFDEILRVVDSLQLTGTKPVATPVDWKKGESVMVVPTLSEEEAKQCFPKGVFTKELPSGKKYLRYTPQP',
  'id': 'D3Z0Y2',
  'name': 'D3Z0Y2_MOUSE'},
 {'sequence': 'MPGGLLLGDEAPNFEANTTIGRIRFHDFLGDSWGILFSHPRDFTPVCTTELGRAAKLAPEFAKRNVKLIALSIDSVEDHLAWSKDINAYNGETPTEKLPFPIIDDKGRDLAILLGMLDPVEKDANNMPVTARVVFIFGPDKKLKLSILYPATTGRNFDEILRVVDSLQLTGTKPVATPVDWKKGESVMVVPTLSEEEAKQCFPKGVFTKELPSGKKYLRYTPQP',
  'id': 'Q6GT24',
  'name': 'Q6GT24_MOUSE'},
 {'sequence': 'MPGGLLLGDEAPNFEANTTIGRIRFHDFLGDSTSMLTMVKHPRKSCHFPSLMIRAGTLPSFWACWIQSRRTLTTCL',
  'id': 'A0A0A6YXQ7',
  'name': 'A0A0A6YXQ7_MOUSE'}]

### we now have all of our proteins indexable in db_dict by the 'entry_name' values in df

## Using the above info, create info part of the dictionary 
### Structure
The high level structure is
```python
{'experiment_info': {
    'proteins': [],
    'peptides': []
}}
```
Protein structure is 
```python
{
    "name": str,
    "human_readable_name": str,
    "sequence": str,
    "identifier": str
}
```
peptide structure is 
```python
{
    "peptide_name": "peptide_00",
    "peptide_sequence": "ALWGPDPAAAFVNQH",
    "parent_name": str,
    "parent_human_readable_name": str,
    "parent_sequence": str,
    "starting_position": 14,
    "ending_position": 28 #inclusive
}
```

In [7]:
import regex as re
# Output container; the protein and peptide lists are filled in further down.
exp = {
    'experiment_info': {
        'proteins': [],
        'peptides': [],
    },
}

# Row count of the results table, reused for the summary message and for the
# zero-padding width of the generated peptide names.
start_len = len(df)
fill_zeroes = len(str(start_len))

# Rows skipped because their parent protein name is not a key of db_dict.
missing = 0

# Number of peptides attributed to each parent protein, keyed by protein 'name'.
protein_count = defaultdict(int)

def rm_parens(s):
    """Return `s` with every parenthesised group (parentheses included) removed.

    When `s` contains an opening parenthesis, the original string and the
    stripped result are both printed so the substitution can be inspected.
    """
    stripped = re.sub(r'\([^)]*\)', '', s)
    if '(' in s:
        print(s)
        print(stripped)
    return stripped

def get_parent_info(name, pepseq):
    """Return the database record under `name` whose sequence contains `pepseq`.

    Parameters:
        name: key into the module-level `db_dict`.
        pepseq: peptide sequence to search for inside each candidate's
            'sequence' string.

    Returns:
        The first candidate record containing `pepseq`. If the value stored
        under `name` is not a list, that value is returned as-is. Returns None
        when `name` is unknown or no candidate contains `pepseq`; callers must
        handle that case.
    """
    # Use .get() rather than db_dict[name]: db_dict is a defaultdict, so plain
    # indexing would silently insert an empty list for every unknown name.
    candidates = db_dict.get(name)
    if candidates is None:
        return None
    if not isinstance(candidates, list):
        return candidates
    for candidate in candidates:
        if pepseq in candidate['sequence']:
            return candidate
    return None
        
# Rows whose parent name is known but whose peptide is not contained in any
# candidate parent sequence. Counted separately from `missing`, which only
# covers unknown parent names.
unmatched = 0

for idx, row in df.iterrows():
    parenthmnname = row[PARENT_PROTEIN_NAME]
    if parenthmnname not in db_dict:
        missing += 1
        continue
    pepname = 'peptide_' + str(idx).zfill(fill_zeroes)
    pepseq = rm_parens(row[SEQUENCE_COL_NAME])
    parent_info = get_parent_info(parenthmnname, pepseq)
    if parent_info is None:
        # No candidate parent contains this peptide. Subscripting None here
        # used to abort the whole conversion with a TypeError.
        unmatched += 1
        continue
    parentname = parent_info['name']
    parentseq = parent_info['sequence']
    # Zero-based offsets into the parent sequence; the end offset is inclusive.
    startpos = parentseq.index(pepseq)
    endpos = startpos + len(pepseq) - 1
    exp['experiment_info']['peptides'].append({
        'peptide_name': pepname,
        'peptide_sequence': pepseq,
        'parent_name': parentname,
        'parent_human_readable_name': parenthmnname,
        'parent_sequence': parentseq,
        'starting_position': startpos,
        'ending_position': endpos
    })
    protein_count[parentname] += 1

print('Missing {} of {} peptides due to parent names'.format(missing, start_len))
if unmatched:
    print('Skipped {} of {} peptides not found in any parent sequence'.format(unmatched, start_len))

Missing 12 of 1086 peptides due to parent names


In [8]:
from copy import deepcopy
# Collect a record for every database protein that had at least one peptide
# attributed to it (i.e. whose 'name' is a key of protein_count).
# NOTE(review): records carry the accession under 'id', while the protein
# structure described earlier in this notebook lists "identifier" - confirm
# which key downstream readers expect.
matched_proteins = []
for hmnrdblname, entries in db_dict.items():
    # db_dict values are lists; a bare record is treated as a one-item list so
    # both shapes go through the same path.
    if not isinstance(entries, list):
        entries = [entries]
    for entry in entries:
        if entry['name'] not in protein_count:
            continue
        # Copy only the records that are kept, so db_dict itself is never
        # modified and the rest of the database is not copied needlessly.
        el = deepcopy(entry)
        el['human_readable_name'] = hmnrdblname
        matched_proteins.append(el)

# Assign instead of appending so that re-running this cell does not duplicate
# the protein entries.
exp['experiment_info']['proteins'] = matched_proteins

In [9]:
# Number of protein records collected into the experiment dictionary.
print(len(exp['experiment_info']['proteins']))

280


## dump this to a file

In [10]:
from file_io import JSON

# Destination of the experiment JSON (machine-specific absolute path).
file_name = '/Users/zacharymcgrath/Desktop/Experiment output/NOD2_E3/experiment_data.json'
# Hand the assembled `exp` dictionary to the project's JSON helper; presumably
# this serialises it to `file_name` - confirm against file_io.JSON.
JSON.save_dict(file_name, exp)