# Create cis spliced peptides
Since neo-fusion can only run with cis spliced peptides, we need to compare our results to theirs on those terms

## 1. Load fasta file

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.file_io import fasta

fasta_file = '../../testing framework/data/databases/100prots.fasta'
database = fasta.read(fasta_file, True)

database = {x['name']: x for x in database}

## 2.  Generate the peptides, hybrid proteins and peptides

In [2]:
from collections import namedtuple
from random import randint
hybridprotein = namedtuple('hybridprotein', ['protein', 'section_removed', 'start_index_removed', 'end_index_removed', 'sequence', 'name'])
hybridpeptide = namedtuple('hybridpeptide', ['protein', 'sequence', 'start_index'])
maxlen = 25
maxpeplen = 26
minpeplen = 6

def create_cis_spliced_hybrid_proteins(protdict, n):
    protdictkeys = list(protdict.keys())
    lenprotdict = len(protdictkeys)
    hybrids = []
    for i in range(n):
        prot = protdict[protdictkeys[randint(0, lenprotdict-1)]]
        start_pos = randint(maxpeplen, len(prot['sequence']) - maxlen)
        length = randint(minpeplen, maxlen)
        hp = hybridprotein(prot['name'], prot['sequence'][start_pos: start_pos+length], start_pos, start_pos+length-1, prot['sequence'][:start_pos] + prot['sequence'][start_pos+length:], 'hybrid-{}-protein'.format(i))
        hybrids.append(hp)
    return hybrids
        
def create_cis_spliced_hybrid_peptides(listhybridprots, n):
    hybridpeps = []
    for i in range(n):
        a = randint(0, len(listhybridprots)-1)
        hybprot = listhybridprots[a]
        peplen = randint(minpeplen, maxpeplen)
        half = round(peplen/2)
        hp = hybridpeptide(hybprot.name, hybprot.sequence[hybprot.start_index_removed - half - 1: hybprot.start_index_removed + half], hybprot.start_index_removed - half - 1)
        hybridpeps.append(hp)
    return hybridpeps
                                                                                                                            

In [3]:
from modules.sequence_generation import proteins, peptides
test_directory = '../data/testing_output/cis_spliced/'

num_hybs = 5
min_length= 5
max_length = 35
num_peptides = 1000
min_cont = 3 #min contribution for each side of a hybrid

# create peptides
non_hybrid_peps = peptides.gen_peptides([x for _, x in database.items()], num_peptides, min_length=min_length, max_length=max_length, digest='random', dist='beta')

hybridprots = create_cis_spliced_hybrid_proteins(database, num_hybs)
hybridpeps = create_cis_spliced_hybrid_peptides(hybridprots, num_hybs**2)

all_proteins_raw = [x for _,x in database.items()] + [dict(x._asdict()) for x in hybridprots]
all_peptides_raw = non_hybrid_peps + [dict(x._asdict()) for x in hybridpeps]

peps = {}
for i, pep in enumerate(all_peptides_raw):
    peps[i] = pep
    peps[i]['scan_no'] = i
    

## 2.1 Save this info so that I can analyze it later from Neo-Fusion

In [4]:
import json
experiment_info_file_name = 'experiment_info.json'

exp = {'database': fasta_file, 'peptides': peps}
with open(test_directory + experiment_info_file_name, 'w') as o:
    json.dump(exp, o)


## 3. Generate spectra

In [5]:
from src.spectra import gen_spectra
from src.utils import utils
from modules.sequence_generation import write_spectra, realistic_spectra

utils.make_dir(test_directory)

spectra = [x._asdict() for x in realistic_spectra.gen_realistic_spectra([pep['sequence'] for _, pep in peps.items()])]

write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


'../data/testing_output/cis_spliced/testSpectraFile.mzML'

## test and fix any mess ups

In [6]:
import xml.etree.ElementTree as ET
ET.register_namespace('', "http://psi.hupo.org/ms/mzml")
tree = ET.parse(test_directory + 'testSpectraFile.mzML')
mzml = tree.find('{http://psi.hupo.org/ms/mzml}mzML')
# add an id to the mzml
mzml.set('id', 'testSpectraFileFixed')
mzml.set('xmlns', "http://psi.hupo.org/ms/mzml")
mzml.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
# remove some info from the mzml header [accession, version]
toDelMzml = ['accession']
for todel in toDelMzml:
    del mzml.attrib[todel]
    
# add file description stuff to the fileDescription tag
fileDescription = mzml.find('{http://psi.hupo.org/ms/mzml}fileDescription')


sourceFileElement = ET.fromstring('<sourceFileList count="1"> <sourceFile id="NOD2_E3.mzXML" name="NOD2_E3.mzXML" location="file:///C:/Users/zachmcgrath/Downloads"> <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/> <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML format" value=""/> <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="89ecb0dd31ca3a2fdf5ef2c4f5341f6e5e9f06f0"/> </sourceFile> </sourceFileList>')
fileDescription.append(sourceFileElement)

instrumentConfigurationList = mzml.find('{http://psi.hupo.org/ms/mzml}instrumentConfigurationList')

instrumentConfiguration = ET.fromstring(' <instrumentConfiguration id="IC1"> <componentList count="3"> <source order="1"> <userParam name="msIonisation" value="HPLC-Chip/MS"/>  </source> <analyzer order="1"> <userParam name="msMassAnalyzer" value="Q-TOF"/>  </analyzer> <detector order="1"> <userParam name="msDetector" value="ADC"/> </detector> </componentList> </instrumentConfiguration>')
for child in instrumentConfigurationList:
    del child

instrumentConfigurationList.append(instrumentConfiguration)

run = mzml.find('{http://psi.hupo.org/ms/mzml}run')
spectrumList = run.find('{http://psi.hupo.org/ms/mzml}spectrumList')
print(len(spectrumList))
for spectrumElement in spectrumList:
    # need to add this
    # <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>
    centroidElement = ET.Element('{http://psi.hupo.org/ms/mzml}cvParam', attrib={'name': 'centroid spectrum', 'accession':'MS:1000127', 'value': ''})
    pl = spectrumElement.find('{http://psi.hupo.org/ms/mzml}precursorList')
    bdal = spectrumElement.find('{http://psi.hupo.org/ms/mzml}binaryDataArrayList')
    if bdal is None or pl is None:
        print(spectrumElement)
        spectrumList.remove(spectrumElement)
        
    # add the range 
    scanListEl = spectrumElement.find('{http://psi.hupo.org/ms/mzml}scanList')
    scanEl = scanListEl.find('{http://psi.hupo.org/ms/mzml}scan')[0]
    scanWindowListElement = ET.fromstring('<scanWindowList count="1"> <scanWindow> <cvParam cvRef="MS" accession="MS:1000501" value="0" name="scan window lower limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> <cvParam cvRef="MS" accession="MS:1000500" value="10000" name="scan window upper limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> </scanWindow></scanWindowList>')
    scanEl.append(scanWindowListElement)
    
    spectrumElement.append(centroidElement)

tree.write(test_directory + 'testSpectraFileFixed.mzML', encoding='utf-8', xml_declaration=True)
    

1025


In [7]:
'''
inside <instrumentConfiguration>
<componentList count="3">
          <source order="1">
            <userParam name="msIonisation" value="HPLC-Chip/MS"/>
          </source>
          <analyzer order="1">
            <userParam name="msMassAnalyzer" value="Q-TOF"/>
          </analyzer>
          <detector order="1">
            <userParam name="msDetector" value="ADC"/>
          </detector>
        </componentList>
        
inside <fileDescription>     
<sourceFileList count="2">
        <sourceFile id="NOD2_E3.mzXML" name="NOD2_E3.mzXML" location="file:///C:\Users\zachmcgrath\Downloads">
          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>
          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML format" value=""/>
          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="89ecb0dd31ca3a2fdf5ef2c4f5341f6e5e9f06f0"/>
        </sourceFile>
      </sourceFileList>
      
inside run -> spectrumList -> spectrum -> scanList -> scan
<scanWindowList count="1">
                <scanWindow>
                  <cvParam cvRef="MS" accession="MS:1000501" value="0" name="scan window lower limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" />
                  <cvParam cvRef="MS" accession="MS:1000500" value="4000" name="scan window upper limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" />
                </scanWindow>
              </scanWindowList>
'''

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 568-569: truncated \UXXXXXXXX escape (<ipython-input-7-a5b7a281105d>, line 23)