# Create cis spliced peptides
Since Neo-Fusion can only run with cis-spliced peptides, we need to compare our results to theirs on those terms.

## 1. Load fasta file

In [35]:
import os
import sys
# Put the parent and grandparent directories on sys.path (once each) so the
# project imports used by this notebook can be resolved.
for module_path in (os.path.abspath(os.path.join('..')), os.path.abspath(os.path.join('../..'))):
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

from src.file_io import fasta

# Read the protein database and index its entries by their 'name' field.
fasta_file = '../../testing framework/data/databases/100prots.fasta'
database = {entry['name']: entry for entry in fasta.read(fasta_file, True)}

## 2. Generate the peptides, hybrid proteins and hybrid peptides

In [36]:
from collections import namedtuple
from random import randint
# Record describing a protein with one internal section spliced out (cis splicing).
#   protein             -- name of the source protein
#   section_removed     -- residues cut out of the source sequence
#   start_index_removed -- index (in the source sequence) of the first removed residue;
#                          this is also the index of the splice junction in `sequence`
#   end_index_removed   -- index (in the source sequence) of the last removed residue
#   sequence            -- source sequence with the section removed
#   name                -- generated name of the form 'hybrid-<i>-protein'
hybridprotein = namedtuple('hybridprotein', ['protein', 'section_removed', 'start_index_removed', 'end_index_removed', 'sequence', 'name'])
# Record describing a peptide cut across the splice junction of a hybrid protein.
#   protein     -- name of the hybrid protein the peptide was cut from
#   sequence    -- residues of the peptide
#   start_index -- index of the peptide's first residue in the hybrid protein sequence
hybridpeptide = namedtuple('hybridpeptide', ['protein', 'sequence', 'start_index'])
maxlen = 25      # longest section that may be removed from a protein
maxpeplen = 26   # longest hybrid peptide; also the minimum left flank kept before the removed section
minpeplen = 6    # shortest hybrid peptide and shortest removed section


def create_cis_spliced_hybrid_proteins(protdict, n):
    """Create n cis-spliced hybrid proteins by cutting a random section out of random proteins.

    Only proteins long enough to keep `maxpeplen` residues before the removed
    section and `maxpeplen // 2` residues after it are used, so that every
    peptide later cut around the junction really contains residues from both
    sides of the splice.

    Parameters:
        protdict -- dict mapping a key to a protein dict with 'name' and 'sequence' entries
        n        -- number of hybrid proteins to create

    Returns:
        list of hybridprotein namedtuples (empty when n <= 0)

    Raises:
        ValueError when n > 0 and no protein in protdict is long enough
    """
    # The right flank must be able to supply the right-hand half of the longest peptide.
    min_right_flank = maxpeplen // 2
    min_prot_len = maxpeplen + maxlen + min_right_flank
    eligible = [prot for prot in protdict.values() if len(prot['sequence']) >= min_prot_len]
    if n > 0 and not eligible:
        raise ValueError('no protein has at least {} residues, which is needed to build a cis-spliced hybrid'.format(min_prot_len))

    hybrids = []
    for i in range(n):
        prot = eligible[randint(0, len(eligible) - 1)]
        seq = prot['sequence']
        # Reserving maxlen + min_right_flank after start_pos guarantees that the
        # removed section never reaches the end of the protein.
        start_pos = randint(maxpeplen, len(seq) - maxlen - min_right_flank)
        length = randint(minpeplen, maxlen)
        end_pos = start_pos + length
        hybrids.append(hybridprotein(
            prot['name'],
            seq[start_pos:end_pos],
            start_pos,
            end_pos - 1,
            seq[:start_pos] + seq[end_pos:],
            'hybrid-{}-protein'.format(i),
        ))
    return hybrids


def create_cis_spliced_hybrid_peptides(listhybridprots, n):
    """Cut n peptides that straddle the splice junction of randomly chosen hybrid proteins.

    Each peptide has a length drawn from [minpeplen, maxpeplen]; the left side
    of the junction contributes ceil(length / 2) residues and the right side
    the rest. If a hybrid protein does not have that many residues on one side
    the contribution of that side is shortened to what is available.

    Parameters:
        listhybridprots -- list of hybridprotein namedtuples
        n               -- number of hybrid peptides to create

    Returns:
        list of hybridpeptide namedtuples (empty when n <= 0)

    Raises:
        ValueError when n > 0 and listhybridprots is empty, or when a hybrid
        protein has no residues on one side of its junction
    """
    if n > 0 and not listhybridprots:
        raise ValueError('no hybrid proteins supplied to cut peptides from')

    hybridpeps = []
    for _ in range(n):
        hybprot = listhybridprots[randint(0, len(listhybridprots) - 1)]
        junction = hybprot.start_index_removed
        peplen = randint(minpeplen, maxpeplen)
        # Integer arithmetic keeps left + right == peplen exactly; round() uses
        # banker's rounding and previously produced peptides of the wrong length.
        right = min(peplen // 2, len(hybprot.sequence) - junction)
        left = min(peplen - peplen // 2, junction)
        if left < 1 or right < 1:
            raise ValueError('{} has no residues on one side of its splice junction'.format(hybprot.name))
        start = junction - left
        hybridpeps.append(hybridpeptide(hybprot.name, hybprot.sequence[start:junction + right], start))
    return hybridpeps
                                                                                                                            

In [37]:
from modules.sequence_generation import proteins, peptides
# Output location and generation parameters for this experiment.
test_directory = '../../testing framework/data/testing_output/'

num_hybs = 5
min_length = 5
max_length = 35
num_peptides = 1000
min_cont = 3  # min contribution for each side of a hybrid

# Non-hybrid peptides generated from the database proteins.
non_hybrid_peps = peptides.gen_peptides(
    list(database.values()),
    num_peptides,
    min_length=min_length,
    max_length=max_length,
    digest='random',
    dist='beta',
)

# Cis-spliced hybrid proteins, and num_hybs squared peptides cut from them.
hybridprots = create_cis_spliced_hybrid_proteins(database, num_hybs)
hybridpeps = create_cis_spliced_hybrid_peptides(hybridprots, num_hybs * num_hybs)

# Plain-dict versions of everything: database entries first, hybrids after.
all_proteins_raw = list(database.values()) + [dict(hyb._asdict()) for hyb in hybridprots]
all_peptides_raw = non_hybrid_peps + [dict(hyb._asdict()) for hyb in hybridpeps]

# Index the peptides by position and record that position as their scan number.
# From here on `peptides` is this dict, no longer the imported module.
peptides = {}
for i, pep in enumerate(all_peptides_raw):
    pep['scan_no'] = i
    peptides[i] = pep

## 2.1 Save this info so that I can analyze it later from Neo-Fusion

In [38]:
import json
experiment_info_file_name = 'experiment_info.json'

# The output directory is only created by a later cell, so make sure it
# exists before writing into it (no-op when it is already there).
os.makedirs(test_directory, exist_ok=True)

# Note: json.dump writes the integer peptide keys as strings.
exp = {'database': fasta_file, 'peptides': peptides}
with open(os.path.join(test_directory, experiment_info_file_name), 'w') as o:
    json.dump(exp, o)


## 3. Generate spectra

In [39]:
from src.spectra import gen_spectra
from src.utils import utils
from modules.sequence_generation import write_spectra

utils.make_dir(test_directory)

# Visit the peptides in ascending numeric key order and build one
# {'spectrum', 'precursor_mass'} record for each of them.
sorted_keys = sorted(int(c) for c in peptides.keys())
spectra = []
for k in sorted_keys:
    pep = peptides[k]
    cont = gen_spectra.gen_spectrum(pep['sequence'])
    spec = cont['spectrum']
    # Report peptides whose generated spectrum has fewer than two entries.
    if len(spec) < 2:
        print(k)
        print(pep)
    pm = cont['precursor_mass']
    spectra.append({'spectrum': spec, 'precursor_mass': pm})

# Last expression of the cell, so its return value is displayed as the output.
write_spectra.write_mzml('testSpectraFile', spectra, output_dir=test_directory)


'../../testing framework/data/testing_output/testSpectraFile.mzML'

## Test the generated mzML file and fix any mess-ups

In [50]:
import xml.etree.ElementTree as ET
# Serialize the mzML namespace as the default namespace (no ns0: prefixes).
ET.register_namespace('', "http://psi.hupo.org/ms/mzml")
tree = ET.parse('../../testing framework/data/testing_output/testSpectraFile.mzML')
mzml = tree.find('{http://psi.hupo.org/ms/mzml}mzML')
if mzml is None:
    raise ValueError('no <mzML> child element found in testSpectraFile.mzML')

# add an id to the mzml
mzml.set('id', 'testSpectraFileFixed')
mzml.set('xmlns', "http://psi.hupo.org/ms/mzml")
mzml.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
# remove some info from the mzml header; pop() tolerates an attribute that is
# already absent instead of raising KeyError
toDelMzml = ['accession']
for todel in toDelMzml:
    mzml.attrib.pop(todel, None)

# add file description stuff to the fileDescription tag
fileDescription = mzml.find('{http://psi.hupo.org/ms/mzml}fileDescription')
# sourceFile id="NOD2_E3.mzXML" name="NOD2_E3.mzXML" location="file:///C:\Users\zachmcgrath\Downloads">
# NOTE(review): sourceFileElement is built but never attached to fileDescription,
# so it does not appear in the written file -- confirm whether it should be added.
sourceFileElement = ET.Element('sourceFile', attrib={'id': 'testSpectraFileFixed', 'name': 'testSpectraFileFixed'})

run = mzml.find('{http://psi.hupo.org/ms/mzml}run')
spectrumList = run.find('{http://psi.hupo.org/ms/mzml}spectrumList')
print(len(spectrumList))
# Iterate over a snapshot: removing children from an element while iterating
# over that same element skips the sibling that follows each removed child.
for spectrumElement in list(spectrumList):
    pl = spectrumElement.find('{http://psi.hupo.org/ms/mzml}precursorList')
    bdal = spectrumElement.find('{http://psi.hupo.org/ms/mzml}binaryDataArrayList')
    if bdal is None or pl is None:
        # Spectra without a precursor list or binary data are dropped.
        print(spectrumElement)
        spectrumList.remove(spectrumElement)
        continue
    # need to add this
    # <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>
    centroidElement = ET.Element(
        '{http://psi.hupo.org/ms/mzml}cvParam',
        attrib={'cvRef': 'MS', 'accession': 'MS:1000127', 'name': 'centroid spectrum', 'value': ''},
    )
    spectrumElement.append(centroidElement)

# Keep the declared spectrum count in step with the spectra that remain.
if 'count' in spectrumList.attrib:
    spectrumList.set('count', str(len(spectrumList)))

tree.write('../../testing framework/data/testing_output/testSpectraFileFixed.mzML', encoding='utf-8', xml_declaration=True)
    

1025


AttributeError: 'ElementTree' object has no attribute 'tostring'

In [None]:
<mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="NOD2_E3" version="1.1.0">
<mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="testSpectraFileFixed" version="1.1.0">