# Processing the spectra into a graph
The mgf spectra is loaded into model where it is processed by ```1_spectra-preprocessing``` scripts. The processed spectra is then sent to the ```process_spectrum_Graph```.

This notebook explores the behaviour of ```process_spectrum_Graph```

In [7]:
from typing import List
import numpy as np
from pyteomics import mass
import re

In [8]:
# config cell
config = {
    'max_num_peaks': 400,
    'aa_mass_tolerance': 0.05,
}
vocab_reverse = ['A',
                 'R',
                 'N',
                 'Nmod',
                 'D',
                 # 'C',
                 'Cmod',
                 'E',
                 'Q',
                 'Qmod',
                 'G',
                 'H',
                 'I',
                 'L',
                 'K',
                 'M',
                 'Mmod',
                 'F',
                 'P',
                 'S',
                 'T',
                 'W',
                 'Y',
                 'V',
                 ]
mass_H = 1.0078
mass_H2O = 18.0106
mass_NH3 = 17.0265
mass_N_terminus = 1.0078
mass_C_terminus = 17.0027
mass_CO = 27.9949
mass_AA = {'_PAD': 0.0,
           '_GO': mass_N_terminus - mass_H,
           '_EOS': mass_C_terminus + mass_H,
           'A': 71.03711,  # 0
           'R': 156.10111,  # 1
           'N': 114.04293,  # 2
           'Nmod': 115.02695,
           'D': 115.02694,  # 3
           # 'C': 103.00919, # 4
           'Cmod': 160.03065,  # C(+57.02)
           # ~ 'Cmod':161.01919, # C(+58.01) # orbi
           'E': 129.04259,  # 5
           'Q': 128.05858,  # 6
           'Qmod': 129.0426,
           'G': 57.02146,  # 7
           'H': 137.05891,  # 8
           'I': 113.08406,  # 9
           'L': 113.08406,  # 10
           'K': 128.09496,  # 11
           'M': 131.04049,  # 12
           'Mmod': 147.0354,
           'F': 147.06841,  # 13
           'P': 97.05276,  # 14
           'S': 87.03203,  # 15
           'T': 101.04768,  # 16
           'W': 186.07931,  # 17
           'Y': 163.06333,  # 18
           'V': 99.06841,  # 19
           }

In [None]:
def process_spectrum_graph(spectrum: List):
    """
    This function takes in the process mass spectrum dataset

    Parameters:


    """

    scan, peptide_ids, spectrum_mz, spectrum_intensity, peptide_mass, pep_charge = spectrum

    max_num_peaks = config['max_num_peaks']

    aa_edge_precision = config['aa_mass_tolerance'] # the range around the mass we should look

    mp = peptide_mass

    spectrum_intensity = np.divide(spectrum_intensity, max(spectrum_intensity)) # normalize the intensity peaks

    peaks = np.stack([spectrum_mz, spectrum_intensity], axis=1)
    #np.stack joins two arrays at the specified axis.
    # In this case we are joining the arrays so that they form spectrum_mz-spectrum_intensity pairs. In the same way zip works

    b_or_y, diffs = match_peaks(peptide_ids=peptide_ids,
                                spectrum_mz=spectrum_mz,
                                tolerance=aa_edge_precision)



The b and y ions and their mass differences is calculated with the ```match_peaks``` function that takes in the peptide sequence in the form of their vocab ids, the spectrum_mz and the tolerance and matches the peptides to the spectrum_mz.

The functions ```fragments_mgf``` create theoretical fragments from the peptide sequence.

In [None]:
def match_peaks(peptide_ids, spectrum_mz, tolerance, _8ions=False):
    """
    Matching the peaks of the mass spectrum to the peptide sequence

    """
    peptide_str = [vocab_reverse[i] for i in peptide_ids] # the peptide is converted back to string
    # these functions create theoretical fragments
    if _8ions: # not sure what are _8ions
        true_peaks, tp_ions, tp_frags = fragments_mgf_8ion(peptide=peptide_str)
    else:
        true_peaks, tp_ions, tp_frags = fragments_mgf(peptide=peptide_str)

    # we now match the peaks in the spectra to the theoretical peaks
    matched_peaks = []
    matched_diffs = []
    # for each mass in the spectrum within_tol finds the true_peak that finds the difference
    for mz in spectrum_mz:
        log_diff = within_tol(mz, true_peaks, rtol=0, atol=tolerance)

        if np.any(log_diff[..., 0]):
            closest_idx = np.argmin(np.log_diff[...,1])








## Fragments_MGF function
This function creates theoretical fragments of a given peptide sequence and calculates their masses. The code as a whole is given in the following cells, followed by the breakdown what each section of the code does.

In [9]:
def fragments_mgf(peptide: str,types=("b","y"), max_charge=1, ion_loss=False):
    """
    This function creates theoretical fragments from the peptide sequence of types 'types' and of charges from 1 to max_charge.

    Parameters:
    peptide: str
        The peptide sequence in the form of a string
    types: tuple
        The types of fragments to be created. Default is ("b","y")
    max_charge: int
    ion_loss: bool
        If True, the function will also return the ion losses of the fragments
    """
    peptide = "".join(peptide) # converts peptide list to string

    peaks = []
    ions = []
    frags = []
    pep_len = len(peptide)

    for i in range(1, pep_len):
        for ion_type in types:
            for charge in range(1, max_charge+1):
                if ion_type[0] in 'abc':
                    peaks.append(mass.fast_mass(peptide[:1], ion_type=ion_type, charge=charge)) # from a peptide sequence the mass for each peptide subsequence is calculated with fast_mass. Giving the mass of each theoretical fragment.
                    ions.append(ion_type+str(i)+("+"+str(charge) if max_charge > 1 else ""))
                    frags.append("".join(peptide[:i]))
                else:
                    peaks.append((sum([mass_AA[j] for j in peptide[i:]]) +
                                  mass_C_terminus+
                                  mass_N_terminus+
                                  (charge*mass_H)+
                                  mass.calculate_mass(
                                      mass.std_ion_comp[ion_type]
                                  )) /charge)
                    ions.append(ion_type+str(pep_len-i)+ ("+"+str(charge) if max_charge > 1 else ""))
                    frags.append("".join(peptide[i:]))
    frags = [str(i) for i in frags]

    if max_charge>1:
        order = sorted(range(len(ions)),key=lambda x: (ions[x][0],re.findall("(?:\d+)(\_|\*)*",ions[x])[0],int(re.search("(\d+)",ions[x])[0]),int(re.findall("(\d+)",ions[x])[1])))
    else:
        order = sorted(range(len(ions)),key=lambda x: (ions[x][0],re.findall("(?:\d+)(\_|\*)*",ions[x])[0],int(re.search("(\d+)",ions[x])[0])))

    peaks = list(np.array(peaks)[order])
    ions = list(np.array(ions)[order])
    frags = list(np.array(frags)[order])

    if ion_loss:
        df = ionLoss([peaks, ions, frags])
    else:
        df = [peaks, ions, frags]
    return df

atomic_mass = {"H":1.007825,
               "C":12.000000,
               "N":14.003074,
               "O":15.994915,
               "P":1.007276}

add =    {"a":-(atomic_mass["C"]+atomic_mass["O"]),
           "b":0,
           "c":atomic_mass["N"]+3*atomic_mass["H"],
           "x":atomic_mass["C"]+2*atomic_mass["O"],
           "y":2*atomic_mass["H"]+atomic_mass["O"],
           "z":-(atomic_mass["N"]+atomic_mass["H"])+atomic_mass["O"],
           "water":2*atomic_mass["H"]+atomic_mass["O"],
           "ammonia":atomic_mass["N"]+ 3*atomic_mass["H"]}

def ionLoss(df,
            water=None,
            ammonia=None):

    if ammonia is None:
        ammonia = ["K", "N", "Q", "R"]
    if water is None:
        water = ["Cterm", "D", "E", "S", "T"]
    assert len(df[0])==len(df[1])
    assert len(df[1])==len(df[2])

    rules = {"D":"^D.","E":"^E.","S":".S.","T":".T."}
    rules = [rules[i] for i in {"D", "E", "S", "T"}.intersection(water)]
    wmatch = [re.match("|".join(rules),i) for i in df[2]]
    widx = [i for i in range(len(wmatch)) if wmatch[i] != None]
    CtermIdx = [i for i in range(len(df[1])) if df[1][i][0] in "xyz"]
    widx = list(set(widx + CtermIdx))

    rules = {"K":"^.*K.", "N":"^.*N.", "Q":"^.*Q.", "R":".R."}
    rules = [rules[i] for i in {"K", "N", "Q", "R"}.intersection(ammonia)]
    amatch = [re.match("|".join(rules),i) for i in df[2]]
    aidx = [i for i in range(len(amatch)) if amatch[i] != None]

    df = remove_mol(df,widx,add["water"],ion="_")
    df = remove_mol(df,aidx,add["ammonia"],ion="*")

    return df

def remove_mol(df,idx,mass,ion):
    df = df[:]

    df_ion = [[df[j][i] for i in idx] for j in range(len(df))]

    df_ion[0] = [i-mass for i in df_ion[0]]
    df_ion[1] = [i+ion for i in df_ion[1]]
    for i in range(len(df)):
        df[i]+=df_ion[i]
    return df

In [10]:
fragments_mgf('PEPTIDE')

[[98.06004031562,
  98.06004031562,
  98.06004031562,
  98.06004031562,
  98.06004031562,
  98.06004031562,
  148.06089,
  263.08782999999994,
  376.17188999999996,
  477.21957,
  574.2723299999999,
  703.3149199999999],
 ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'y1', 'y2', 'y3', 'y4', 'y5', 'y6'],
 ['P',
  'PE',
  'PEP',
  'PEPT',
  'PEPTI',
  'PEPTID',
  'E',
  'DE',
  'IDE',
  'TIDE',
  'PTIDE',
  'EPTIDE']]

1. The function takes as a parameter the 'PEPTIDE' if its a list it converts it to a string.

In [16]:
peptide = 'PEPTIDE'
peptide = list(peptide)
print(peptide)
peptide = "".join(peptide)
peptide

['P', 'E', 'P', 'T', 'I', 'D', 'E']


'PEPTIDE'

In [15]:
peaks = []
ions = []
frags = []
types = ("b", "y")
max_charge = 1
pep_len = len(peptide)

for i in range(1, pep_len):
    for ion_type in types:
        for charge in range(1, max_charge+1):
            if ion_type[0] in 'abc':
                peaks.append(mass.fast_mass(peptide[:1], ion_type=ion_type, charge=charge)) # from a peptide sequence the mass for each peptide subsequence is calculated with fast_mass. Giving the mass of each theoretical fragment.
                ions.append(ion_type+str(i)+("+"+str(charge) if max_charge > 1 else ""))
                frags.append("".join(peptide[:i]))
            else:
                peaks.append((sum([mass_AA[j] for j in peptide[i:]]) +
                              mass_C_terminus+
                              mass_N_terminus+
                              (charge*mass_H)+
                              mass.calculate_mass(
                                  mass.std_ion_comp[ion_type]
                              )) /charge)
                ions.append(ion_type+str(pep_len-i)+ ("+"+str(charge) if max_charge > 1 else ""))
                frags.append("".join(peptide[i:]))

['PEPTIDE']