In [None]:
import numpy as np

SEQ_LEN = 30  # Sequence length for prosit
VEC_LENGTH = 174
def create_masking(charges_array, sequences_lengths):
    """
    assume reshaped output of prosit, shape sould be (num_seq, 174)
    set filtered output where not allowed positions are set to -1
    prosit output has the form:
    y1+1 y1+2 y1+3 b1+1 b1+2 b1+3 y2+1     y2+2 y2+3     b2+1     b2+2 b2+3
    if charge >= 3: all allowed
    if charge == 2: all +3 invalid
    if charge == 1: all +2 & +3 invalid
    """

    assert len(charges_array) == len(sequences_lengths)

    mask = np.ones(shape=(len(charges_array), VEC_LENGTH), dtype=np.float16)
    
    for i in range(len(charges_array)):
        charge_one_hot = charges_array[i]
        len_seq = sequences_lengths[i]
        m = mask[i]

        # filter according to peptide charge
        if np.array_equal(charge_one_hot, [1, 0, 0, 0, 0, 0]):
            invalid_indexes = [(x * 3 + 1) for x in range((SEQ_LEN - 1) * 2)] + [
                (x * 3 + 2) for x in range((SEQ_LEN - 1) * 2)
            ]
            m[invalid_indexes] = np.nan

        elif np.array_equal(charge_one_hot, [0, 1, 0, 0, 0, 0]):
            invalid_indexes = [x * 3 + 2 for x in range((SEQ_LEN - 1) * 2)]
            m[invalid_indexes] = np.nan

        if len_seq < SEQ_LEN:
            invalid_indexes = range((len_seq - 1) * 6, VEC_LENGTH)
            m[invalid_indexes] = np.nan

    return mask

In [None]:
def sum(a,b):
    c = a + b
    return c, a

ss = sum(1,2)
print(ss[0])

3


In [None]:
charges_array = [[1, 0, 0, 0, 0, 0],[0, 1, 0, 0, 0, 0]]
sequences_lengths = [5,13]
mask = create_masking(charges_array, sequences_lengths)
print(mask)

[[ 1. nan nan  1. nan nan  1. nan nan  1. nan nan  1. nan nan  1. nan nan
   1. nan nan  1. nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan]
 [ 1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan
   1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan
   1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan
   1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan  1.  1. nan

In [None]:
import numpy as np
#from spectrum_fundamentals.annotation.annotation import peak_pos_xl_cms2

VEC_LENGTH = 174 * 2


def peak_pos_xl_cms2(unmod_seq: str, crosslinker_position: int) -> list:
    """
    Determines the positions of all potential normal and xl fragments within the vector generated by generate_annotation_matrix.

    This fuction is used only for cleavable crosslinked peptides.

    :param unmod_seq: Un modified peptide sequence
    :param crosslinker_position: The position of crosslinker
    :raises ValueError: if Peptides exceeding a length of 30.
    :return: position of diffrent fragments as list
    """
    peaks_y = []
    peaks_b = []
    peaks_yshort = []
    peaks_bshort = []
    peaks_ylong = []
    peaks_blong = []

    if len(unmod_seq) < 31:
        if crosslinker_position != 1:
            peaks_b = np.array([3, 4, 5])
            peaks_b = np.tile(peaks_b, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )
            first_pos_ylong = (
                (len(unmod_seq) - crosslinker_position) * 6
            ) + 174  # fisrt  position for ylong
            peaks_ylong = np.arange(first_pos_ylong, first_pos_ylong + 3)
            peaks_ylong = np.tile(peaks_ylong, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )

        if len(unmod_seq) != crosslinker_position:
            peaks_y = [0, 1, 2]
            peaks_y = np.tile(
                peaks_y, len(unmod_seq) - crosslinker_position
            ) + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            first_pos_blong = (
                ((crosslinker_position - 1) * 6) + 174 + 3
            )  # fisrt  position for blong
            peaks_blong = [first_pos_blong, first_pos_blong + 1, first_pos_blong + 2]
            peaks_blong = np.arange(first_pos_blong, first_pos_blong + 3)
            peaks_blong = list(
                np.tile(peaks_blong, len(unmod_seq) - crosslinker_position)
                + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            )

        peaks_yshort = [x - 174 for x in peaks_ylong]
        peaks_bshort = [x - 174 for x in peaks_blong]
        peaks_range = (
            list(peaks_y)
            + list(peaks_b)
            + list(peaks_yshort)
            + list(peaks_bshort)
            + list(peaks_ylong)
            + list(peaks_blong)
        )
        peaks_range.sort()
    else:
        raise ValueError(
            f"Peptides exceeding a length of 30 are not supported: {len(unmod_seq)}"
        )

    return (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    )

def create_masking(unmod_seq, crosslinker_position):
    """
    assume reshaped output of xl-prosit, shape sould be (num_seq, 174 * 2)
    set filtered output where not allowed positions are set to -1
    we set charge = 2 for all peptide a and b
    """

    assert len(unmod_seq) == len(crosslinker_position)
    mask = np.ones(shape=(len(unmod_seq), VEC_LENGTH))
    for i in range(len(unmod_seq)):
        m = mask[i].copy()
        peaks_ranges = peak_pos_xl_cms2(unmod_seq[i], crosslinker_position[i])
        updated_mask  = np.setdiff1d(np.arange(0, 348), peaks_ranges[0])
        #print(peaks_ranges[0])
        #print(updated_mask)
        #print(m)
        m[updated_mask] = np.nan
        mask[i] = m
    return mask

def apply_masking(peaks, mask):
    peaks[peaks < 0] = np.finfo(np.float32).eps
    out = np.multiply(peaks, mask)
    out = (out.T / np.nanmax(out, axis=1)).T
    return out


In [None]:
unmod_seq = ["KKD","KDDFKKK"]
crosslinker_position = [1,7]
mask = create_masking(unmod_seq, crosslinker_position)
print(mask)

[[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan  1.  1.  1.
  nan nan nan  1.  1.  1. nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan 

In [None]:
import numpy as np

arr1 = np.array([1, 2, 3, 4, 5, 6, 7, 8])
arr2 = np.array([2, 4, 5])

result = np.setdiff1d(arr1, arr2)

print(result)


[1 3 6 7 8]


In [None]:
#import triton_python_backend_utils as pb_utils
import numpy as np
import json

def gen_annotation():
    ions = [
        "y",
        "b",
    ]
    charges = [1, 2, 3]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(f"{ion}{pos}+{charge}")
    return np.array(annotation).astype(np.object_)
gen_annotation = gen_annotation()
print(gen_annotation)

['y1+1' 'y1+2' 'y1+3' 'b1+1' 'b1+2' 'b1+3' 'y2+1' 'y2+2' 'y2+3' 'b2+1'
 'b2+2' 'b2+3' 'y3+1' 'y3+2' 'y3+3' 'b3+1' 'b3+2' 'b3+3' 'y4+1' 'y4+2'
 'y4+3' 'b4+1' 'b4+2' 'b4+3' 'y5+1' 'y5+2' 'y5+3' 'b5+1' 'b5+2' 'b5+3'
 'y6+1' 'y6+2' 'y6+3' 'b6+1' 'b6+2' 'b6+3' 'y7+1' 'y7+2' 'y7+3' 'b7+1'
 'b7+2' 'b7+3' 'y8+1' 'y8+2' 'y8+3' 'b8+1' 'b8+2' 'b8+3' 'y9+1' 'y9+2'
 'y9+3' 'b9+1' 'b9+2' 'b9+3' 'y10+1' 'y10+2' 'y10+3' 'b10+1' 'b10+2'
 'b10+3' 'y11+1' 'y11+2' 'y11+3' 'b11+1' 'b11+2' 'b11+3' 'y12+1' 'y12+2'
 'y12+3' 'b12+1' 'b12+2' 'b12+3' 'y13+1' 'y13+2' 'y13+3' 'b13+1' 'b13+2'
 'b13+3' 'y14+1' 'y14+2' 'y14+3' 'b14+1' 'b14+2' 'b14+3' 'y15+1' 'y15+2'
 'y15+3' 'b15+1' 'b15+2' 'b15+3' 'y16+1' 'y16+2' 'y16+3' 'b16+1' 'b16+2'
 'b16+3' 'y17+1' 'y17+2' 'y17+3' 'b17+1' 'b17+2' 'b17+3' 'y18+1' 'y18+2'
 'y18+3' 'b18+1' 'b18+2' 'b18+3' 'y19+1' 'y19+2' 'y19+3' 'b19+1' 'b19+2'
 'b19+3' 'y20+1' 'y20+2' 'y20+3' 'b20+1' 'b20+2' 'b20+3' 'y21+1' 'y21+2'
 'y21+3' 'b21+1' 'b21+2' 'b21+3' 'y22+1' 'y22+2' 'y22+3' 'b22+1' 

In [None]:
import pandas as pd
import re

# Create a sample DataFrame
data = {
    'text_column': [
        "AAM[UNIMOD:35]K[UNIMOD:26]",
        "K[UNIMOD:24]AAC[UNIMOD:5]KK",
        "C[UNIMOD:5]M[UNIMOD:35]KKK[UNIMOD:26]DD"
    ]
}
df = pd.DataFrame(data)

# Remove "[UNIMOD:35]" and "[UNIMOD:5]" from the "text_column"
df["text_column"] = df["text_column"].str.replace(r"\[UNIMOD:35\]|\[UNIMOD:5\]", "", regex=True)
df['position'] = df['text_column'].apply(lambda x: re.search(r'K\[[^\]]*\]', x).start() + 1)
# Display the modified DataFrame
print(df)


          text_column  position
0     AAMK[UNIMOD:26]         4
1   K[UNIMOD:24]AACKK         1
2  CMKKK[UNIMOD:26]DD         5


In [None]:
import pandas as pd
import re

# Assuming your DataFrame column is named 'column_name'
data = {'column_name': ['K[UNIMOD:26]KK', 'KKKKKKKK[UNIMOD:24]AACKK']}
df = pd.DataFrame(data)

# Find the position of 'K' connected to '[UNIMOD:26]' or '[UNIMOD:24]' after removing the tags
df['position'] = df['column_name'].apply(lambda x: re.search(r'K\[[^\]]*\]', x).start() + 1)

print(df[['column_name', 'position']])


                column_name  position
0            K[UNIMOD:26]KK         1
1  KKKKKKKK[UNIMOD:24]AACKK         8


In [None]:
import re

string = "C[UNIMOD:4]M[UNIMOD:35]KM[UNIMOD:35]KK[UNIMOD:1896]KKKDD"

# Remove brackets and their contents, except for 'K[UNIMOD:26]' and 'K[UNIMOD:24]'
output = re.sub(r'\[UNIMOD:(?!1896|1884\]).*?\]', '', string)
match = re.search(r'K(?=\[UNIMOD:(?:1896|1884)\])', output)
if match:
    position = match.start() + 1
    print(position)




6


In [None]:
import re

string = "KK[UNIMOD:1896]KKDD"

# Find the position of 'K' connected to '[UNIMOD:24]' or '[UNIMOD:26]'
match = re.search(r'K(?=\[UNIMOD:(?:1896|1884)\])', string)

if match:
    position = match.start() + 1
    print(position)
else:
    print("No match found.")


2


In [None]:
import re
def find_crosslinker_position(peptide_sequence: str):
    peptide_sequence = re.sub(r"\[UNIMOD:(?!1896|1884\]).*?\]", "", peptide_sequence)
    crosslinker_position = re.search(r"K(?=\[UNIMOD:(?:1896|1884)\])", peptide_sequence)
    crosslinker_position = crosslinker_position.start() + 1
    return crosslinker_position

peptide_sequences = "C[UNIMOD:4]M[UNIMOD:35]KM[UNIMOD:35]KK[UNIMOD:1884]KKKDD"
crosslinker_position = find_crosslinker_position(peptide_sequences)
print(crosslinker_position)

6


In [None]:
import numpy as np
def gen_annotation_linear_pep():
    ions = [
        "y",
        "b",
    ]
    charges = ["1", "2", "3"]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(ion + str(pos) + "+" + charge)
    return annotation
annotation = gen_annotation_linear_pep()
print(annotation)

['y1+1', 'y1+2', 'y1+3', 'b1+1', 'b1+2', 'b1+3', 'y2+1', 'y2+2', 'y2+3', 'b2+1', 'b2+2', 'b2+3', 'y3+1', 'y3+2', 'y3+3', 'b3+1', 'b3+2', 'b3+3', 'y4+1', 'y4+2', 'y4+3', 'b4+1', 'b4+2', 'b4+3', 'y5+1', 'y5+2', 'y5+3', 'b5+1', 'b5+2', 'b5+3', 'y6+1', 'y6+2', 'y6+3', 'b6+1', 'b6+2', 'b6+3', 'y7+1', 'y7+2', 'y7+3', 'b7+1', 'b7+2', 'b7+3', 'y8+1', 'y8+2', 'y8+3', 'b8+1', 'b8+2', 'b8+3', 'y9+1', 'y9+2', 'y9+3', 'b9+1', 'b9+2', 'b9+3', 'y10+1', 'y10+2', 'y10+3', 'b10+1', 'b10+2', 'b10+3', 'y11+1', 'y11+2', 'y11+3', 'b11+1', 'b11+2', 'b11+3', 'y12+1', 'y12+2', 'y12+3', 'b12+1', 'b12+2', 'b12+3', 'y13+1', 'y13+2', 'y13+3', 'b13+1', 'b13+2', 'b13+3', 'y14+1', 'y14+2', 'y14+3', 'b14+1', 'b14+2', 'b14+3', 'y15+1', 'y15+2', 'y15+3', 'b15+1', 'b15+2', 'b15+3', 'y16+1', 'y16+2', 'y16+3', 'b16+1', 'b16+2', 'b16+3', 'y17+1', 'y17+2', 'y17+3', 'b17+1', 'b17+2', 'b17+3', 'y18+1', 'y18+2', 'y18+3', 'b18+1', 'b18+2', 'b18+3', 'y19+1', 'y19+2', 'y19+3', 'b19+1', 'b19+2', 'b19+3', 'y20+1', 'y20+2', 'y20+3', 

In [None]:
import numpy as np
def peak_pos_xl_cms2(unmod_seq: str, crosslinker_position: int) -> list:
    """
    Determines the positions of all potential normal and xl fragments within the vector generated by generate_annotation_matrix.

    This fuction is used only for cleavable crosslinked peptides.

    :param unmod_seq: Un modified peptide sequence
    :param crosslinker_position: The position of crosslinker
    :raises ValueError: if Peptides exceeding a length of 30.
    :return: position of diffrent fragments as list
    """
    peaks_y = []
    peaks_b = []
    peaks_yshort = []
    peaks_bshort = []
    peaks_ylong = []
    peaks_blong = []

    if len(unmod_seq) < 31:
        if crosslinker_position != 1:
            peaks_b = np.array([3, 4, 5])
            peaks_b = np.tile(peaks_b, crosslinker_position - 1) + np.repeat(np.arange(crosslinker_position - 1) * 6, 3)
            first_pos_ylong = ((len(unmod_seq) - crosslinker_position) * 6) + 174  # fisrt  position for ylong
            peaks_ylong = np.arange(first_pos_ylong, first_pos_ylong + 3)
            peaks_ylong = np.tile(peaks_ylong, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )

        if len(unmod_seq) != crosslinker_position:
            peaks_y = [0, 1, 2]
            peaks_y = np.tile(peaks_y, len(unmod_seq) - crosslinker_position) + np.repeat(
                np.arange(len(unmod_seq) - crosslinker_position) * 6, 3
            )
            first_pos_blong = ((crosslinker_position - 1) * 6) + 174 + 3  # fisrt  position for blong
            peaks_blong = [first_pos_blong, first_pos_blong + 1, first_pos_blong + 2]
            peaks_blong = np.arange(first_pos_blong, first_pos_blong + 3)
            peaks_blong = list(
                np.tile(peaks_blong, len(unmod_seq) - crosslinker_position)
                + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            )

        peaks_yshort = [x - 174 for x in peaks_ylong]
        peaks_bshort = [x - 174 for x in peaks_blong]
        peaks_range = (
            list(peaks_y)
            + list(peaks_b)
            + list(peaks_yshort)
            + list(peaks_bshort)
            + list(peaks_ylong)
            + list(peaks_blong)
        )
        peaks_range.sort()
    else:
        raise ValueError(f"Peptides exceeding a length of 30 are not supported: {len(unmod_seq)}")

    return peaks_range, peaks_y, peaks_b, peaks_yshort, peaks_bshort, peaks_ylong, peaks_blong

def gen_annotation_linear_pep():
    ions = [
        "y",
        "b",
    ]
    charges = ["1", "2", "3"]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(ion + str(pos) + "+" + charge)
    return annotation

def gen_annotation_xl(crosslinker_position: int):
    annotations = gen_annotation_linear_pep()
    annotation = np.concatenate((annotations, annotations))
    annotation = annotation.tolist()
    (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    ) = peak_pos_xl_cms2("K" * 30, crosslinker_position)
    for pos in peaks_yshort:
        annotation[pos] = "y_short_" + annotation[pos][1:]
    for pos in peaks_bshort:
        annotation[pos] = "b_short_" + annotation[pos][1:]
    for pos in peaks_ylong:
        annotation[pos] = "y_long_" + annotation[pos][1:]
    for pos in peaks_blong:
        annotation[pos] = "b_long_" + annotation[pos][1:]
    pos_none = [num + 174 for num in peaks_y] + [num + 174 for num in peaks_b]
    for pos in pos_none:
        annotation[pos] = "None"
    return np.array(annotation).astype(np.object_)

annotation = gen_annotation_xl(1)
print(annotation)




['y1+1' 'y1+2' 'y1+3' 'b_short_1+1' 'b_short_1+2' 'b_short_1+3' 'y2+1'
 'y2+2' 'y2+3' 'b_short_2+1' 'b_short_2+2' 'b_short_2+3' 'y3+1' 'y3+2'
 'y3+3' 'b_short_3+1' 'b_short_3+2' 'b_short_3+3' 'y4+1' 'y4+2' 'y4+3'
 'b_short_4+1' 'b_short_4+2' 'b_short_4+3' 'y5+1' 'y5+2' 'y5+3'
 'b_short_5+1' 'b_short_5+2' 'b_short_5+3' 'y6+1' 'y6+2' 'y6+3'
 'b_short_6+1' 'b_short_6+2' 'b_short_6+3' 'y7+1' 'y7+2' 'y7+3'
 'b_short_7+1' 'b_short_7+2' 'b_short_7+3' 'y8+1' 'y8+2' 'y8+3'
 'b_short_8+1' 'b_short_8+2' 'b_short_8+3' 'y9+1' 'y9+2' 'y9+3'
 'b_short_9+1' 'b_short_9+2' 'b_short_9+3' 'y10+1' 'y10+2' 'y10+3'
 'b_short_10+1' 'b_short_10+2' 'b_short_10+3' 'y11+1' 'y11+2' 'y11+3'
 'b_short_11+1' 'b_short_11+2' 'b_short_11+3' 'y12+1' 'y12+2' 'y12+3'
 'b_short_12+1' 'b_short_12+2' 'b_short_12+3' 'y13+1' 'y13+2' 'y13+3'
 'b_short_13+1' 'b_short_13+2' 'b_short_13+3' 'y14+1' 'y14+2' 'y14+3'
 'b_short_14+1' 'b_short_14+2' 'b_short_14+3' 'y15+1' 'y15+2' 'y15+3'
 'b_short_15+1' 'b_short_15+2' 'b_short_15+3' 'y16

In [None]:
import numpy as np

SEQ_LEN = 30  # Sequence length for prosit
VEC_LENGTH = 174


def create_masking(charges_array, sequences_lengths):
    """
    assume reshaped output of prosit, shape sould be (num_seq, 174)
    set filtered output where not allowed positions are set to -1
    prosit output has the form:
    y1+1 y1+2 y1+3 b1+1 b1+2 b1+3 y2+1     y2+2 y2+3     b2+1     b2+2 b2+3
    if charge >= 3: all allowed
    if charge == 2: all +3 invalid
    if charge == 1: all +2 & +3 invalid
    """

    assert len(charges_array) == len(sequences_lengths)

    mask = np.ones(shape=(len(charges_array), VEC_LENGTH), dtype=np.float16)

    for i in range(len(charges_array)):
        charge_one_hot = charges_array[i]
        len_seq = sequences_lengths[i]
        m = mask[i]

        # filter according to peptide charge
        if np.array_equal(charge_one_hot, [1, 0, 0, 0, 0, 0]):
            invalid_indexes = [(x * 3 + 1) for x in range((SEQ_LEN - 1) * 2)] + [
                (x * 3 + 2) for x in range((SEQ_LEN - 1) * 2)
            ]
            m[invalid_indexes] = np.nan

        elif np.array_equal(charge_one_hot, [0, 1, 0, 0, 0, 0]):
            invalid_indexes = [x * 3 + 2 for x in range((SEQ_LEN - 1) * 2)]
            m[invalid_indexes] = np.nan

        if len_seq < SEQ_LEN:
            invalid_indexes = range((len_seq - 1) * 6, VEC_LENGTH)
            m[invalid_indexes] = np.nan

    return mask


def apply_masking(peaks, mask):
    peaks[peaks < 0] = np.finfo(np.float32).eps
    out = np.multiply(peaks, mask)
    out = (out.T / np.nanmax(out, axis=1)).T
    return out


In [2]:
import numpy as np
def gen_annotation():
    ions = [
        "y",
        "b",
    ]
    charges = [1, 2, 3]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(f"{ion}{pos}+{charge}")
    return np.array(annotation).astype(np.object_)
t = gen_annotation()
print(t)

['y1+1' 'y1+2' 'y1+3' 'b1+1' 'b1+2' 'b1+3' 'y2+1' 'y2+2' 'y2+3' 'b2+1'
 'b2+2' 'b2+3' 'y3+1' 'y3+2' 'y3+3' 'b3+1' 'b3+2' 'b3+3' 'y4+1' 'y4+2'
 'y4+3' 'b4+1' 'b4+2' 'b4+3' 'y5+1' 'y5+2' 'y5+3' 'b5+1' 'b5+2' 'b5+3'
 'y6+1' 'y6+2' 'y6+3' 'b6+1' 'b6+2' 'b6+3' 'y7+1' 'y7+2' 'y7+3' 'b7+1'
 'b7+2' 'b7+3' 'y8+1' 'y8+2' 'y8+3' 'b8+1' 'b8+2' 'b8+3' 'y9+1' 'y9+2'
 'y9+3' 'b9+1' 'b9+2' 'b9+3' 'y10+1' 'y10+2' 'y10+3' 'b10+1' 'b10+2'
 'b10+3' 'y11+1' 'y11+2' 'y11+3' 'b11+1' 'b11+2' 'b11+3' 'y12+1' 'y12+2'
 'y12+3' 'b12+1' 'b12+2' 'b12+3' 'y13+1' 'y13+2' 'y13+3' 'b13+1' 'b13+2'
 'b13+3' 'y14+1' 'y14+2' 'y14+3' 'b14+1' 'b14+2' 'b14+3' 'y15+1' 'y15+2'
 'y15+3' 'b15+1' 'b15+2' 'b15+3' 'y16+1' 'y16+2' 'y16+3' 'b16+1' 'b16+2'
 'b16+3' 'y17+1' 'y17+2' 'y17+3' 'b17+1' 'b17+2' 'b17+3' 'y18+1' 'y18+2'
 'y18+3' 'b18+1' 'b18+2' 'b18+3' 'y19+1' 'y19+2' 'y19+3' 'b19+1' 'b19+2'
 'b19+3' 'y20+1' 'y20+2' 'y20+3' 'b20+1' 'b20+2' 'b20+3' 'y21+1' 'y21+2'
 'y21+3' 'b21+1' 'b21+2' 'b21+3' 'y22+1' 'y22+2' 'y22+3' 'b22+1' 

In [4]:
import re
def find_crosslinker_position(peptide_sequence: str):
    peptide_sequence = re.sub(r"\[UNIMOD:(?!1896|1884\]).*?\]", "", peptide_sequence)
    crosslinker_position = re.search(r"K(?=\[UNIMOD:(?:1896|1884)\])", peptide_sequence)
    crosslinker_position = crosslinker_position.start() + 1
    return crosslinker_position

peptide_sequence = "AADDC[UNIMOD:4]K[UNIMOD:1896]KKMM"
print(find_crosslinker_position(peptide_sequence))

6


In [9]:
def peak_pos_xl_cms2(unmod_seq: str, crosslinker_position: int) -> list:
    """
    Determines the positions of all potential normal and xl fragments within the vector generated by generate_annotation_matrix.

    This fuction is used only for cleavable crosslinked peptides.

    :param unmod_seq: Un modified peptide sequence
    :param crosslinker_position: The position of crosslinker
    :raises ValueError: if Peptides exceeding a length of 30.
    :return: position of diffrent fragments as list
    """
    peaks_y = []
    peaks_b = []
    peaks_yshort = []
    peaks_bshort = []
    peaks_ylong = []
    peaks_blong = []

    if len(unmod_seq) < 31:
        if crosslinker_position != 1:
            peaks_b = np.array([3, 4, 5])
            peaks_b = np.tile(peaks_b, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )
            first_pos_ylong = (
                (len(unmod_seq) - crosslinker_position) * 6
            ) + 174  # fisrt  position for ylong
            peaks_ylong = np.arange(first_pos_ylong, first_pos_ylong + 3)
            peaks_ylong = np.tile(peaks_ylong, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )

        if len(unmod_seq) != crosslinker_position:
            peaks_y = [0, 1, 2]
            peaks_y = np.tile(
                peaks_y, len(unmod_seq) - crosslinker_position
            ) + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            first_pos_blong = (
                ((crosslinker_position - 1) * 6) + 174 + 3
            )  # fisrt  position for blong
            peaks_blong = [first_pos_blong, first_pos_blong + 1, first_pos_blong + 2]
            peaks_blong = np.arange(first_pos_blong, first_pos_blong + 3)
            peaks_blong = list(
                np.tile(peaks_blong, len(unmod_seq) - crosslinker_position)
                + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            )

        peaks_yshort = [x - 174 for x in peaks_ylong]
        peaks_bshort = [x - 174 for x in peaks_blong]
        peaks_range = (
            list(peaks_y)
            + list(peaks_b)
            + list(peaks_yshort)
            + list(peaks_bshort)
            + list(peaks_ylong)
            + list(peaks_blong)
        )
        peaks_range.sort()
    else:
        raise ValueError(
            f"Peptides exceeding a length of 30 are not supported: {len(unmod_seq)}"
        )

    return (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    )


def gen_annotation_linear_pep():
    ions = [
        "y",
        "b",
    ]
    charges = ["1", "2", "3"]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(ion + str(pos) + "+" + charge)
    return annotation

def gen_annotation_xl(crosslinker_position: int):
    annotations = gen_annotation_linear_pep()
    annotation = np.concatenate((annotations, annotations))
    annotation = annotation.tolist()
    (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    ) = peak_pos_xl_cms2("K" * 30, crosslinker_position)
    for pos in peaks_yshort:
        annotation[pos] = "y_short_" + annotation[pos][1:]
    for pos in peaks_bshort:
        annotation[pos] = "b_short_" + annotation[pos][1:]
    for pos in peaks_ylong:
        annotation[pos] = "y_long_" + annotation[pos][1:]
    for pos in peaks_blong:
        annotation[pos] = "b_long_" + annotation[pos][1:]
    pos_none = [num + 174 for num in peaks_y] + [num + 174 for num in peaks_b]
    for pos in pos_none:
        annotation[pos] = "None"
    return np.array(annotation).astype(np.object_)
print(gen_annotation_xl(30))

['y_short_1+1' 'y_short_1+2' 'y_short_1+3' 'b1+1' 'b1+2' 'b1+3'
 'y_short_2+1' 'y_short_2+2' 'y_short_2+3' 'b2+1' 'b2+2' 'b2+3'
 'y_short_3+1' 'y_short_3+2' 'y_short_3+3' 'b3+1' 'b3+2' 'b3+3'
 'y_short_4+1' 'y_short_4+2' 'y_short_4+3' 'b4+1' 'b4+2' 'b4+3'
 'y_short_5+1' 'y_short_5+2' 'y_short_5+3' 'b5+1' 'b5+2' 'b5+3'
 'y_short_6+1' 'y_short_6+2' 'y_short_6+3' 'b6+1' 'b6+2' 'b6+3'
 'y_short_7+1' 'y_short_7+2' 'y_short_7+3' 'b7+1' 'b7+2' 'b7+3'
 'y_short_8+1' 'y_short_8+2' 'y_short_8+3' 'b8+1' 'b8+2' 'b8+3'
 'y_short_9+1' 'y_short_9+2' 'y_short_9+3' 'b9+1' 'b9+2' 'b9+3'
 'y_short_10+1' 'y_short_10+2' 'y_short_10+3' 'b10+1' 'b10+2' 'b10+3'
 'y_short_11+1' 'y_short_11+2' 'y_short_11+3' 'b11+1' 'b11+2' 'b11+3'
 'y_short_12+1' 'y_short_12+2' 'y_short_12+3' 'b12+1' 'b12+2' 'b12+3'
 'y_short_13+1' 'y_short_13+2' 'y_short_13+3' 'b13+1' 'b13+2' 'b13+3'
 'y_short_14+1' 'y_short_14+2' 'y_short_14+3' 'b14+1' 'b14+2' 'b14+3'
 'y_short_15+1' 'y_short_15+2' 'y_short_15+3' 'b15+1' 'b15+2' 'b15+3'
 'y_

In [1]:
import numpy as np
def gen_annotation():
    ions = [
        "y",
        "b",
    ]
    charges = [1, 2, 3]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(f"{ion}{pos}+{charge}")
    return np.array(annotation).astype(np.object_)

def gen_annotation_linear_pep():
    ions = [
        "y",
        "b",
    ]
    charges = ["1", "2", "3"]
    positions = [x for x in range(1, 30)]
    annotation = []
    for pos in positions:
        for ion in ions:
            for charge in charges:
                annotation.append(ion + str(pos) + "+" + charge)
    return annotation

def gen_annotation_xl(crosslinker_position: int):
    annotations = gen_annotation_linear_pep()
    annotation = np.concatenate((annotations, annotations))
    annotation = annotation.tolist()
    (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    ) = peak_pos_xl_cms2("K" * 30, crosslinker_position)
    for pos in peaks_yshort:
        annotation[pos] = "y_short_" + annotation[pos][1:]
    for pos in peaks_bshort:
        annotation[pos] = "b_short_" + annotation[pos][1:]
    for pos in peaks_ylong:
        annotation[pos] = "y_long_" + annotation[pos][1:]
    for pos in peaks_blong:
        annotation[pos] = "b_long_" + annotation[pos][1:]
    pos_none = [num + 174 for num in peaks_y] + [num + 174 for num in peaks_b]
    for pos in pos_none:
        annotation[pos] = "None"
    return np.array(annotation).astype(np.object_)
import numpy as np
import json
import re


def peak_pos_xl_cms2(unmod_seq: str, crosslinker_position: int) -> list:
    """
    Determines the positions of all potential normal and xl fragments within the vector generated by generate_annotation_matrix.

    This fuction is used only for cleavable crosslinked peptides.

    :param unmod_seq: Un modified peptide sequence
    :param crosslinker_position: The position of crosslinker
    :raises ValueError: if Peptides exceeding a length of 30.
    :return: position of diffrent fragments as list
    """
    peaks_y = []
    peaks_b = []
    peaks_yshort = []
    peaks_bshort = []
    peaks_ylong = []
    peaks_blong = []

    if len(unmod_seq) < 31:
        if crosslinker_position != 1:
            peaks_b = np.array([3, 4, 5])
            peaks_b = np.tile(peaks_b, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )
            first_pos_ylong = (
                (len(unmod_seq) - crosslinker_position) * 6
            ) + 174  # fisrt  position for ylong
            peaks_ylong = np.arange(first_pos_ylong, first_pos_ylong + 3)
            peaks_ylong = np.tile(peaks_ylong, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )

        if len(unmod_seq) != crosslinker_position:
            peaks_y = [0, 1, 2]
            peaks_y = np.tile(
                peaks_y, len(unmod_seq) - crosslinker_position
            ) + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            first_pos_blong = (
                ((crosslinker_position - 1) * 6) + 174 + 3
            )  # fisrt  position for blong
            peaks_blong = [first_pos_blong, first_pos_blong + 1, first_pos_blong + 2]
            peaks_blong = np.arange(first_pos_blong, first_pos_blong + 3)
            peaks_blong = list(
                np.tile(peaks_blong, len(unmod_seq) - crosslinker_position)
                + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            )

        peaks_yshort = [x - 174 for x in peaks_ylong]
        peaks_bshort = [x - 174 for x in peaks_blong]
        peaks_range = (
            list(peaks_y)
            + list(peaks_b)
            + list(peaks_yshort)
            + list(peaks_bshort)
            + list(peaks_ylong)
            + list(peaks_blong)
        )
        peaks_range.sort()
    else:
        raise ValueError(
            f"Peptides exceeding a length of 30 are not supported: {len(unmod_seq)}"
        )

    return (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    )


def find_crosslinker_position(peptide_sequence: str):
    peptide_sequence = re.sub(r"\[UNIMOD:(?!1896|1884\]).*?\]", "", peptide_sequence)
    crosslinker_position = re.search(r"K(?=\[UNIMOD:(?:1896|1884)\])", peptide_sequence)
    crosslinker_position = crosslinker_position.start() + 1
    return crosslinker_position

#regular_sequence = ["AAK[UNIMOD:1896]","AAKKK[UNIMOD:1896]"]
annotation_1 = np.empty((0, 174))

for i in range(2):
                #crosslinker_position = find_crosslinker_position(regular_sequence[i])
                annotation_i = gen_annotation()
                annotation_1 = np.vstack((annotation_1, annotation_i))


#annotation_2 = annotation = np.tile(gen_annotation(), 2).reshape((-1, 174))
#annotation_2 = annotation = np.tile(gen_annotation_xl(1), 2).reshape((-1, 348))
print(annotation_1)
#print(annotation_2)

[['y1+1' 'y1+2' 'y1+3' 'b1+1' 'b1+2' 'b1+3' 'y2+1' 'y2+2' 'y2+3' 'b2+1'
  'b2+2' 'b2+3' 'y3+1' 'y3+2' 'y3+3' 'b3+1' 'b3+2' 'b3+3' 'y4+1' 'y4+2'
  'y4+3' 'b4+1' 'b4+2' 'b4+3' 'y5+1' 'y5+2' 'y5+3' 'b5+1' 'b5+2' 'b5+3'
  'y6+1' 'y6+2' 'y6+3' 'b6+1' 'b6+2' 'b6+3' 'y7+1' 'y7+2' 'y7+3' 'b7+1'
  'b7+2' 'b7+3' 'y8+1' 'y8+2' 'y8+3' 'b8+1' 'b8+2' 'b8+3' 'y9+1' 'y9+2'
  'y9+3' 'b9+1' 'b9+2' 'b9+3' 'y10+1' 'y10+2' 'y10+3' 'b10+1' 'b10+2'
  'b10+3' 'y11+1' 'y11+2' 'y11+3' 'b11+1' 'b11+2' 'b11+3' 'y12+1' 'y12+2'
  'y12+3' 'b12+1' 'b12+2' 'b12+3' 'y13+1' 'y13+2' 'y13+3' 'b13+1' 'b13+2'
  'b13+3' 'y14+1' 'y14+2' 'y14+3' 'b14+1' 'b14+2' 'b14+3' 'y15+1' 'y15+2'
  'y15+3' 'b15+1' 'b15+2' 'b15+3' 'y16+1' 'y16+2' 'y16+3' 'b16+1' 'b16+2'
  'b16+3' 'y17+1' 'y17+2' 'y17+3' 'b17+1' 'b17+2' 'b17+3' 'y18+1' 'y18+2'
  'y18+3' 'b18+1' 'b18+2' 'b18+3' 'y19+1' 'y19+2' 'y19+3' 'b19+1' 'b19+2'
  'b19+3' 'y20+1' 'y20+2' 'y20+3' 'b20+1' 'b20+2' 'b20+3' 'y21+1' 'y21+2'
  'y21+3' 'b21+1' 'b21+2' 'b21+3' 'y22+1' 'y22+2' 'y

In [7]:
import numpy as np
import re

# from spectrum_fundamentals.annotation.annotation import peak_pos_xl_cms2

VEC_LENGTH = 348

def peak_pos_xl_cms2(unmod_seq: str, crosslinker_position: int) -> list:
    """
    Determines the positions of all potential normal and xl fragments within the vector generated by generate_annotation_matrix.

    This fuction is used only for cleavable crosslinked peptides.

    :param unmod_seq: Un modified peptide sequence
    :param crosslinker_position: The position of crosslinker
    :raises ValueError: if Peptides exceeding a length of 30.
    :return: position of diffrent fragments as list
    """
    peaks_y = []
    peaks_b = []
    peaks_yshort = []
    peaks_bshort = []
    peaks_ylong = []
    peaks_blong = []

    if len(unmod_seq) < 31:
        if crosslinker_position != 1:
            peaks_b = np.array([3, 4, 5])
            peaks_b = np.tile(peaks_b, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )
            first_pos_ylong = (
                (len(unmod_seq) - crosslinker_position) * 6
            ) + 174  # fisrt  position for ylong
            peaks_ylong = np.arange(first_pos_ylong, first_pos_ylong + 3)
            peaks_ylong = np.tile(peaks_ylong, crosslinker_position - 1) + np.repeat(
                np.arange(crosslinker_position - 1) * 6, 3
            )

        if len(unmod_seq) != crosslinker_position:
            peaks_y = [0, 1, 2]
            peaks_y = np.tile(
                peaks_y, len(unmod_seq) - crosslinker_position
            ) + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            first_pos_blong = (
                ((crosslinker_position - 1) * 6) + 174 + 3
            )  # fisrt  position for blong
            peaks_blong = [first_pos_blong, first_pos_blong + 1, first_pos_blong + 2]
            peaks_blong = np.arange(first_pos_blong, first_pos_blong + 3)
            peaks_blong = list(
                np.tile(peaks_blong, len(unmod_seq) - crosslinker_position)
                + np.repeat(np.arange(len(unmod_seq) - crosslinker_position) * 6, 3)
            )

        peaks_yshort = [x - 174 for x in peaks_ylong]
        peaks_bshort = [x - 174 for x in peaks_blong]
        peaks_range = (
            list(peaks_y)
            + list(peaks_b)
            + list(peaks_yshort)
            + list(peaks_bshort)
            + list(peaks_ylong)
            + list(peaks_blong)
        )
        peaks_range.sort()
    else:
        raise ValueError(
            f"Peptides exceeding a length of 30 are not supported: {len(unmod_seq)}"
        )

    return (
        peaks_range,
        peaks_y,
        peaks_b,
        peaks_yshort,
        peaks_bshort,
        peaks_ylong,
        peaks_blong,
    )


def create_masking(unmod_seq, crosslinker_position):
    """
    assume reshaped output of xl-prosit, shape sould be (num_seq, 174 * 2)
    set filtered output where not allowed positions are set to -1
    we set charge = 2 for all peptide a and b
    """

    assert len(unmod_seq) == len(crosslinker_position)
    mask = np.ones(shape=(len(unmod_seq), VEC_LENGTH))
    for i in range(len(unmod_seq)):
        m = mask[i].copy()
        peaks_ranges = peak_pos_xl_cms2(unmod_seq[i], crosslinker_position[i])
        updated_mask = np.setdiff1d(np.arange(0, 348), peaks_ranges[0])
        updated_mask_charge_3 = np.arange(2, end + 348, 3)
        updated_mask = np.concatenate((updated_mask, updated_mask_charge_3))
        print(updated_mask)
        m[updated_mask] = np.nan
        mask[i] = m
    return mask

def find_crosslinker_position(peptide_sequence: str):
    peptide_sequence = re.sub(r"\[UNIMOD:(?!1896|1884\]).*?\]", "", peptide_sequence)
    crosslinker_position = re.search(r"K(?=\[UNIMOD:(?:1896|1884)\])", peptide_sequence)
    crosslinker_position = crosslinker_position.start() + 1
    return crosslinker_position


unmod_seq = ["DIADAVTAAGVEVAKSEVR"]
crosslinker_position = [15]
print(create_masking(unmod_seq,crosslinker_position))
#print(find_crosslinker_position("DIADAVTAAGVEVAK[UNIMOD:1896]SEVR"))

[108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 201 202 203 207 208 209 213 214 215 219 220 221 225 226 227 231 232 233
 237 238 239 243 244 245 249 250 251 255 256 257 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
 342 343 344 345 346 347]
[[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1. 