In [None]:
import os

# Configuration for the PTM annotation run.
# NOTE: a later cell in this notebook assigns these same names again (with a
# different input CSV and verbose = True); when the cells are run in order,
# those later values are the ones the processing cells see.

# Variable PTMs to track, keyed by one-letter amino-acid code. Each inner dict
# maps a PTM name, as it appears in the modifications column of the input CSV,
# to the short abbreviation appended to the site label in the output.
variable_ptm_dict = {
    "K": {
        "Ac": "ac",
        # NOTE(review): "Bu" maps to "me" (not "bu") and lysine has no "Me"
        # key - presumably intentional; confirm with the data owner.
        "Bu": "me",
        "Me2": "me2",
        "Me3": "me3",
        "Cr": "cr",
        "Hib": "hib",
        "Su": "su"
    },
    "R": {
        "Me": "me",
        "Me2": "me2",
        "Cit": "cit",
    },
    "M": {
        "Ox": "ox",
    },
}
# Abbreviation given to a tracked residue that carries no recognised PTM.
unmodified_abbreviation = "un"

fasta_file_name = "/home/sander/Documents/Proteomics/msqrob/data/HumanMycoCrap_2018_nr.fasta"
progenesis_file_name = "/home/sander/Documents/Proteomics/msqrob/data/All_Peptides_MSqRob_Formatted.csv"
# Output path = input path with its extension replaced by "_SW.csv".
# splitext copes with extensions of any length, unlike a fixed [:-4] slice.
out_file_name = os.path.splitext(progenesis_file_name)[0] + "_SW.csv"
# When True, the processing cells print diagnostics while they run.
verbose = False

In [None]:
import os

# Configuration for the PTM annotation run.
# NOTE: this cell assigns the same names as the preceding configuration cell
# (which points at a different input CSV and sets verbose = False); when the
# cells are run in order, the values below are the ones that take effect.

# Variable PTMs to track, keyed by one-letter amino-acid code. Each inner dict
# maps a PTM name, as it appears in the modifications column of the input CSV,
# to the short abbreviation appended to the site label in the output.
variable_ptm_dict = {
    "K": {
        "Ac": "ac",
        # NOTE(review): "Bu" maps to "me" (not "bu") and lysine has no "Me"
        # key - presumably intentional; confirm with the data owner.
        "Bu": "me",
        "Me2": "me2",
        "Me3": "me3",
        "Cr": "cr",
        "Hib": "hib",
        "Su": "su"
    },
    "R": {
        "Me": "me",
        "Me2": "me2",
        "Cit": "cit",
    },
    "M": {
        "Ox": "ox",
    },
}
# Abbreviation given to a tracked residue that carries no recognised PTM.
unmodified_abbreviation = "un"

fasta_file_name = "/home/sander/Documents/Proteomics/msqrob/data/HumanMycoCrap_2018_nr.fasta"
progenesis_file_name = "/home/sander/Documents/Proteomics/msqrob/data/Histone_ID_exported.csv"
# Output path = input path with its extension replaced by "_SW.csv".
# splitext copes with extensions of any length, unlike a fixed [:-4] slice.
out_file_name = os.path.splitext(progenesis_file_name)[0] + "_SW.csv"
# When True, the processing cells print diagnostics while they run.
verbose = True

In [None]:
import pyteomics
from pyteomics import fasta
import csv

def description_format(x):
    """Extract the protein identifier from a FASTA header line.

    Takes the first whitespace-delimited token of ``x`` and returns the part
    after its last "|" (the whole token when it contains no "|").
    Raises IndexError if ``x`` is empty or whitespace-only.
    """
    return x.split()[0].split("|")[-1]


# Map protein identifier -> sequence for every entry in the FASTA file.
# If two entries yield the same identifier, the later one wins.
# The reader is used as a context manager so the file is closed after loading.
with pyteomics.fasta.read(fasta_file_name) as fasta_entries:
    protein_sequences = {
        description_format(description): sequence
        for description, sequence in fasta_entries
    }


In [None]:
def parsePTMs(ptm_string, peptide_sequence, peptide_start_index, verbose=False):
    """Build per-residue PTM labels for one peptide.

    Every residue of ``peptide_sequence`` whose letter is a key of the
    module-level ``variable_ptm_dict`` gets a site key made of the residue
    letter followed by ``peptide_start_index`` plus its 1-based offset in the
    peptide. Each site starts as ``unmodified_abbreviation`` and is overwritten
    when ``ptm_string`` reports a known modification on that site.

    Args:
        ptm_string: "|"-separated modification entries. Each entry must split
            on whitespace into exactly three tokens: position, residue and
            PTM name. The first and last character of the position and
            residue tokens are stripped (e.g. ``[3] (K) Ac`` - example format
            assumed from the input column header; confirm). Empty or None
            means no modifications.
        peptide_sequence: Amino-acid sequence of the peptide.
        peptide_start_index: Offset added to in-peptide positions to obtain
            the reported site number.
        verbose: If True, print a message for every entry that is skipped.

    Returns:
        A list of strings ``"<residue><site number><abbreviation>"``, one per
        tracked residue of the peptide.

    Entries that are malformed (wrong token count, non-integer position),
    that point at a site not tracked for this peptide, or that name a PTM
    missing from ``variable_ptm_dict`` are skipped rather than raising.
    """

    def report_undefined(raw_ptm):
        # Single place for the diagnostic so all skip paths stay consistent.
        if verbose:
            print(
                "Undefined ptm {} on peptide {}".format(raw_ptm, peptide_sequence)
            )

    # Start with every trackable residue marked as unmodified.
    ptms = {
        "{}{}".format(
            aa,
            peptide_start_index + index + 1
        ): unmodified_abbreviation for index, aa in enumerate(
            peptide_sequence
        ) if aa in variable_ptm_dict
    }
    if ptm_string:
        for raw_ptm in ptm_string.split("|"):
            try:
                # The unpack is inside the try on purpose: an entry without
                # exactly three tokens raises ValueError just like a
                # non-numeric position, and both are skipped.
                loc, aa, ptm = raw_ptm.split()
                loc = int(loc[1: -1])
            except ValueError:
                report_undefined(raw_ptm)
                continue
            aa = aa[1: -1]
            format_string = "{}{}".format(aa, peptide_start_index + loc)
            # The site must exist already, i.e. the residue at that position
            # is tracked and matches the residue named in the entry.
            if format_string not in ptms:
                report_undefined(raw_ptm)
                continue
            try:
                ptms[format_string] = variable_ptm_dict[aa][ptm]
            except KeyError:
                report_undefined(raw_ptm)
                continue
    return ["{}{}".format(aa_loc, ptm) for aa_loc, ptm in ptms.items()]

# Copy the Progenesis CSV to out_file_name, appending one extra column to every
# row:
#   * rows before the header row (the first row containing "Sequence") get an
#     empty value,
#   * the header row gets "msqrob_proteins",
#   * every later row gets a ";"-joined list of
#     "<protein>_<start>-<end>_<site label>" strings, one per tracked residue
#     and per protein the peptide is assigned to (empty if no protein listed).
# Both files are opened with newline="" as the csv module requires, so that
# line endings and quoted embedded newlines are handled by csv itself.
with open(progenesis_file_name, "r", newline="") as raw_infile:
    header = True
    infile = csv.reader(raw_infile)
    with open(out_file_name, "w", newline="") as raw_outfile:
        outfile = csv.writer(raw_outfile)
        for row in infile:
            row_addition = ""
            if header:
                if "Sequence" in row:
                    # Header row found: remember the columns we need.
                    # row.index raises ValueError if a column is missing.
                    header = False
                    sequence_column_index = row.index("Sequence")
                    protein_column_index = row.index("Protein")
                    ptm_column_index = row.index("Variable modifications ([position] description)")
                    row_addition = "msqrob_proteins"
            else:
                peptide_sequence = row[sequence_column_index]
                proteins_string = row[protein_column_index]
                ptm_string = row[ptm_column_index]
                if proteins_string != "":
                    row_additions = []
                    for protein_id in proteins_string.split(";"):
                        # Progress output only when asked for; printing every
                        # peptide unconditionally floods the notebook.
                        if verbose:
                            print(peptide_sequence, protein_id)
                        # KeyError if the protein is absent from the FASTA
                        # lookup; .index raises ValueError if the peptide does
                        # not occur in that protein's sequence.
                        peptide_start_index = protein_sequences[protein_id].index(peptide_sequence)
                        if verbose:
                            print(peptide_start_index)
                        # NOTE(review): start is the 0-based offset and end is
                        # start + length, while the site numbers produced by
                        # parsePTMs are 1-based - confirm this mix is what
                        # downstream consumers expect.
                        peptide_end_index = peptide_start_index + len(peptide_sequence)
                        ptms = parsePTMs(ptm_string, peptide_sequence, peptide_start_index, verbose)
                        row_additions += [
                            "{}_{}-{}_{}".format(
                                protein_id,
                                peptide_start_index,
                                peptide_end_index,
                                ptm
                            ) for ptm in ptms
                        ]
                    row_addition = ";".join(row_additions)
            outfile.writerow(row + [row_addition])