In [1]:
# Configuration cell. NOTE: another configuration cell in this notebook assigns
# the same names, so whichever of the two is executed last is the one in effect.

# When True, the PTM parser prints every entry it cannot interpret.
verbose = False

# Label attached to a modifiable residue that carries no recognised PTM.
unmodified_abbreviation = "un"

# residue letter -> {PTM name as it appears in the export: short output label}
# NOTE(review): "Bu" maps to "me" and K has no "Me" key -- presumably deliberate
# (e.g. derivatised mono-methyl reported under another name); confirm.
variable_ptm_dict = dict(
    K=dict(Ac="ac", Bu="me", Me2="me2", Me3="me3", Cr="cr", Hib="hib", Su="su"),
    R=dict(Me="me", Me2="me2", Cit="cit"),
    M=dict(Ox="ox"),
)

fasta_file_name = "/mnt/d/MSqRob/hPTM_Parsing/Test cases/FASTA/HumanMycoCrap_2018_nr.fasta"
progenesis_file_name = "/mnt/d/MSqRob/hPTM_Parsing/Test cases/hPTMs/All_Peptide_IDs_For_MSqRob_RawAbundance.csv"
# The output path is the input path with its last four characters (".csv")
# replaced by "_SW.csv".
out_file_name = "{}_SW.csv".format(progenesis_file_name[:-4])

def ptm_format(raw_ptm):
    """Split one raw PTM entry into ``(loc, ptm, aa)``.

    The entry is split on whitespace. The first token is returned as the
    location, the second as the residue, and all remaining tokens are joined
    with single spaces to form the PTM name (an empty string when there are
    none).

    Raises:
        ValueError: if the entry has fewer than two whitespace-separated tokens.
    """
    position, residue, *name_tokens = raw_ptm.split()
    return position, " ".join(name_tokens), residue

In [12]:
# Configuration cell. NOTE: another configuration cell in this notebook assigns
# the same names, so whichever of the two is executed last is the one in effect.

# When True, the PTM parser prints every entry it cannot interpret.
verbose = True

# Label attached to a modifiable residue that carries no recognised PTM.
unmodified_abbreviation = "un"

# residue letter -> {PTM name as it appears in the export: short output label}
variable_ptm_dict = {}
variable_ptm_dict["K"] = {
    "Dimethyl": "me2",
    "Trimethyl": "me3",
    "EG 2-Hydroxyisobutyrylation": "hib",
}
variable_ptm_dict["R"] = {
    "Dimethyl": "me2",
    "EG Citrullination": "cit",
}

fasta_file_name = "/home/sander/Documents/Proteomics/msqrob/data/HumanMycoCrap_2018_nr.fasta"
progenesis_file_name = "/home/sander/Documents/Proteomics/msqrob/data/Novel_Export_ForSW_Parsing.csv"
# The output path is the input path with its last four characters (".csv")
# replaced by "_SW.csv".
out_file_name = "{}_SW.csv".format(progenesis_file_name[:-4])

def ptm_format(raw_ptm):
    """Split one raw PTM entry into ``(loc, ptm, aa)``.

    The entry is split on whitespace. The first token is returned as the
    location, the last token as the residue, and everything in between is
    joined with single spaces to form the PTM name (an empty string when
    there is nothing in between).

    Raises:
        ValueError: if the entry has fewer than two whitespace-separated tokens.
    """
    position, *name_tokens, residue = raw_ptm.split()
    return position, " ".join(name_tokens), residue

In [2]:
import pyteomics
from pyteomics import fasta
import csv

def description_format(x):
    """Reduce a FASTA description line to a protein identifier.

    Takes the first whitespace-separated token of ``x`` and returns the part
    after its last ``|`` (the whole token when it contains no ``|``).

    Raises:
        IndexError: if ``x`` is empty or contains only whitespace.
    """
    return x.split()[0].split("|")[-1]


# Map every identifier to its sequence. The reader is used as a context manager
# so the FASTA file handle is closed as soon as the dictionary has been built.
# NOTE: entries that reduce to the same identifier overwrite each other (the
# last one read wins).
with pyteomics.fasta.read(fasta_file_name) as fasta_entries:
    protein_sequences = {
        description_format(description): sequence
        for description, sequence in fasta_entries
    }


In [4]:
def parsePTMs(ptm_string, peptide_sequence, peptide_start_index, verbose=False,
              ptm_dict=None, unmodified=None, formatter=None):
    """Label every modifiable residue of a peptide with its PTM state.

    Each residue of ``peptide_sequence`` whose letter is a key of the PTM
    dictionary gets an entry ``"<residue><position>"``, where position is
    ``peptide_start_index + offset_in_peptide + 1``. Entries start out with
    the "unmodified" label and are overwritten by the modifications listed in
    ``ptm_string``.

    Args:
        ptm_string: ``|``-separated raw PTM entries; may be empty.
        peptide_sequence: residue letters of the peptide.
        peptide_start_index: offset of the peptide inside its protein; added
            to every position.
        verbose: when True, print every entry that cannot be interpreted.
        ptm_dict: residue letter -> {PTM name: label}. Defaults to the
            module-level ``variable_ptm_dict``.
        unmodified: label for residues without a recognised PTM. Defaults to
            the module-level ``unmodified_abbreviation``.
        formatter: callable turning one raw entry into ``(loc, ptm, aa)``.
            Defaults to the module-level ``ptm_format``.

    Returns:
        A list of ``"<residue><position><label>"`` strings, one per modifiable
        residue, in peptide order.

    Raises:
        ValueError: if ``formatter`` cannot split an entry. (The original code
            reached an undefined name here and died with ``NameError``.)
    """
    # Resolve the defaults at call time so the function follows whichever
    # configuration cell was executed most recently.
    if ptm_dict is None:
        ptm_dict = variable_ptm_dict
    if unmodified is None:
        unmodified = unmodified_abbreviation
    if formatter is None:
        formatter = ptm_format

    def _report(raw_ptm):
        # Single place for the diagnostic shared by all rejection paths.
        if verbose:
            print(
                "Undefined ptm {} on peptide {}".format(raw_ptm, peptide_sequence)
            )

    ptms = {
        "{}{}".format(aa, peptide_start_index + index + 1): unmodified
        for index, aa in enumerate(peptide_sequence)
        if aa in ptm_dict
    }
    if ptm_string:
        for raw_ptm in ptm_string.split("|"):
            try:
                loc, ptm, aa = formatter(raw_ptm)
            except ValueError as parse_error:
                _report(raw_ptm)
                # Abort, as the original did, but with a real exception.
                raise ValueError(
                    "Cannot parse ptm {!r} on peptide {}".format(
                        raw_ptm, peptide_sequence
                    )
                ) from parse_error
            # The first and last characters of the location and residue tokens
            # are dropped -- presumably enclosing brackets; confirm against
            # the export format.
            try:
                loc = int(loc[1:-1])
            except ValueError:
                _report(raw_ptm)
                continue
            aa = aa[1:-1]
            site = "{}{}".format(aa, peptide_start_index + loc)
            if site not in ptms:
                # Not a modifiable residue at that position of this peptide.
                _report(raw_ptm)
                continue
            try:
                ptms[site] = ptm_dict[aa][ptm]
            except KeyError:
                # PTM name not configured for this residue: leave it unmodified.
                _report(raw_ptm)
                continue
    return ["{}{}".format(site, label) for site, label in ptms.items()]

# Stream the Progenesis export row by row and write an annotated copy with one
# extra trailing column ("msqrob_proteins").
# Both files are opened with newline="" as the csv module requires: otherwise
# quoted fields containing line breaks are misread and, on platforms that
# translate "\n", every written row gets an extra "\r".
with open(progenesis_file_name, "r", newline="") as raw_infile, \
        open(out_file_name, "w", newline="") as raw_outfile:
    infile = csv.reader(raw_infile)
    outfile = csv.writer(raw_outfile)
    header = True  # still looking for the row that holds the column names
    for row in infile:
        row_addition = ""
        if header:
            # Everything up to and including the column-name row is copied
            # through; only the column-name row gets the new column's title.
            if "Sequence" in row:
                header = False
                sequence_column_index = row.index("Sequence")
                protein_column_index = row.index("Protein")
                ptm_column_index = row.index("Variable modifications ([position] description)")
                row_addition = "msqrob_proteins"
            outfile.writerow(row + [row_addition])
            continue

        peptide_sequence = row[sequence_column_index]
        proteins_string = row[protein_column_index]
        ptm_string = row[ptm_column_index]
        if proteins_string == "":
            # No protein assigned: keep the row, leave the new column empty.
            outfile.writerow(row + [row_addition])
            continue

        for protein_id in proteins_string.split(";"):
            # Raises KeyError for an identifier missing from the FASTA and
            # ValueError when the peptide does not occur in that protein.
            # str.index yields the 0-based offset of the first occurrence.
            peptide_start_index = protein_sequences[protein_id].index(peptide_sequence)
            peptide_end_index = peptide_start_index + len(peptide_sequence)
            ptms = parsePTMs(ptm_string, peptide_sequence, peptide_start_index, verbose)
            # One output row per (protein, modifiable residue). A peptide
            # without any modifiable residue yields no row for that protein.
            for ptm in ptms:
                row_addition = "{}_{}-{}_{}".format(
                    protein_id,
                    peptide_start_index,
                    peptide_end_index,
                    ptm
                )
                outfile.writerow(row + [row_addition])