In [None]:
import pandas as pd
import numpy as np
import ms2pip.ms2pipC
import ms2pip.retention_time
import pyteomics.parser
import pyteomics.fasta
import h5py
import os


def read_proteins_and_peptides_from_fasta(
    fasta_file_names,
    protease,
    missed_cleavages,
    min_peptide_length,
    max_peptide_length,
    standard_amino_acids,
    create_targets,
    create_decoys,
    **kwargs,
):
    """Digest all FASTA files into one protein dict and one peptide dict.

    Every file is handed to ``__read_proteins_and_peptides_from_fasta``,
    once as-is when ``create_targets`` is truthy and once with
    ``reversed_protein_decoy=True`` when ``create_decoys`` is truthy. For a
    given file the target pass always runs before the decoy pass. Both
    passes accumulate into the same two dictionaries.

    Args:
        fasta_file_names: Iterable of FASTA file paths, processed in order.
        protease, missed_cleavages, min_peptide_length, max_peptide_length,
        standard_amino_acids: Forwarded unchanged to the per-file reader.
        create_targets: Whether to run the forward (target) pass.
        create_decoys: Whether to run the reversed (decoy) pass.
        **kwargs: Forwarded unchanged to the per-file reader.

    Returns:
        Tuple ``(proteins, peptides)`` as returned by the last reader call,
        or two empty dicts when ``fasta_file_names`` is empty.

    Raises:
        ValueError: If neither targets nor decoys are requested.
    """
    print(f"Reading fastas")
    proteins, peptides = {}, {}
    if not create_targets and not create_decoys:
        raise ValueError("No targets or decoys to create")
    # Value of `reversed_protein_decoy` for each enabled pass, targets first.
    decoy_flags = [
        is_decoy
        for is_decoy, enabled in ((False, create_targets), (True, create_decoys))
        if enabled
    ]
    for fasta_file_name in fasta_file_names:
        print(f"Reading {fasta_file_name}")
        for reversed_protein_decoy in decoy_flags:
            proteins, peptides = __read_proteins_and_peptides_from_fasta(
                fasta_file_name,
                protease,
                missed_cleavages,
                min_peptide_length,
                max_peptide_length,
                standard_amino_acids,
                reversed_protein_decoy,
                proteins,
                peptides,
                **kwargs,
            )
    return proteins, peptides


def __read_proteins_and_peptides_from_fasta(
    fasta_file_name,
    protease,
    missed_cleavages,
    min_peptide_length,
    max_peptide_length,
    standard_amino_acids,
    reversed_protein_decoy,
    proteins,
    peptides,
    **kwargs,
):
    """Read one FASTA file, digest its proteins and update both dicts in place.

    Args:
        fasta_file_name: Path handed to ``pyteomics.fasta.FASTA``.
        protease: Key into ``pyteomics.parser.expasy_rules``.
        missed_cleavages: Passed to ``pyteomics.parser.cleave``.
        min_peptide_length: Exclusive lower bound on peptide length.
        max_peptide_length: Exclusive upper bound on peptide length.
        standard_amino_acids: If truthy, drop peptides containing any letter
            outside ``ARNDBCEQZGHILKMFPSTWYV``.
        reversed_protein_decoy: If truthy, the protein name is prefixed with
            ``DECOY_`` and its sequence is reversed before digestion.
        proteins: Dict ``name -> info`` that is extended in place. A protein
            whose (possibly prefixed) name is already present is skipped.
        peptides: Dict ``peptide -> [protein index, ...]`` extended in place.
        **kwargs: Ignored.

    Returns:
        The same ``(proteins, peptides)`` objects that were passed in.
    """
    allowed_amino_acids = set("ARNDBCEQZGHILKMFPSTWYV")
    # Continue numbering after the proteins collected by earlier calls.
    # Restarting at 0 for every file / decoy pass would give several proteins
    # the same index and make the peptide -> protein references ambiguous.
    # This assumes existing entries are numbered 0..len(proteins)-1, which
    # holds when `proteins` was only ever filled by this function.
    protein_index = len(proteins)
    for description, sequence in pyteomics.fasta.FASTA(fasta_file_name):
        protein_info = pyteomics.fasta.parse(description)
        protein = protein_info.pop("entry")
        if reversed_protein_decoy:
            protein = f"DECOY_{protein}"
            sequence = sequence[::-1]
        if protein in proteins:
            # Duplicate name: keep the first occurrence, consume no index.
            continue
        protein_info["sequence"] = sequence
        protein_info["index"] = protein_index
        proteins[protein] = protein_info
        for peptide in pyteomics.parser.cleave(
            sequence,
            pyteomics.parser.expasy_rules[protease],
            missed_cleavages
        ):
            # NOTE(review): both bounds are strict, so peptides of exactly
            # min_peptide_length or max_peptide_length are dropped. Confirm
            # this is intended before changing it.
            if not (min_peptide_length < len(peptide) < max_peptide_length):
                continue
            if standard_amino_acids and not allowed_amino_acids.issuperset(peptide):
                continue
            peptides.setdefault(peptide, []).append(protein_index)
        protein_index += 1
    return proteins, peptides


def peptide_dict_to_peprec(
    peptides,
    variable_ptms,
    fixed_ptms,
    ptm_dict,
    **kwargs,
):
    """Turn the peptide dictionary into a peprec DataFrame.

    Args:
        peptides: Dict ``peptide sequence -> list of protein indices``.
        variable_ptms: Dict of variable PTMs, forwarded to
            ``generate_ptm_combinations``.
        fixed_ptms: Dict of fixed PTMs, forwarded to
            ``generate_ptm_combinations``.
        ptm_dict: Dict ``ptm key -> tuple``; element 0 of the tuple is used
            as the modification name in the ``modifications`` column.
        **kwargs: Ignored.

    Returns:
        DataFrame indexed by a 0-based integer ``index`` with columns
        ``peptide`` and ``proteins`` (protein indices joined with ``;``).
        When at least one variable or fixed PTM is configured there is one
        row per PTM combination and an extra ``modifications`` column holding
        ``position|name`` pairs joined with ``|``, or ``-`` when unmodified.
        Position 0 is the N-terminus, 1..len(peptide) are the residues and
        -1 is the C-terminus.
    """
    print(f"Creating peprec")
    peptide_list = [
        (peptide, ";".join(str(p) for p in protein_list))
        for peptide, protein_list in sorted(peptides.items())
    ]
    columns = ["peptide", "proteins"]
    if (len(variable_ptms) + len(fixed_ptms)) > 0:
        columns.append("modifications")
        modified_peptide_list = []
        for peptide, proteins in peptide_list:
            # Combinations are generated for ".PEPTIDE.", so the last slot
            # (len(peptide) + 1) is the C-terminus. MS2PIP's PEPREC format
            # addresses the C-terminus as -1, not as len(peptide) + 1.
            c_term_slot = len(peptide) + 1
            for ptm_combination in generate_ptm_combinations(
                f".{peptide}.",
                # One independent empty list per slot; `[[]] * n` would hand
                # over n references to a single shared list.
                [[] for _ in range(len(peptide) + 2)],
                variable_ptms,
                fixed_ptms,
                static_ptms=False
            ):
                parsed_ptm_combination = "|".join(
                    f"{-1 if i == c_term_slot else i}|{ptm_dict[ptm][0]}"
                    for i, ptm in enumerate(ptm_combination)
                    if ptm != ""
                )
                if parsed_ptm_combination == "":
                    parsed_ptm_combination = "-"
                modified_peptide_list.append(
                    (peptide, proteins, parsed_ptm_combination)
                )
        peptide_list = modified_peptide_list
    peprec = pd.DataFrame(
        peptide_list,
        columns=columns
    )
    peprec["index"] = np.arange(peprec.shape[0])
    peprec = peprec.set_index("index")
    return peprec


def protein_dict_to_protrec(proteins, **kwargs):
    """Build the protrec DataFrame from the protein dictionary.

    The column set is taken from the keys of the first protein in
    ``proteins`` (sorted alphabetically). Proteins lacking one of those keys
    get an empty string in that column; keys that only occur in later
    proteins are not included. Rows are ordered by each protein's
    ``"index"`` entry, which also becomes the DataFrame index.

    Args:
        proteins: Non-empty dict ``protein name -> info dict``; every info
            dict must contain an ``"index"`` key.
        **kwargs: Ignored.

    Returns:
        DataFrame with a ``protein`` column plus the info columns, indexed
        by ``index``.
    """
    print(f"Creating protrec")
    first_info = next(iter(proteins.values()))
    columns = sorted(first_info)
    ordered_proteins = sorted(
        proteins.items(),
        key=lambda item: item[1]["index"]
    )
    rows = []
    for protein_name, protein_info in ordered_proteins:
        row = [protein_name]
        for column in columns:
            row.append(protein_info[column] if column in protein_info else "")
        rows.append(tuple(row))
    protrec = pd.DataFrame(rows, columns=["protein"] + columns)
    protrec.set_index("index", inplace=True)
    return protrec


def predict_fragrec(
    peprec,
    model,
    cpu_count,
    charges,
    ptm_dict,
    variable_ptms,
    fixed_ptms,
    batch_size,
    **kwargs,
):
    """Predict fragment intensities for every peptide and precursor charge.

    The peprec is processed in slices of ``batch_size`` rows. For each slice
    MS2PIP is run once per charge; the per-charge predictions are joined on
    ``(spec_id, ion, ionnumber, mz)`` into one ``prediction_charge_<charge>``
    column each. All slices are then concatenated and sorted by ``mz``.

    Args:
        peprec: DataFrame with a ``peptide`` column and, when PTMs are
            configured, a ``modifications`` column. Its index values are
            used as MS2PIP ``spec_id`` and come back as ``peptide_index``.
        model: MS2PIP model name.
        cpu_count: Passed to MS2PIP as ``num_cpu``.
        charges: Non-empty sequence of precursor charges.
        ptm_dict: Dict whose values are comma-joined into MS2PIP's ``ptm``
            parameter entries.
        variable_ptms, fixed_ptms: Only their combined length is used, to
            decide whether ``peprec["modifications"]`` is read or every
            peptide is treated as unmodified (``"-"``).
        batch_size: Number of peprec rows per MS2PIP run.
        **kwargs: Ignored.

    Returns:
        DataFrame indexed by a 0-based ``index`` (in ascending ``mz`` order)
        with columns ``peptide_index``, ``ionnumber``, ``mz``, one
        ``prediction_charge_<charge>`` per charge, ``b_ion`` and ``y_ion``.

    Raises:
        ValueError: If ``peprec`` has no rows or ``charges`` is empty.
    """
    print(f"Predicting fragrec")
    # Fail early with a clear message instead of an unbound-variable error.
    if peprec.shape[0] == 0:
        raise ValueError("Cannot predict fragments: peprec contains no peptides")
    if len(charges) == 0:
        raise ValueError("Cannot predict fragments: no charges were requested")
    ms2pip_params = {
        "ms2pip": {
            "model": model,
            "frag_error": 0,
            "ptm": [
                ",".join([str(s) for s in ptm_values]) for ptm_values in ptm_dict.values()
            ],
            "sptm": [],
            "gptm": [],
        }
    }
    has_ptms = (len(variable_ptms) + len(fixed_ptms)) > 0
    partial_fragrecs = []
    for start in range(0, peprec.shape[0], batch_size):
        end = start + batch_size
        charged_peprec = peprec[["peptide"]][start:end].copy()
        charged_peprec["spec_id"] = peprec.index[start:end]
        if has_ptms:
            charged_peprec["modifications"] = peprec["modifications"][start:end]
        else:
            charged_peprec["modifications"] = "-"
        partial_fragrec = None
        for charge in charges:
            charged_peprec["charge"] = charge
            charged_fragrec = ms2pip.ms2pipC.MS2PIP(
                charged_peprec,
                num_cpu=cpu_count,
                params=ms2pip_params,
                return_results=True,
            ).run()
            del charged_fragrec["charge"]
            charged_fragrec.set_index(["spec_id", "ion", "ionnumber", "mz"], inplace=True)
            if partial_fragrec is not None:
                # Left join: add this charge's `prediction` column to the
                # columns already collected for the earlier charges.
                charged_fragrec = partial_fragrec.join(charged_fragrec)
            partial_fragrec = charged_fragrec.rename(
                columns={'prediction': f'prediction_charge_{charge}'}
            )
        partial_fragrecs.append(partial_fragrec)
    # One concat at the end instead of growing a frame batch by batch.
    fragrec = pd.concat(partial_fragrecs)
    fragrec.reset_index(inplace=True)
    fragrec.sort_values(by="mz", inplace=True)
    fragrec.rename(
        columns={'spec_id': f'peptide_index'},
        inplace=True
    )
    fragrec["b_ion"] = fragrec["ion"]=="B"
    fragrec["y_ion"] = fragrec["ion"]=="Y"
    del fragrec["ion"]
    fragrec["index"] = np.arange(fragrec.shape[0])
    fragrec.set_index("index", inplace=True)
    return fragrec


def predict_rts(
    peprec,
    **kwargs,
):
    """Predict a retention time for every row of ``peprec``.

    A working copy with the ``peptide`` and ``modifications`` columns is
    handed to ``ms2pip.retention_time.RetentionTime().add_rt_predictions``;
    the ``rt`` column read back from that copy is returned. ``peprec`` itself
    is not modified.

    Args:
        peprec: DataFrame with a ``peptide`` column. If it also has a
            ``modifications`` column that column is used; otherwise every
            peptide is treated as unmodified (``"-"``).
        **kwargs: Ignored.

    Returns:
        The values of the ``rt`` column, in ``peprec`` row order.
    """
    print(f"Predicting rts")
    # Work on a copy so adding columns neither touches the caller's frame
    # nor assigns into a view of it.
    rt_peprec = peprec[["peptide"]].copy()
    # Decide from the data itself whether modifications are available. The
    # PTM settings are not parameters of this function, so they cannot be
    # consulted here.
    if "modifications" in peprec.columns:
        rt_peprec["modifications"] = peprec["modifications"]
    else:
        rt_peprec["modifications"] = "-"
    # Relies on add_rt_predictions adding an "rt" column to the frame in place.
    ms2pip.retention_time.RetentionTime().add_rt_predictions(rt_peprec)
    rts = rt_peprec["rt"].values
    return rts


def write_hdf_file(
    fasta_file_names,
    output_folder,
    protrec,
    peprec,
    fragrec,
    create_targets,
    create_decoys,
    **kwargs
):
    """Write proteins, peptides and fragments to a single HDF file.

    The file is placed in ``output_folder``. Its name is the extension-less
    base names of all FASTA files joined with ``_``, followed by
    ``_concatenated_decoy.hdf`` (targets and decoys), ``.hdf`` (targets
    only) or ``_decoy.hdf`` (no targets). An existing file is overwritten.

    Args:
        fasta_file_names: FASTA paths; used for the file name and stored as
            the ``fasta_file_names`` attribute (stringified).
        output_folder: Folder to write into.
        protrec: DataFrame written to the ``proteins`` group.
        peprec: DataFrame written to the ``peptides`` group.
        fragrec: DataFrame written to the ``fragments`` group.
        create_targets, create_decoys: Select the file name suffix.
        **kwargs: Every entry is stored as a file attribute. Strings and
            non-iterable values are stored as-is, other iterables as
            ``str(value)``.
    """
    # splitext keeps the full name of a file without an extension, where
    # dropping the last "."-separated part would leave an empty string.
    base_name = "_".join(
        os.path.splitext(os.path.basename(fasta_file_name))[0]
        for fasta_file_name in fasta_file_names
    )
    file_name = os.path.join(output_folder, base_name)
    if create_targets:
        if create_decoys:
            file_name = f"{file_name}_concatenated_decoy.hdf"
        else:
            file_name = f"{file_name}.hdf"
    else:
        file_name = f"{file_name}_decoy.hdf"
    print(f"Wrting to {file_name}")
    with h5py.File(file_name, "w") as hdf_file:
        hdf_file.attrs["fasta_file_names"] = str(fasta_file_names)
        for arg, value in kwargs.items():
            if isinstance(value, str):
                hdf_file.attrs[arg] = value
            else:
                # Containers (lists, dicts, ...) are stored as their string
                # representation; scalars are stored directly.
                try:
                    iter(value)
                except TypeError:
                    hdf_file.attrs[arg] = value
                else:
                    hdf_file.attrs[arg] = str(value)
        write_dataframe_to_hdf_group_name(hdf_file, "proteins", protrec)
        write_dataframe_to_hdf_group_name(hdf_file, "peptides", peprec)
        write_dataframe_to_hdf_group_name(hdf_file, "fragments", fragrec)

        
def write_dataframe_to_hdf_group_name(hdf_file, group_name, data, **kwargs,):
    """Store every column of ``data`` as a dataset in a fresh HDF group.

    An existing group called ``group_name`` is deleted first. Each column is
    written lzf-compressed under its column name; if that raises
    ``TypeError`` the write is retried with ``h5py.string_dtype()``. Only
    the columns are written, not the DataFrame index.

    Args:
        hdf_file: Open, writable HDF file (or group) object.
        group_name: Name of the group to (re)create.
        data: DataFrame whose columns are written.
        **kwargs: Ignored.
    """
    if group_name in hdf_file:
        del hdf_file[group_name]
    target_group = hdf_file.create_group(group_name)
    for column_name in data.columns:
        dataset_options = {"data": data[column_name], "compression": "lzf"}
        try:
            target_group.create_dataset(column_name, **dataset_options)
        except TypeError:
            # Retry with an explicit string dtype for columns that could not
            # be stored with their inferred dtype.
            target_group.create_dataset(
                column_name,
                dtype=h5py.string_dtype(),
                **dataset_options
            )

                
def generate_ptm_combinations_recursively(ptms, selected=None):
    """Yield every way of picking one option per position from ``ptms``.

    Despite the name, the combinations are produced iteratively with
    ``itertools.product``. The order is the same as nested loops with the
    first position varying slowest.

    Args:
        ptms: Sequence with one iterable of options per position.
        selected: Optional list of choices already made for the first
            ``len(selected)`` positions; only the remaining positions are
            expanded. Defaults to no prior choices.

    Yields:
        A new list of length ``len(ptms)`` per combination. Nothing is
        yielded if any remaining position has no options.
    """
    import itertools

    # Copy so that yielded lists never share state with the caller's
    # argument or with a default object reused across calls.
    prefix = [] if selected is None else list(selected)
    for remainder in itertools.product(*ptms[len(prefix):]):
        yield prefix + list(remainder)


def generate_ptm_combinations(
    sequence,
    ptms,
    variable_ptms,
    fixed_ptms,
    static_ptms=False,
    **kwargs,
):
    """Yield all PTM assignments for a terminus-flanked sequence.

    ``sequence`` has one flanking character on each side of the residues
    (e.g. ``".PEPTIDE."``). Slot 0 is the N-terminus, looked up in the PTM
    dicts as ``"n"``, and the last slot is the C-terminus, looked up as
    ``"c"``. The slots in between are looked up by residue letter.

    Options per slot are built as follows:
      * the pre-assigned options from ``ptms`` (for the terminal slots only
        when the flanking character is literally ``"n"`` / ``"c"``);
      * unless ``static_ptms`` is truthy and the slot already has options:
        the variable PTMs of the slot's letter, then either the fixed PTMs
        of that letter or, if it has none, ``""`` (unmodified).

    Args:
        sequence: Flanked sequence string.
        ptms: One list of pre-assigned PTM keys per slot; not modified.
        variable_ptms: Dict ``letter -> list of PTM keys``.
        fixed_ptms: Dict ``letter -> list of PTM keys``.
        static_ptms: If truthy, slots with pre-assigned options get nothing
            added to them.
        **kwargs: Ignored.

    Yields:
        One list per combination with a PTM key or ``""`` for every slot.
    """
    local_ptms = [[] for _ in sequence]
    if sequence[0] == "n":
        local_ptms[0] += ptms[0]
    if sequence[-1] == "c":
        # Extend the local list instead of rebinding it to ptms[-1], so the
        # additions below can never leak into the caller's list.
        local_ptms[-1] += ptms[-1]
    for i, ptm in enumerate(ptms[1:-1]):
        local_ptms[i + 1] += ptm
    for i, aa in enumerate(f"n{sequence[1:-1]}c"):
        if (not static_ptms) or (len(local_ptms[i]) == 0):
            if aa in variable_ptms:
                local_ptms[i] += variable_ptms[aa]
            if aa in fixed_ptms:
                # Fixed PTM: no unmodified option is offered for this slot.
                local_ptms[i] += fixed_ptms[aa]
            else:
                local_ptms[i].append("")
    yield from generate_ptm_combinations_recursively(local_ptms)

In [None]:
# Input FASTA files; they are read in this order and merged into one database.
# NOTE: these are absolute, machine-specific paths; adjust them before running
# the notebook elsewhere.
fasta_file_names = [
    "/home/sander/Documents/Proteomics/data/databases/fastas/ecoli.fasta",
    "/home/sander/Documents/Proteomics/data/databases/fastas/crap.fasta"
]
# Folder the resulting .hdf file is written into.
output_folder = "/home/sander/Documents/Proteomics/data/databases"

# All settings below are passed as keyword arguments to the pipeline
# functions and are stored as attributes of the output HDF file.
parameters = {
    # Residue letter ("n" / "c" for the termini) -> ptm_dict keys that may or
    # may not be present on that residue.
    "variable_ptms": {
        "M": ["M_ox"],
    # Disabled N-terminal acetylation. NOTE(review): ptm_dict below defines
    # "N_ac", not "n_ac", so the keys must be aligned before enabling this.
    #     "n": ["n_ac"],
    },
    # Residue letter -> ptm_dict keys; residues listed here get no
    # unmodified variant.
    "fixed_ptms": {
        "C": ["C_cam"]
    },
    # PTM key -> tuple. The elements are comma-joined into MS2PIP's "ptm"
    # parameter and element 0 is the name written to the peprec
    # "modifications" column. Every entry is passed to MS2PIP, whether or not
    # it is referenced by variable_ptms / fixed_ptms.
    "ptm_dict": {
        "M_ox": ("Oxidation" , 15.994915, "opt", "M"),
        "C_cam": ("Carbamidomethyl" , 57.021464, "opt", "C"),
        "N_ac": ("Acetyl" , 42.010565, "opt", "N-term"),
    },
    # MS2PIP model name.
    "model": "HCD",
    # Key into pyteomics.parser.expasy_rules.
    "protease": "trypsin",
    "missed_cleavages": 1,
    # Both length bounds are compared strictly, so only peptides with
    # min_peptide_length < length < max_peptide_length are kept.
    "min_peptide_length": 5,
    "max_peptide_length": 25,
    # Drop peptides containing letters outside "ARNDBCEQZGHILKMFPSTWYV".
    "standard_amino_acids": True,
    # Targets are the sequences as given; decoys are the reversed sequences
    # with a "DECOY_" name prefix.
    "create_targets": True,
    "create_decoys": True,
    # One prediction column is produced per precursor charge.
    "charges": [1, 2, 3],
    # Passed to MS2PIP as num_cpu.
    "cpu_count": 8,
    # Number of peprec rows per MS2PIP run.
    "batch_size": 10**5,
}

In [None]:
# 1. Digest all FASTA files into protein and peptide dictionaries.
proteins, peptides = read_proteins_and_peptides_from_fasta(fasta_file_names, **parameters)
# 2. Convert both dictionaries to DataFrames.
protrec = protein_dict_to_protrec(proteins, **parameters)
peprec = peptide_dict_to_peprec(peptides, **parameters)
# Retention-time prediction is currently disabled. NOTE(review): the call
# below references `modifications`, which is not defined in this notebook.
# peprec["rt"] = predict_rts(peprec, modifications)
# 3. Predict fragment intensities for every peptide and charge.
fragrec = predict_fragrec(peprec, **parameters)
# 4. Write proteins, peptides and fragments to one HDF file.
write_hdf_file(fasta_file_names, output_folder, protrec, peprec, fragrec, **parameters)