In [29]:
import pandas as pd
import numpy as np
import ms2pip.ms2pipC
import ms2pip.retention_time
import pyteomics.parser
import pyteomics.fasta
import h5py


def read_proteins_and_peptides_from_fasta(
    file_name,
    protease="trypsin",
    missed_cleavages=0,
    min_peptide_length=6,
    max_peptide_length=30,
    standard_amino_acids=True,
    reversed_protein_decoy=False
):
    """Read a FASTA file and digest every protein in silico.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.
    protease : str
        Key into ``pyteomics.parser.expasy_rules`` selecting the cleavage rule.
    missed_cleavages : int
        Passed to ``pyteomics.parser.cleave`` as the number of allowed
        missed cleavages.
    min_peptide_length, max_peptide_length : int
        Inclusive bounds on the length of the peptides that are kept.
    standard_amino_acids : bool
        If true, peptides containing any character outside
        ``ARNDBCEQZGHILKMFPSTWYV`` (the 20 proteinogenic residues plus the
        ambiguity codes B and Z) are discarded.
    reversed_protein_decoy : bool
        If true, every sequence is reversed before digestion and the protein
        name is prefixed with ``DECOY_``.

    Returns
    -------
    proteins : dict
        Maps the protein entry name to the parsed header fields, extended
        with ``"sequence"`` and ``"index"`` (position in the FASTA file).
    peptides : dict
        Maps each retained peptide sequence to the list of indices of the
        proteins that produced it.

    Raises
    ------
    KeyError
        If ``protease`` is not a known expasy rule, or a parsed header has
        no ``"entry"`` field.
    """
    print(f"Reading {file_name}")
    # Loop invariants: resolve the rule and build the alphabet only once.
    cleavage_rule = pyteomics.parser.expasy_rules[protease]
    allowed_residues = frozenset("ARNDBCEQZGHILKMFPSTWYV")
    proteins = {}
    peptides = {}
    # The context manager guarantees the file handle is released, also when
    # parsing fails halfway through.
    with pyteomics.fasta.FASTA(file_name) as fasta_file:
        for protein_index, (description, sequence) in enumerate(fasta_file):
            protein_info = pyteomics.fasta.parse(description)
            protein = protein_info.pop("entry")
            if reversed_protein_decoy:
                protein = f"DECOY_{protein}"
                sequence = sequence[::-1]
            protein_info["sequence"] = sequence
            protein_info["index"] = protein_index
            proteins[protein] = protein_info
            for peptide in pyteomics.parser.cleave(
                sequence,
                cleavage_rule,
                missed_cleavages
            ):
                # Bounds are inclusive: a peptide of exactly the minimum or
                # maximum length is kept.
                if not (min_peptide_length <= len(peptide) <= max_peptide_length):
                    continue
                if standard_amino_acids and not allowed_residues.issuperset(peptide):
                    continue
                peptides.setdefault(peptide, []).append(protein_index)
    return proteins, peptides


def peptide_dict_to_peprec(peptides):
    """Turn a peptide -> protein-indices mapping into a peptide record table.

    Each key of ``peptides`` becomes one row. The ``peptide`` column holds the
    key, the ``proteins`` column holds the associated values converted to
    strings and joined with ``";"``. Rows are numbered 0..n-1 in dictionary
    order and that numbering is used as the index, named ``"index"``.
    """
    print("Creating peprec")
    rows = []
    for sequence, protein_indices in peptides.items():
        rows.append((sequence, ";".join(map(str, protein_indices))))
    peprec = pd.DataFrame(rows, columns=["peptide", "proteins"])
    peprec["index"] = np.arange(len(peptides))
    return peprec.set_index("index")


def protein_dict_to_protrec(proteins):
    """Turn a protein -> info mapping into a protein record table.

    Parameters
    ----------
    proteins : dict
        Maps a protein name to a dict of fields. Every info dict must contain
        an ``"index"`` entry, which is used both to order the rows and as the
        index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per protein, ordered by ``"index"``. The first column is
        ``"protein"`` (the dict key), followed by the alphabetically sorted
        union of all field names found in any info dict. Fields missing for a
        given protein are filled with ``""``. An empty input yields an empty
        frame with a ``"protein"`` column and an index named ``"index"``.
    """
    print("Creating protrec")
    if not proteins:
        return pd.DataFrame(
            columns=["protein"],
            index=pd.Index([], name="index"),
        )
    # Collect the field names over all proteins, not just the first one:
    # headers do not necessarily carry the same set of fields.
    columns = sorted(set().union(*(info.keys() for info in proteins.values())))
    ordered_proteins = sorted(proteins.items(), key=lambda item: item[1]["index"])
    protrec = pd.DataFrame(
        [
            tuple([protein_name] + [protein_info.get(column, "") for column in columns])
            for protein_name, protein_info in ordered_proteins
        ],
        columns=["protein"] + columns
    )
    return protrec.set_index("index")


def predict_fragrec(
    peprec,
    ms2pip_params,
    cpu_count=8,
    charges=(2, 3),
    modifications=False,
):
    """Predict fragment intensities for every peptide at several charges.

    Parameters
    ----------
    peprec : pandas.DataFrame
        Must contain a ``"peptide"`` column, and a ``"modifications"`` column
        when ``modifications`` is true. Its index is used as the spectrum id.
    ms2pip_params : dict
        Forwarded unchanged as ``params`` to ``ms2pip.ms2pipC.MS2PIP``.
    cpu_count : int
        Forwarded as ``num_cpu`` to ``ms2pip.ms2pipC.MS2PIP``.
    charges : iterable of int
        Precursor charges to predict; one MS2PIP run is made per charge.
    modifications : bool
        If false, every peptide is submitted with modifications ``"-"``;
        otherwise ``peprec["modifications"]`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per fragment, sorted by ``"mz"`` and indexed 0..n-1 under the
        name ``"index"``. Columns include ``"peptide_index"``,
        ``"ionnumber"``, ``"mz"``, one ``"prediction_charge_<c>"`` column per
        requested charge, and the booleans ``"b_ion"`` / ``"y_ion"``.

    Raises
    ------
    ValueError
        If ``charges`` is empty.
    """
    print("Predicting fragrec")
    charges = list(charges)
    if not charges:
        raise ValueError("charges must contain at least one precursor charge")
    fragment_key = ["spec_id", "ion", "ionnumber", "mz"]
    fragrec = None
    for charge in charges:
        # Explicit copy: the columns added below must not touch (or warn
        # about touching) the caller's frame.
        charged_peprec = peprec[["peptide"]].copy()
        charged_peprec["spec_id"] = peprec.index
        charged_peprec["charge"] = charge
        charged_peprec["modifications"] = (
            peprec["modifications"] if modifications else "-"
        )
        charged_fragrec = ms2pip.ms2pipC.MS2PIP(
            charged_peprec,
            num_cpu=cpu_count,
            params=ms2pip_params,
            return_results=True,
        ).run()
        charged_fragrec = charged_fragrec.drop(columns="charge").set_index(fragment_key)
        # The first run defines the set of fragments; later runs are
        # left-joined onto it on the fragment key.
        if fragrec is None:
            fragrec = charged_fragrec
        else:
            fragrec = fragrec.join(charged_fragrec)
        fragrec = fragrec.rename(
            columns={"prediction": f"prediction_charge_{charge}"}
        )
    fragrec = (
        fragrec
        .reset_index()
        .sort_values(by="mz")
        .rename(columns={"spec_id": "peptide_index"})
    )
    # Replace the textual ion type by two boolean flags.
    fragrec["b_ion"] = fragrec["ion"] == "B"
    fragrec["y_ion"] = fragrec["ion"] == "Y"
    del fragrec["ion"]
    fragrec["index"] = np.arange(fragrec.shape[0])
    fragrec.set_index("index", inplace=True)
    return fragrec


def predict_rts(
    peprec,
    modifications=False
):
    """Predict a retention time for every peptide in ``peprec``.

    Parameters
    ----------
    peprec : pandas.DataFrame
        Must contain a ``"peptide"`` column, and a ``"modifications"`` column
        when ``modifications`` is true. It is not modified.
    modifications : bool
        If false, every peptide is submitted with modifications ``"-"``;
        otherwise ``peprec["modifications"]`` is used.

    Returns
    -------
    numpy.ndarray
        The values of the ``"rt"`` column that
        ``ms2pip.retention_time.RetentionTime.add_rt_predictions`` adds to
        the submitted frame, in the row order of ``peprec``.
    """
    print("Predicting rts")
    # Work on an explicit copy: columns are added here and the predictor
    # writes its result into the frame it receives.
    rt_peprec = peprec[["peptide"]].copy()
    rt_peprec["modifications"] = peprec["modifications"] if modifications else "-"
    # All peptides are submitted in a single call.
    ms2pip.retention_time.RetentionTime().add_rt_predictions(rt_peprec)
    return rt_peprec["rt"].values


def write_hdf_file(file_name, data):
    """Write every column of ``data`` as an lzf-compressed HDF5 dataset.

    The file ``file_name`` is opened in ``"w"`` mode, so existing content is
    replaced. Each column becomes one dataset named after the column. When
    h5py rejects a column with a ``TypeError``, the column is written again
    with ``h5py.string_dtype()`` as its dtype. Only ``data.columns`` is
    iterated; the index of ``data`` is not written.
    """
    with h5py.File(file_name, "w") as hdf_file:
        for column in data.columns:
            dataset_options = {"data": data[column], "compression": "lzf"}
            try:
                hdf_file.create_dataset(column, **dataset_options)
            except TypeError:
                # Fall back to a variable-length string dataset.
                hdf_file.create_dataset(
                    column,
                    dtype=h5py.string_dtype(),
                    **dataset_options
                )

In [12]:
# Input FASTA file and the three HDF output files all live in one directory;
# change `data_dir` to run the notebook on another machine.
data_dir = "/home/sander/Documents/Proteomics/compomics/ms2pip"
fasta_file_name = f"{data_dir}/ecoli.fasta"

protrec_file_name = f"{data_dir}/ecoli_proteins.hdf"
peprec_file_name = f"{data_dir}/ecoli_peptides.hdf"
fragrec_file_name = f"{data_dir}/ecoli_fragments.hdf"

# PTM names allowed per residue ("n" is the key used for the N-terminal entry).
variable_ptms = {
    "M": ["M_ox"],
    "n": ["n_ac"],
}

fixed_ptms = {
    "C": ["C_cam"]
}

# Mass shift per PTM name. Keys must match the names used in `variable_ptms`
# and `fixed_ptms`. This was previously written as a bare annotation
# (`ptm_masses: {...}`), which never bound the name.
# NOTE(review): the values look like average rather than monoisotopic
# masses - confirm which one MS2PIP expects.
ptm_masses = {
    "C_cam": 57.0513,
    "M_ox": 15.9994,
    "n_ac": 42.0367,
}

model = "HCD"

# MS2PIP configuration. The PTM entries are derived from `ptm_masses` and the
# model from `model`, so each value is defined in exactly one place.
ms2pip_params = {
    "ms2pip": {
        "model": model,
        "frag_error": 0,
        "ptm": [
            f'Oxidation,{ptm_masses["M_ox"]},opt,M',
            f'Carbamidomethyl,{ptm_masses["C_cam"]},opt,C',
            f'Acetyl,{ptm_masses["n_ac"]},opt,N-term'
        ],
        "sptm": [],
        "gptm": [],
    }
}

In [3]:
# Parse the FASTA file and digest every protein with the function defaults
# (trypsin, no missed cleavages, no decoys).
proteins, peptides = read_proteins_and_peptides_from_fasta(fasta_file_name)
# Tabulate proteins (one row per protein, ordered by FASTA position) and
# peptides (one row per distinct peptide sequence).
protrec = protein_dict_to_protrec(proteins)
peprec = peptide_dict_to_peprec(peptides)
# Retention-time prediction is currently disabled; uncomment the next line to
# add an "rt" column to peprec.
# peprec["rt"] = predict_rts(peprec)
# Predict fragment intensities for every peptide; this runs MS2PIP once per
# precursor charge and emits the progress counters captured in the output.
fragrec = predict_fragrec(peprec, ms2pip_params)

Reading /home/sander/Documents/Proteomics/compomics/ms2pip/ecoli.fasta
Creating protrec
Creating peprec
Predicting fragrec
(0)500 (1)500 (3)500 (4)500 (6)500 (5)500 (2)500 (7)500 (4)1000 (1)1000 (0)1000 (6)1000 (5)1000 (2)1000 (3)1000 (7)1000 (6)1500 (4)1500 (0)1500 (2)1500 (1)1500 (3)1500 (5)1500 (7)1500 (0)2000 (1)2000 (4)2000 (3)2000 (6)2000 (2)2000 (7)2000 (5)2000 (3)2500 (1)2500 (0)2500 (4)2500 (2)2500 (7)2500 (6)2500 (5)2500 (3)3000 (0)3000 (4)3000 (2)3000 (1)3000 (7)3000 (6)3000 (5)3000 (3)3500 (0)3500 (2)3500 (4)3500 (1)3500 (7)3500 (6)3500 (5)3500 (0)4000 (3)4000 (2)4000 (4)4000 (7)4000 (6)4000 (1)4000 (4)4500 (0)4500 (5)4000 (3)4500 (2)4500 (6)4500 (7)4500 (1)4500 (0)5000 (4)5000 (2)5000 (3)5000 (6)5000 (5)4500 (1)5000 (7)5000 (0)5500 (4)5500 (2)5500 (6)5500 (3)5500 (7)5500 (1)5500 (5)5000 (0)6000 (4)6000 (6)6000 (2)6000 (3)6000 (1)6000 (5)5500 (7)6000 (0)6500 (6)6500 (2)6500 (4)6500 (3)6500 (1)6500 (5)6000 (7)6500 (0)7000 (2)7000 (6)7000 (3)7000 (4)7000 (1)7000 (7)7000 (5)65

In [30]:
# Persist the three tables, one HDF file each. write_hdf_file opens the target
# in "w" mode, so existing files are overwritten, and it iterates only over
# data.columns, so the frame index itself is not stored.
write_hdf_file(protrec_file_name, protrec)
write_hdf_file(peprec_file_name, peprec)
write_hdf_file(fragrec_file_name, fragrec)