Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add suppot for crosslinking mass spectrometry experiments #28

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions mokapot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@
except DistributionNotFound:
pass

from .dataset import LinearPsmDataset
from .dataset import LinearPsmDataset, CrosslinkPsmDataset
from .model import Model, PercolatorModel, save_model, load_model
from .brew import brew
from .parsers.pin import read_pin, read_percolator
from .parsers.pepxml import read_pepxml
from .parsers.fasta import read_fasta, make_decoys, digest
from .writers import to_flashlfq, to_txt
from .confidence import LinearConfidence, plot_qvalues
from .confidence import (
LinearConfidence,
CrosslinkConfidence,
plot_qvalues,
)
208 changes: 168 additions & 40 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from . import qvalues
from . import utils
from .picked_protein import picked_protein
from .picked_protein import picked_protein, crosslink_picked_protein
from .writers import to_flashlfq, to_txt

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -195,6 +195,7 @@ class Confidence:
"proteins": "Proteins",
"csms": "Cross-Linked PSMs",
"peptide_pairs": "Peptide Pairs",
"protein_pairs": "Protein Pairs",
}

def __init__(self, psms, scores, desc):
Expand Down Expand Up @@ -272,6 +273,14 @@ def _perform_tdc(self, psm_columns):
)
self._data = self._data.loc[psm_idx, :]

def _num_accepted(self, level):
"""Calculate the number of accepted discoveries"""
disc = self.confidence_estimates[level]
if disc is not None:
return (disc["mokapot q-value"] <= self._eval_fdr).sum()
else:
return None

def plot_qvalues(self, level="psms", threshold=0.1, ax=None, **kwargs):
"""Plot the cumulative number of discoveries over range of q-values.

Expand Down Expand Up @@ -310,20 +319,20 @@ def plot_qvalues(self, level="psms", threshold=0.1, ax=None, **kwargs):
class LinearConfidence(Confidence):
"""Assign confidence estimates to a set of PSMs

Estimate q-values and posterior error probabilities (PEPs) for PSMs and
peptides when ranked by the provided scores.
Estimate q-values and posterior error probabilities (PEPs) for PSMs,
peptides and proteins when ranked by the provided scores.

Parameters
----------
psms : LinearPsmDataset object
psms : a LinearPsmDataset object
A collection of PSMs.
scores : np.ndarray
A vector containing the score of each PSM.
desc : bool
Are higher scores better?
eval_fdr : float
The FDR threshold at which to report performance. This parameter
has no affect on the analysis itself, only logging messages.
The FDR threshold at which to report performance. This parameter has no
affect on the analysis itself, only logging messages.

Attributes
----------
Expand Down Expand Up @@ -383,14 +392,6 @@ def __repr__(self):

return base

def _num_accepted(self, level):
"""Calculate the number of accepted discoveries"""
disc = self.confidence_estimates[level]
if disc is not None:
return (disc["mokapot q-value"] <= self._eval_fdr).sum()
else:
return None

def _assign_confidence(self, desc=True):
"""
Assign confidence to PSMs and peptides.
Expand Down Expand Up @@ -426,6 +427,7 @@ def _assign_confidence(self, desc=True):
data = data.sort_values(
self._score_column, ascending=False
).reset_index(drop=True)

scores = data.loc[:, self._score_column].values
targets = data.loc[:, self._target_column].astype(bool).values
if all(targets):
Expand Down Expand Up @@ -506,7 +508,7 @@ def to_flashlfq(self, out_file="mokapot.flashlfq.txt"):
return to_flashlfq(self, out_file)


class CrossLinkedConfidence(Confidence):
class CrosslinkConfidence(Confidence):
"""
Assign confidence estimates to a set of cross-linked PSMs

Expand All @@ -516,70 +518,196 @@ class CrossLinkedConfidence(Confidence):

Parameters
----------
psms : CrossLinkedPsmDataset object
csms : a CrosslinkPsmDataset object
A collection of cross-linked PSMs.
scores : np.ndarray
A vector containing the score of each PSM.
A vector containing the score of each CSM.
desc : bool
Are higher scores better?
eval_fdr : float
The FDR threshold at which to report performance. This parameter has no
affect on the analysis itself, only logging messages.

Attributes
----------
levels : list of str
csms : pandas.DataFrame
Confidence estimates for cross-linked PSMs in the dataset.
peptide_pairs : pandas.DataFrame
Confidence estimates for peptide pairs in the dataset.

:meta private:
protein_pairs : pandas.DataFrame
Confidence estimates for protein pairs in the dataset.
confidence_estimates : Dict[str, pandas.DataFrame]
A dictionary of confidence estimates at each level.
decoy_confidence_estimates : Dict[str, pandas.DataFrame]
A dictionary of confidence estimates for the decoys at each level.
"""

def __init__(self, psms, scores, desc=True):
def __init__(self, csms, scores, desc=True, eval_fdr=0.01):
"""Initialize a CrossLinkedConfidence object"""
super().__init__(psms, scores, desc)
self._data[len(self._data.columns)] = psms.targets
self._target_column = self._data.columns[-1]
self._psm_columns = psms._spectrum_columns
self._peptide_column = psms._peptide_column
super().__init__(csms, scores, desc)
self._target_columns = csms._target_columns
self._num_target_column = _new_column("target", self._data)
self._data[self._num_target_column] = csms.targets
self._csm_columns = csms._spectrum_columns
self._peptide_columns = csms._peptide_columns
self._combined_peptides_column = _new_column(
"combined_peptides", self._data
)
self._data[self._combined_peptides_column] = csms.combined_peptides
self._eval_fdr = eval_fdr

LOGGER.info("Performing target-decoy competition...")
LOGGER.info(
"Keeping the best match per %s columns...",
"+".join(self._csm_columns),
)

self._perform_tdc(self._csm_columns)
LOGGER.info("\t- Found %i CSMs from unique spectra.", len(self._data))

self._perform_tdc(self._psm_columns)
self._assign_confidence(desc=desc)

self.accepted = {}
for level in self.levels:
self.accepted[level] = self._num_accepted(level)

def __repr__(self):
"""How to print the class"""
base = (
"A mokapot.confidence.CrosslinkConfidence object:\n"
f"\t- CSMs at q<={self._eval_fdr:g}: {self.accepted['csms']}\n"
f"\t- Peptide pairs at q<={self._eval_fdr:g}: "
f"{self.accepted['peptide_pairs']}\n"
)

if self._has_proteins:
base += (
f"\t- Protein pairs at q<={self._eval_fdr:g}: "
f"{self.accepted['protein_pairs']}\n"
)

return base

def plot_qvalues(self, level="csms", threshold=0.1, ax=None, **kwargs):
"""
Plot the cumulative number of discoveries over range of q-values.

The available levels can be found using
:py:meth:`~mokapot.confidence.Confidence.levels` attribute.

Parameters
----------
level : str, optional
The level of q-values to report.
threshold : float, optional
Indicates the maximum q-value to plot.
ax : matplotlib.pyplot.Axes, optional
The matplotlib Axes on which to plot. If `None` the current
Axes instance is used.
**kwargs : dict, optional
Arguments passed to :py:func:`matplotlib.pyplot.plot`.

Returns
-------
matplotlib.pyplot.Axes
An :py:class:`matplotlib.axes.Axes` with the cumulative
number of accepted target PSMs or peptides.
"""
return super().plot_qvalues(level, threshold, ax, **kwargs)

def _assign_confidence(self, desc=True):
"""
Assign confidence to PSMs and peptides.
Assign confidence to CSMs and peptides.

Parameters
----------
desc : bool
Are higher scores better?
"""
levels = ["csms", "peptide_pairs"]
peptide_idx = utils.groupby_max(
self._data, self._peptide_columns, self._score_column
self._data, self._combined_peptides_column, self._score_column
)

peptides = self._data.loc[peptide_idx]
levels = ("csms", "peptide_pairs")
LOGGER.info("\t- Found %i unique peptide pairs.", len(peptides))

level_data = [self._data, peptides]

if self._has_proteins:
proteins = crosslink_picked_protein(
peptides,
self._target_columns,
self._num_target_column,
self._peptide_columns,
self._score_column,
self._proteins,
)
levels += ["protein_pairs"]
level_data += [proteins]
LOGGER.info("\t- Found %i unique protein pairs.", len(proteins))

for level, data in zip(levels, level_data):
data = data.sort_values(
self._score_column, ascending=False
).reset_index(drop=True)

for level, data in zip(levels, (self._data, peptides)):
scores = data.loc[:, self._score_column].values
targets = data.loc[:, self._target_column].astype(bool).values
targets = data.loc[:, self._num_target_column].values

if all(targets == 2):
LOGGER.warning(
"No decoy CSMs remain for confidence estimation. "
"Confidence estimates may be unreliable."
)

# Estimate q-values and assign to dataframe
LOGGER.info("Assigning q-values to %s...", level)
data["mokapot q-value"] = qvalues.crosslink_tdc(
scores, targets, desc
)

data = (
data.loc[targets, :]
.sort_values(self._score_column, ascending=(not desc))
.reset_index(drop=True)
.drop(self._target_column, axis=1)
.rename(columns={self._score_column: "mokapot score"})
# Make output tables pretty
data = data.drop(self._num_target_column, axis=1).rename(
columns={self._score_column: "mokapot score"}
)

_, pep = qvality.getQvaluesFromScores(
scores[targets == 2], scores[~targets]
# Set scores to be the correct sign again:
data["mokapot score"] = data["mokapot score"] * (desc * 2 - 1)

# Logging update on q-values
LOGGER.info(
"\t- Found %i %s with q<=%g",
(data.loc[targets, "mokapot q-value"] <= self._eval_fdr).sum(),
level,
self._eval_fdr,
)

# Calculate PEPs
LOGGER.info("Assiging PEPs to %s...", level)
try:
_, pep = qvality.getQvaluesFromScores(
scores[targets == 2],
scores[~(targets == 2)],
includeDecoys=True,
)
except SystemExit as msg:
if "no decoy hits available for PEP calculation" in str(msg):
pep = 0
else:
raise

level = level.lower()
data["mokapot PEP"] = pep
self._confidence_estimates[level] = data

targ_idx = targets == 2
self.confidence_estimates[level] = data.loc[targ_idx, :]
self.decoy_confidence_estimates[level] = data.loc[~targ_idx, :]

if "protein_pairs" not in self.confidence_estimates.keys():
self.confidence_estimates["protein_pairs"] = None
self.decoy_confidence_estimates["protein_pairs"] = None


# Functions -------------------------------------------------------------------
Expand Down
Loading