Merge pull request #21 from LokiLuciferase/feature-deepnog-update
BUMP deepNOG to 1.1.0 (targeting PyPI) and update CI configurations
LokiLuciferase committed Mar 11, 2020
2 parents 582fe11 + c2ece98 commit 204f8b1
Showing 23 changed files with 328 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .appveyor.yml
@@ -51,7 +51,8 @@ install:
   # pip will build them from source using the MSVC compiler matching the
   # target Python version and architecture
   - "python -m pip install wheel pytest"
-  - "python -m pip install -r requirements/prod.txt"
+  - "python -m pip install -r requirements/win.txt -f https://download.pytorch.org/whl/torch_stable.html"
+  - "python -m pip install -r requirements/dev.txt"

 build_script:
   # Build the compiled extension
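Note: pip's `-f`/`--find-links` flag on the new install line adds PyTorch's wheel archive as an extra source, so the torch pins in requirements/win.txt can resolve to prebuilt Windows wheels; presumably these were not available from PyPI for Windows at the time, which would explain the extra URL.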
31 changes: 12 additions & 19 deletions LICENSE
@@ -1,22 +1,15 @@
-MIT License
+phenotrex
+Copyright (C) 2019-2020, Lukas Lüftinger

-Copyright (c) 2019, Lukas Lüftinger
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.

+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
Binary file added phenotrex/bin/prodigal.linux
Binary file added phenotrex/bin/prodigal.osx.10.9.5
3 changes: 3 additions & 0 deletions phenotrex/cli/predict.py
@@ -8,6 +8,9 @@
               required=False, help='Input genotype file.')
 @click.option('--classifier', required=True, type=click.Path(exists=True),
               help='Path of pickled classifier file.')
+@click.option('--min_proba', type=click.FloatRange(0.5, 1.0), default=0.5,
+              help='Class probability threshold for displaying predictions. '
+                   'Predictions below the threshold will be given as "N/A".')
 @click.option('--out_explain_per_sample', type=click.Path(dir_okay=False),
               help='Write SHAP explanations for each predicted sample to file (optional).')
 @click.option('--out_explain_summary', type=click.Path(dir_okay=False),
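Note: with this option in place, an invocation along the lines of `phenotrex predict --classifier my_model.pkl --min_proba 0.75 input.fasta` (file names hypothetical, subcommand inferred from the module path phenotrex/cli/predict.py) would print trait calls only where the class probability reaches 0.75 and "N/A" otherwise; the actual masking happens in phenotrex/ml/prediction.py below.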
9 changes: 6 additions & 3 deletions phenotrex/ml/cccv.py
@@ -124,7 +124,8 @@ def _completeness_cv(self, param, **kwargs) -> Dict[float, Dict[float, float]]:
         X_train, y_train, tn = get_x_y_tn(training_records)
         classifier.fit(X=X_train, y=y_train, **kwargs)

-        # initialize the resampler with the test_records only, so the samples are unknown to the classifier
+        # initialize the resampler with the test_records only,
+        # so the samples are unknown to the classifier
         resampler = TrainingRecordResampler(random_state=self.random_state, verb=False)
         resampler.fit(records=test_records)
         cv_scores = {}
@@ -147,7 +148,8 @@ def run(self, records: List[TrainingRecord]):
         """
         # TODO: run compress_vocabulary before?

-        self.logger.info("Begin completeness/contamination matrix crossvalidation on training data.")
+        self.logger.info(
+            "Begin completeness/contamination matrix crossvalidation on training data.")
         t1 = time()
         if self.n_jobs is None:
             cv_scores = map(self._completeness_cv, self._replicates(records, self.cv,
@@ -168,7 +170,8 @@ def run(self, records: List[TrainingRecord]):
         for comple in cv_scores_list[0].keys():
             mba[comple] = {}
             for conta in cv_scores_list[0][comple].keys():
-                single_result = [cv_scores_list[r][comple][conta] for r in range(self.n_replicates * self.cv)]
+                single_result = [cv_scores_list[r][comple][conta] for r in
+                                 range(self.n_replicates * self.cv)]
                 mean_over_fold_and_replicates = np.mean(single_result)
                 std_over_fold_and_replicates = np.std(single_result)
                 mba[comple][conta] = {"score_mean": mean_over_fold_and_replicates,
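Note: for readers new to completeness/contamination cross-validation: the held-out records are artificially degraded to simulate incomplete and contaminated genome assemblies before scoring the classifier on them. A minimal sketch of the resampling idea (an illustration with invented names, not the TrainingRecordResampler API):

    import random
    from typing import List

    def degrade(features: List[str], completeness: float, contamination: float,
                foreign_pool: List[str], rng: random.Random) -> List[str]:
        """Keep each feature with probability `completeness`, then add foreign
        features amounting to `contamination` of the original feature count."""
        kept = [f for f in features if rng.random() < completeness]
        kept += rng.choices(foreign_pool, k=int(len(features) * contamination))
        return kept

    rng = random.Random(42)
    genome = [f"COG{i:04d}" for i in range(2000)]      # toy genotype
    pool = [f"COG{i:04d}" for i in range(2000, 4000)]  # toy foreign features
    degraded = degrade(genome, completeness=0.8, contamination=0.1,
                       foreign_pool=pool, rng=rng)
    print(len(degraded))  # roughly 0.8 * 2000 + 200 features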
46 changes: 12 additions & 34 deletions phenotrex/ml/feature_select.py
@@ -17,17 +17,14 @@

 def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     """
-    Method to group features, that store redundant information, to avoid overfitting and speed up process (in some
-    cases). Might be replaced or complemented by a feature selection method in future versions.
-    Compressing vocabulary is optional, for the test dataset it took 30 seconds, while the time saved later on is not
-    significant.
+    Method to group features, that store redundant information,
+    to avoid overfitting and speed up process (in some cases).
+    Might be replaced or complemented by a feature selection method in future versions.

     :param records: a list of TrainingRecord objects.
     :param pipeline: the targeted pipeline where the vocabulary should be modified
     :return: nothing, sets the vocabulary for CountVectorizer step
     """

     X, y, tn = get_x_y_tn(records)  # we actually only need X
     vec = pipeline.named_steps["vec"]
     if not vec.vocabulary:
@@ -58,8 +55,10 @@ def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     pipeline.named_steps["vec"].fixed_vocabulary_ = True


-def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline, step: float = DEFAULT_STEP_SIZE,
-                                  n_features: int = None, random_state: np.random.RandomState = None):
+def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline,
+                                  step: float = DEFAULT_STEP_SIZE,
+                                  n_features: int = None,
+                                  random_state: np.random.RandomState = None):
     """
     Function to apply RFE to limit the vocabulary used by the CustomVectorizer, optional step.
@@ -101,40 +100,19 @@ def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipel
     support = selector.get_support()
     support = support.nonzero()[0]
     new_id = {support[x]: x for x in range(len(support))}
-    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if not new_id.get(i) is None}
+    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if
+                  not new_id.get(i) is None}
     size_after = selector.n_features_

     t2 = time()

-    logger.info(f"{size_after} features were selected of {original_size} using Recursive Feature Eliminiation"
-                f" in {np.round(t2 - t1, 2)} seconds.")
+    logger.info(
+        f"{size_after} features were selected of {original_size} using Recursive Feature Elimination"
+        f" in {np.round(t2 - t1, 2)} seconds.")

     # set vocabulary to vectorizer
     pipeline.named_steps["vec"].vocabulary = vocabulary
     pipeline.named_steps["vec"].vocabulary_ = vocabulary
     pipeline.named_steps["vec"].fixed_vocabulary_ = True

     return size_after


-def multiple_step_rfecv(records: List[TrainingRecord], pipeline: Pipeline, n_features: int, step=(0.01, 0.01, 0.01, ),
-                        random_state: np.random.RandomState = None):
-    """
-    Function to apply multiple steps-sizes of RFECV in series, currently not used. Strategy might be problematic,
-    no clear benefit. #TODO rethink or remove
-
-    :param records: Data used
-    :param pipeline: The base estimator used
-    :param n_features: Goal number of features
-    :param step: List of steps that should be applied
-    :param random_state: random state for deterministic results
-    :return:
-    """
-    # step = [0.0025]
-    for s in step:
-        size_after = recursive_feature_elimination(records, pipeline=pipeline, step=s, n_features=n_features,
-                                                   random_state=random_state)
-        if size_after == n_features:
-            break
-
-    return size_after
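Note: a rough, self-contained illustration of the RFE step on toy data using plain scikit-learn; in phenotrex the surviving feature indices are mapped back into the CustomVectorizer vocabulary, as the code above does.

    import numpy as np
    from sklearn.feature_selection import RFE
    from sklearn.svm import LinearSVC

    rng = np.random.RandomState(0)
    X = rng.rand(60, 30)            # 60 samples, 30 features
    y = rng.randint(0, 2, size=60)  # binary trait labels

    # drop 10% of the remaining features per iteration until 5 survive
    selector = RFE(LinearSVC(dual=False), n_features_to_select=5, step=0.1)
    selector.fit(X, y)

    # indices of surviving features, remapped to a compact vocabulary
    # (analogous to rebuilding the vectorizer vocabulary above)
    support = selector.get_support().nonzero()[0]
    new_vocab = {old_idx: new_idx for new_idx, old_idx in enumerate(support)}
    print(new_vocab)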
8 changes: 6 additions & 2 deletions phenotrex/ml/prediction.py
@@ -10,7 +10,7 @@
 from phenotrex.util.helpers import fail_missing_dependency as fastas_to_grs


-def predict(fasta_files=tuple(), genotype=None, classifier=None,
+def predict(fasta_files=tuple(), genotype=None, classifier=None, min_proba=0.5,
             out_explain_per_sample=None, out_explain_summary=None,
             shap_n_samples=None, n_max_explained_features=None, verb=False):
     """
@@ -71,4 +71,8 @@ def predict(fasta_files=tuple(), genotype=None, classifier=None,
     print(f"# Trait: {model.trait_name}")
     print("Identifier\tTrait present\tConfidence")
     for record, result, probability in zip(gr, preds, probas):
-        print(f"{record.identifier}\t{translate_output[result]}\t{str(round(probability[result], 4))}")
+        if probability[result] < min_proba:
+            result_disp = "N/A"
+        else:
+            result_disp = translate_output[result]
+        print(f"{record.identifier}\t{result_disp}\t{str(round(probability[result], 4))}")
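Note: since the trait prediction is binary (present/absent), the predicted class always has probability of at least 0.5, so the default min_proba=0.5 masks nothing; raising it to, say, 0.7 would turn a positive call with probabilities (0.62, 0.38) into "N/A". This matches the click.FloatRange(0.5, 1.0) lower bound on the option.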
20 changes: 10 additions & 10 deletions phenotrex/ml/shap_handler.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Union, List
+from typing import Tuple

 import pandas as pd
 import numpy as np
@@ -97,20 +97,20 @@ def _get_feature_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         try:
             X_agg = self._used_features.astype(float)
             shap_agg = self._used_shaps.astype(float)
-        except (ValueError, AttributeError) as e:
+        except (ValueError, AttributeError):
             raise RuntimeError('No explanations saved.')
         sample_names = self._sample_names
         return X_agg, shap_agg, sample_names

-    def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[
-        np.ndarray, np.ndarray, np.ndarray]:
+    def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         """
         Sort features by absolute magnitude of shap values,
         and return sorted features, shap values and feature names.

         :param sort_by_idx: if an index into the sample names is passed,
             sorting will be based only on this sample's SHAP values.
-        :return: Used features, used shaps, and the feature names all sorted by absolute magnitude of shap value.
+        :return: Used features, used shaps, and the feature names
+            all sorted by absolute magnitude of shap value.
         """
         X_agg, shap_agg, _ = self._get_feature_data()

@@ -125,12 +125,11 @@ def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[
         absshap = np.apply_along_axis(np.abs, feature_axis, shap_agg)
         if sort_by_idx is None:  # sort features by absolute change in shap over all classes and samples
             sort_criterion = np.apply_over_axes(np.sum, absshap, nonfeature_axes)
             feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1]
         else:  # sort features by absolute change in shap over all classes for given sample idx
             sort_criterion = np.sum(absshap[sort_by_idx, ...], axis=0)
             feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1]
-        return X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds], \
-            self._used_feature_names[feature_sort_inds]
+        return (X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds],
+                self._used_feature_names[feature_sort_inds])

     def plot_shap_force(self, sample_name: str, **kwargs):
         """
@@ -196,7 +195,6 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF
         :param sample_name:
         :param n_max_features:
-        :param nsamples:
         :return: a dataframe of the n_max_features most influential features,
             their value in the sample, and the associated
             SHAP value(s).
@@ -215,6 +213,7 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF
         shap_vals = [shap_agg_s[i, :n_max_features].round(5), ]
         sample_names = [sample_name] * len(fns)
         df_arrs = [sample_names, fns, feature_vals, *shap_vals]
+        df_arrs = [np.array(x) for x in df_arrs]
         df_labels = ['sample', 'feature', 'feature_value',
                      *[f'SHAP value ({x})' for x in self._class_names]][:len(df_arrs)]
         df = pd.DataFrame(df_arrs, index=df_labels).T
@@ -223,7 +222,8 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF

     def get_shap_summary(self, n_max_features: int = 50):
         """
-        Get summary of features for all predictions, sorted by average impact of feature on shap value.
+        Get summary of features for all predictions,
+        sorted by average impact of feature on shap value.

         :param n_max_features:
         :return: a DataFrame of most important SHAP values for samples in the given dataset.
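Note: the feature ordering above reduces to an argsort over summed absolute SHAP values. A toy sketch of that core step (illustration only, not the ShapHandler internals):

    import numpy as np

    rng = np.random.RandomState(0)
    shap_vals = rng.randn(4, 6)  # 4 samples x 6 features, single-class case
    feature_names = np.array([f"f{i}" for i in range(6)])

    # total absolute impact of each feature across all samples, descending
    impact = np.abs(shap_vals).sum(axis=0)
    order = np.argsort(impact)[::-1]

    print(feature_names[order])            # most influential feature first
    print(shap_vals[:, order][:, :3])      # SHAP values of the top three features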
39 changes: 27 additions & 12 deletions phenotrex/structure/records.py
@@ -1,15 +1,15 @@
 #
 # Created by Lukas Lüftinger on 2/5/19.
 #
-from typing import List
+from typing import List, Optional
 from dataclasses import dataclass

 """Data structures containing Genotype and Phenotype information."""


 @dataclass
 class GenotypeRecord:
-    """ TODO add docstring """
+    """
+    Genomic features of a sample referenced by `identifier`.
+    """
     identifier: str
     features: List[str]

@@ -19,7 +19,12 @@ def __repr__(self):

 @dataclass
 class PhenotypeRecord:
-    """ TODO add docstring """
+    """
+    Ground truth labels of sample `identifier`,
+    indicating presence/absence of trait `trait_name`:
+    - 0 if trait is absent
+    - 1 if trait is present
+    """
     identifier: str
     trait_name: str
     trait_sign: int
@@ -30,22 +35,32 @@ def __repr__(self):

 @dataclass
 class GroupRecord:
-    """ TODO add docstring """
+    """
+    Group label of sample `identifier`.
+    Notes
+    -----
+    Useful for leave-one-group-out cross-validation (LOGO-CV),
+    for example, to take taxonomy into account.
+    """
     identifier: str
-    group_name: str
-    group_id: int
+    group_name: Optional[str]
+    group_id: Optional[int]

     def __repr__(self):
         return f"{self.identifier} group({self.group_name})={self.group_id}"


 @dataclass
 class TrainingRecord(GenotypeRecord, PhenotypeRecord, GroupRecord):
-    """ TODO add docstring """
-    pass
+    """
+    Sample containing Genotype-, Phenotype- and GroupRecords,
+    suitable as machine learning input for a single observation.
+    """
     def __repr__(self):
         gr_repr = GenotypeRecord.__repr__(self).split(' ')[1]
         pr_repr = PhenotypeRecord.__repr__(self).split(' ')[1]
-        gro_repr = GroupRecord.__repr__(self).split(' ')[1]
+        if self.group_name is not None and self.group_id is not None:
+            gro_repr = GroupRecord.__repr__(self).split(' ')[1]
+        else:
+            gro_repr = ''
         return f"id={self.identifier} {' '.join([gr_repr, pr_repr, gro_repr])}"
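Note: a quick usage sketch of the updated dataclasses (identifier and feature names invented for illustration; keyword arguments sidestep the field ordering that dataclass multiple inheritance produces):

    from phenotrex.structure.records import TrainingRecord

    record = TrainingRecord(
        identifier='GCF_000005845.2',
        features=['COG0001', 'COG0002'],
        trait_name='trait_x',
        trait_sign=1,
        group_name=None,  # Optional now; __repr__ omits the group part if None
        group_id=None,
    )
    print(record)  # repr joins the genotype and phenotype parts, skips the group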
