Merge pull request #21 from LokiLuciferase/feature-deepnog-update
BUMP deepNOG to 1.1.0 (targeting PyPI) and update CI configurations
LokiLuciferase committed Mar 11, 2020
2 parents 582fe11 + c2ece98 commit 204f8b1
Showing 23 changed files with 328 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .appveyor.yml
@@ -51,7 +51,8 @@ install:
   # pip will build them from source using the MSVC compiler matching the
   # target Python version and architecture
   - "python -m pip install wheel pytest"
-  - "python -m pip install -r requirements/prod.txt"
+  - "python -m pip install -r requirements/win.txt -f https://download.pytorch.org/whl/torch_stable.html"
+  - "python -m pip install -r requirements/dev.txt"

 build_script:
   # Build the compiled extension
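Note: pip's `-f`/`--find-links` flag on the new install line adds PyTorch's wheel archive as an extra source, so the torch pins in requirements/win.txt can resolve to prebuilt Windows wheels; presumably these were not available from PyPI for Windows at the time, which would explain the extra URL.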
31 changes: 12 additions & 19 deletions LICENSE
@@ -1,22 +1,15 @@
-MIT License
+phenotrex
+Copyright (C) 2019-2020, Lukas Lüftinger

-Copyright (c) 2019, Lukas Lüftinger
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.

+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
Binary file added phenotrex/bin/prodigal.linux
Binary file added phenotrex/bin/prodigal.osx.10.9.5
3 changes: 3 additions & 0 deletions phenotrex/cli/predict.py
@@ -8,6 +8,9 @@
               required=False, help='Input genotype file.')
 @click.option('--classifier', required=True, type=click.Path(exists=True),
               help='Path of pickled classifier file.')
+@click.option('--min_proba', type=click.FloatRange(0.5, 1.0), default=0.5,
+              help='Class probability threshold for displaying predictions. '
+                   'Predictions below the threshold will be given as "N/A".')
 @click.option('--out_explain_per_sample', type=click.Path(dir_okay=False),
               help='Write SHAP explanations for each predicted sample to file (optional).')
 @click.option('--out_explain_summary', type=click.Path(dir_okay=False),
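Note: with this option in place, an invocation along the lines of `phenotrex predict --classifier my_model.pkl --min_proba 0.75 input.fasta` (file names hypothetical, subcommand inferred from the module path phenotrex/cli/predict.py) would print trait calls only where the class probability reaches 0.75 and "N/A" otherwise; the actual masking happens in phenotrex/ml/prediction.py below.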
9 changes: 6 additions & 3 deletions phenotrex/ml/cccv.py
@@ -124,7 +124,8 @@ def _completeness_cv(self, param, **kwargs) -> Dict[float, Dict[float, float]]:
         X_train, y_train, tn = get_x_y_tn(training_records)
         classifier.fit(X=X_train, y=y_train, **kwargs)

-        # initialize the resampler with the test_records only, so the samples are unknown to the classifier
+        # initialize the resampler with the test_records only,
+        # so the samples are unknown to the classifier
         resampler = TrainingRecordResampler(random_state=self.random_state, verb=False)
         resampler.fit(records=test_records)
         cv_scores = {}
@@ -147,7 +148,8 @@ def run(self, records: List[TrainingRecord]):
         """
         # TODO: run compress_vocabulary before?

-        self.logger.info("Begin completeness/contamination matrix crossvalidation on training data.")
+        self.logger.info(
+            "Begin completeness/contamination matrix crossvalidation on training data.")
         t1 = time()
         if self.n_jobs is None:
             cv_scores = map(self._completeness_cv, self._replicates(records, self.cv,
@@ -168,7 +170,8 @@ def run(self, records: List[TrainingRecord]):
         for comple in cv_scores_list[0].keys():
             mba[comple] = {}
             for conta in cv_scores_list[0][comple].keys():
-                single_result = [cv_scores_list[r][comple][conta] for r in range(self.n_replicates * self.cv)]
+                single_result = [cv_scores_list[r][comple][conta] for r in
+                                 range(self.n_replicates * self.cv)]
                 mean_over_fold_and_replicates = np.mean(single_result)
                 std_over_fold_and_replicates = np.std(single_result)
                 mba[comple][conta] = {"score_mean": mean_over_fold_and_replicates,
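Note: for readers new to completeness/contamination cross-validation: the held-out records are artificially degraded to simulate incomplete and contaminated genome assemblies before scoring the classifier on them. A minimal sketch of the resampling idea (an illustration with invented names, not the TrainingRecordResampler API):

    import random
    from typing import List

    def degrade(features: List[str], completeness: float, contamination: float,
                foreign_pool: List[str], rng: random.Random) -> List[str]:
        """Keep each feature with probability `completeness`, then add foreign
        features amounting to `contamination` of the original feature count."""
        kept = [f for f in features if rng.random() < completeness]
        kept += rng.choices(foreign_pool, k=int(len(features) * contamination))
        return kept

    rng = random.Random(42)
    genome = [f"COG{i:04d}" for i in range(2000)]      # toy genotype
    pool = [f"COG{i:04d}" for i in range(2000, 4000)]  # toy foreign features
    degraded = degrade(genome, completeness=0.8, contamination=0.1,
                       foreign_pool=pool, rng=rng)
    print(len(degraded))  # roughly 0.8 * 2000 + 200 features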
46 changes: 12 additions & 34 deletions phenotrex/ml/feature_select.py
@@ -17,17 +17,14 @@

 def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     """
-    Method to group features, that store redundant information, to avoid overfitting and speed up process (in some
-    cases). Might be replaced or complemented by a feature selection method in future versions.
-    Compressing vocabulary is optional, for the test dataset it took 30 seconds, while the time saved later on is not
-    significant.
+    Method to group features, that store redundant information,
+    to avoid overfitting and speed up process (in some cases).
+    Might be replaced or complemented by a feature selection method in future versions.

     :param records: a list of TrainingRecord objects.
     :param pipeline: the targeted pipeline where the vocabulary should be modified
     :return: nothing, sets the vocabulary for CountVectorizer step
     """

     X, y, tn = get_x_y_tn(records)  # we actually only need X
     vec = pipeline.named_steps["vec"]
     if not vec.vocabulary:
@@ -58,8 +55,10 @@ def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     pipeline.named_steps["vec"].fixed_vocabulary_ = True


-def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline, step: float = DEFAULT_STEP_SIZE,
-                                  n_features: int = None, random_state: np.random.RandomState = None):
+def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline,
+                                  step: float = DEFAULT_STEP_SIZE,
+                                  n_features: int = None,
+                                  random_state: np.random.RandomState = None):
     """
     Function to apply RFE to limit the vocabulary used by the CustomVectorizer, optional step.
@@ -101,40 +100,19 @@ def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipel
     support = selector.get_support()
     support = support.nonzero()[0]
     new_id = {support[x]: x for x in range(len(support))}
-    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if not new_id.get(i) is None}
+    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if
+                  not new_id.get(i) is None}
     size_after = selector.n_features_

     t2 = time()

-    logger.info(f"{size_after} features were selected of {original_size} using Recursive Feature Eliminiation"
-                f" in {np.round(t2 - t1, 2)} seconds.")
+    logger.info(
+        f"{size_after} features were selected of {original_size} using Recursive Feature Elimination"
+        f" in {np.round(t2 - t1, 2)} seconds.")

     # set vocabulary to vectorizer
     pipeline.named_steps["vec"].vocabulary = vocabulary
     pipeline.named_steps["vec"].vocabulary_ = vocabulary
     pipeline.named_steps["vec"].fixed_vocabulary_ = True

     return size_after


-def multiple_step_rfecv(records: List[TrainingRecord], pipeline: Pipeline, n_features: int, step=(0.01, 0.01, 0.01, ),
-                        random_state: np.random.RandomState = None):
-    """
-    Function to apply multiple steps-sizes of RFECV in series, currently not used. Strategy might be problematic,
-    no clear benefit. #TODO rethink or remove
-
-    :param records: Data used
-    :param pipeline: The base estimator used
-    :param n_features: Goal number of features
-    :param step: List of steps that should be applied
-    :param random_state: random state for deterministic results
-    :return:
-    """
-    # step = [0.0025]
-    for s in step:
-        size_after = recursive_feature_elimination(records, pipeline=pipeline, step=s, n_features=n_features,
-                                                   random_state=random_state)
-        if size_after == n_features:
-            break
-
-    return size_after
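Note: a rough, self-contained illustration of the RFE step on toy data using plain scikit-learn; in phenotrex the surviving feature indices are mapped back into the CustomVectorizer vocabulary, as the code above does.

    import numpy as np
    from sklearn.feature_selection import RFE
    from sklearn.svm import LinearSVC

    rng = np.random.RandomState(0)
    X = rng.rand(60, 30)            # 60 samples, 30 features
    y = rng.randint(0, 2, size=60)  # binary trait labels

    # drop 10% of the remaining features per iteration until 5 survive
    selector = RFE(LinearSVC(dual=False), n_features_to_select=5, step=0.1)
    selector.fit(X, y)

    # indices of surviving features, remapped to a compact vocabulary
    # (analogous to rebuilding the vectorizer vocabulary above)
    support = selector.get_support().nonzero()[0]
    new_vocab = {old_idx: new_idx for new_idx, old_idx in enumerate(support)}
    print(new_vocab)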
8 changes: 6 additions & 2 deletions phenotrex/ml/prediction.py
@@ -10,7 +10,7 @@
 from phenotrex.util.helpers import fail_missing_dependency as fastas_to_grs


-def predict(fasta_files=tuple(), genotype=None, classifier=None,
+def predict(fasta_files=tuple(), genotype=None, classifier=None, min_proba=0.5,
             out_explain_per_sample=None, out_explain_summary=None,
             shap_n_samples=None, n_max_explained_features=None, verb=False):
     """
@@ -71,4 +71,8 @@ def predict(fasta_files=tuple(), genotype=None, classifier=None,
     print(f"# Trait: {model.trait_name}")
     print("Identifier\tTrait present\tConfidence")
     for record, result, probability in zip(gr, preds, probas):
-        print(f"{record.identifier}\t{translate_output[result]}\t{str(round(probability[result], 4))}")
+        if probability[result] < min_proba:
+            result_disp = "N/A"
+        else:
+            result_disp = translate_output[result]
+        print(f"{record.identifier}\t{result_disp}\t{str(round(probability[result], 4))}")
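Note: since the trait prediction is binary (present/absent), the predicted class always has probability of at least 0.5, so the default min_proba=0.5 masks nothing; raising it to, say, 0.7 would turn a positive call with probabilities (0.62, 0.38) into "N/A". This matches the click.FloatRange(0.5, 1.0) lower bound on the option.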
20 changes: 10 additions & 10 deletions phenotrex/ml/shap_handler.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Union, List
+from typing import Tuple

 import pandas as pd
 import numpy as np
@@ -97,20 +97,20 @@ def _get_feature_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         try:
             X_agg = self._used_features.astype(float)
             shap_agg = self._used_shaps.astype(float)
-        except (ValueError, AttributeError) as e:
+        except (ValueError, AttributeError):
             raise RuntimeError('No explanations saved.')
         sample_names = self._sample_names
         return X_agg, shap_agg, sample_names

-    def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[
-        np.ndarray, np.ndarray, np.ndarray]:
+    def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         """
         Sort features by absolute magnitude of shap values,
         and return sorted features, shap values and feature names.

         :param sort_by_idx: if an index into the sample names is passed,
             sorting will be based only on this sample's SHAP values.
-        :return: Used features, used shaps, and the feature names all sorted by absolute magnitude of shap value.
+        :return: Used features, used shaps, and the feature names
+            all sorted by absolute magnitude of shap value.
         """
         X_agg, shap_agg, _ = self._get_feature_data()

@@ -125,12 +125,11 @@ def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[
         absshap = np.apply_along_axis(np.abs, feature_axis, shap_agg)
         if sort_by_idx is None:  # sort features by absolute change in shap over all classes and samples
             sort_criterion = np.apply_over_axes(np.sum, absshap, nonfeature_axes)
             feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1]
         else:  # sort features by absolute change in shap over all classes for given sample idx
             sort_criterion = np.sum(absshap[sort_by_idx, ...], axis=0)
             feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1]
-        return X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds], \
-            self._used_feature_names[feature_sort_inds]
+        return (X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds],
+                self._used_feature_names[feature_sort_inds])

     def plot_shap_force(self, sample_name: str, **kwargs):
         """
@@ -196,7 +195,6 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF
         :param sample_name:
         :param n_max_features:
-        :param nsamples:
         :return: a dataframe of the n_max_features most influential features,
             their value in the sample, and the associated
             SHAP value(s).
@@ -215,6 +213,7 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF
         shap_vals = [shap_agg_s[i, :n_max_features].round(5), ]
         sample_names = [sample_name] * len(fns)
         df_arrs = [sample_names, fns, feature_vals, *shap_vals]
+        df_arrs = [np.array(x) for x in df_arrs]
         df_labels = ['sample', 'feature', 'feature_value',
                      *[f'SHAP value ({x})' for x in self._class_names]][:len(df_arrs)]
         df = pd.DataFrame(df_arrs, index=df_labels).T
@@ -223,7 +222,8 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF

     def get_shap_summary(self, n_max_features: int = 50):
         """
-        Get summary of features for all predictions, sorted by average impact of feature on shap value.
+        Get summary of features for all predictions,
+        sorted by average impact of feature on shap value.

         :param n_max_features:
         :return: a DataFrame of most important SHAP values for samples in the given dataset.
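Note: the feature ordering above reduces to an argsort over summed absolute SHAP values. A toy sketch of that core step (illustration only, not the ShapHandler internals):

    import numpy as np

    rng = np.random.RandomState(0)
    shap_vals = rng.randn(4, 6)  # 4 samples x 6 features, single-class case
    feature_names = np.array([f"f{i}" for i in range(6)])

    # total absolute impact of each feature across all samples, descending
    impact = np.abs(shap_vals).sum(axis=0)
    order = np.argsort(impact)[::-1]

    print(feature_names[order])            # most influential feature first
    print(shap_vals[:, order][:, :3])      # SHAP values of the top three features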
39 changes: 27 additions & 12 deletions phenotrex/structure/records.py
@@ -1,15 +1,15 @@
 #
 # Created by Lukas Lüftinger on 2/5/19.
 #
-from typing import List
+from typing import List, Optional
 from dataclasses import dataclass

 """Data structures containing Genotype and Phenotype information."""


 @dataclass
 class GenotypeRecord:
-    """ TODO add docstring """
+    """
+    Genomic features of a sample referenced by `identifier`.
+    """
     identifier: str
     features: List[str]

@@ -19,7 +19,12 @@ def __repr__(self):

 @dataclass
 class PhenotypeRecord:
-    """ TODO add docstring """
+    """
+    Ground truth labels of sample `identifier`,
+    indicating presence/absence of trait `trait_name`:
+    - 0 if trait is absent
+    - 1 if trait is present
+    """
     identifier: str
     trait_name: str
     trait_sign: int
@@ -30,22 +35,32 @@ def __repr__(self):

 @dataclass
 class GroupRecord:
-    """ TODO add docstring """
+    """
+    Group label of sample `identifier`.
+    Notes
+    -----
+    Useful for leave-one-group-out cross-validation (LOGO-CV),
+    for example, to take taxonomy into account.
+    """
     identifier: str
-    group_name: str
-    group_id: int
+    group_name: Optional[str]
+    group_id: Optional[int]

     def __repr__(self):
         return f"{self.identifier} group({self.group_name})={self.group_id}"


 @dataclass
 class TrainingRecord(GenotypeRecord, PhenotypeRecord, GroupRecord):
-    """ TODO add docstring """
-    pass
+    """
+    Sample containing Genotype-, Phenotype- and GroupRecords,
+    suitable as machine learning input for a single observation.
+    """
     def __repr__(self):
         gr_repr = GenotypeRecord.__repr__(self).split(' ')[1]
         pr_repr = PhenotypeRecord.__repr__(self).split(' ')[1]
-        gro_repr = GroupRecord.__repr__(self).split(' ')[1]
+        if self.group_name is not None and self.group_id is not None:
+            gro_repr = GroupRecord.__repr__(self).split(' ')[1]
+        else:
+            gro_repr = ''
         return f"id={self.identifier} {' '.join([gr_repr, pr_repr, gro_repr])}"
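Note: a quick usage sketch of the updated dataclasses (identifier and feature names invented for illustration; keyword arguments sidestep the field ordering that dataclass multiple inheritance produces):

    from phenotrex.structure.records import TrainingRecord

    record = TrainingRecord(
        identifier='GCF_000005845.2',
        features=['COG0001', 'COG0002'],
        trait_name='trait_x',
        trait_sign=1,
        group_name=None,  # Optional now; __repr__ omits the group part if None
        group_id=None,
    )
    print(record)  # repr joins the genotype and phenotype parts, skips the group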
