Added decomposition classes for SVD and Hermitian ED.
Moved ICA to decomposition module.
Updated code related to ICA.

Signed-off-by: Timothy Click <tcthepoet@yahoo.com>
Timothy Click authored and Timothy Click committed Jan 8, 2019
1 parent a2ad640 commit 95be2b1
Showing 6 changed files with 619 additions and 32 deletions.
2 changes: 1 addition & 1 deletion src/fluctmatch/commands/cmd_sca.py
@@ -27,7 +27,7 @@
from sklearn.decomposition import PCA
from ..analysis.paramtable import ParamTable
from ..analysis import fluctsca
-from fluctmatch.fluctsca import ica
+from fluctmatch.decomposition import ica


@click.command(
52 changes: 52 additions & 0 deletions src/fluctmatch/decomposition/__init__.py
@@ -0,0 +1,52 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding: utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# fluctmatch --- https://github.com/tclick/python-fluctmatch
# Copyright (c) 2015-2017 The fluctmatch Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the New BSD license.
#
# Please cite your use of fluctmatch in published work:
#
# Timothy H. Click, Nixon Raj, and Jhih-Wei Chu.
# Calculation of Enzyme Fluctuograms from All-Atom Molecular Dynamics
# Simulation. Meth Enzymology. 578 (2016), 327-342,
# doi:10.1016/bs.mie.2016.05.024.
#
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals,
)

from future.utils import (
native_str,
raise_from,
with_metaclass,
)
from future.builtins import (
ascii,
bytes,
chr,
dict,
filter,
hex,
input,
map,
next,
oct,
open,
pow,
range,
round,
str,
super,
zip,
)

import numpy as np
import pandas as pd


148 changes: 148 additions & 0 deletions src/fluctmatch/decomposition/eigh.py
@@ -0,0 +1,148 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding: utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# fluctmatch --- https://github.com/tclick/python-fluctmatch
# Copyright (c) 2015-2017 The fluctmatch Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the New BSD license.
#
# Please cite your use of fluctmatch in published work:
#
# Timothy H. Click, Nixon Raj, and Jhih-Wei Chu.
# Calculation of Enzyme Fluctuograms from All-Atom Molecular Dynamics
# Simulation. Meth Enzymology. 578 (2016), 327-342,
# doi:10.1016/bs.mie.2016.05.024.
#

import numpy as np
from scipy import linalg

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array

class Eigh(BaseEstimator, TransformerMixin):
    """Hermitian eigenvalue decomposition.

    This transformer performs eigenvalue decomposition (ED) of
    Hermitian square matrices. The eigenvalues and eigenvectors are
    sorted in descending order of the eigenvalues.

    Parameters
    ----------
    copy : bool (default True)
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results;
        use fit_transform(X) instead.

    Attributes
    ----------
    components_ : array, shape (n_components, n_components)
        The transpose of the eigenvector matrix.
    explained_variance_ : array, shape (n_components,)
        Same as the eigenvalues.
    explained_variance_ratio_ : array, shape (n_components,)
        Fraction of the total variance explained by each of the
        selected components.
    eigenvalues_ : array, shape (n_components,)
        The eigenvalues corresponding to each of the selected components.

    Examples
    --------
    >>> import numpy as np
    >>> from fluctmatch.decomposition.eigh import Eigh
    >>> X = np.array([[2., 1.], [1., 2.]])
    >>> eigh = Eigh()
    >>> eigh.fit(X)
    Eigh(copy=True)
    >>> print(eigh.eigenvalues_)
    [3. 1.]
    >>> print(eigh.explained_variance_ratio_)
    [0.75 0.25]

    Notes
    -----
    Eigendecomposition suffers from a problem called "sign indeterminacy",
    which means the sign of the ``components_`` and the output from
    transform depend on the algorithm and random state. To work around
    this, fit instances of this class to data once, then keep the
    instance around to do transformations.
    """
    def __init__(self, copy=True):
        self.copy = copy

    def fit(self, X: np.ndarray, y=None) -> object:
        """Fit the model with X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self._fit(X)
        return self

    def transform(self, X: np.ndarray, y=None) -> np.ndarray:
        """Fit the model with X and return the sorted eigenvectors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : Ignored

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            The eigenvectors, one per column, sorted in descending
            order of their eigenvalues. Note that this method refits
            the model on X.
        """
        V = self._fit(X)
        return V
    def _fit(self, X: np.ndarray) -> np.ndarray:
        """Perform the Hermitian eigenvalue decomposition and sort the
        eigenpairs in descending order of the eigenvalues."""

        # check_array rejects sparse input by default, which is more
        # informative here than a failure inside the solver.
        X: np.ndarray = check_array(X, dtype=[np.float64, np.float32],
                                    ensure_2d=True, copy=self.copy)
        # scipy.sparse.linalg.eigsh (ARPACK) cannot return all n
        # eigenpairs of an n x n matrix, so use the dense Hermitian
        # solver scipy.linalg.eigh instead.
        L, V = linalg.eigh(X)
        idx: np.ndarray = np.argsort(L)[::-1]
        self.eigenvalues_: np.ndarray = L[idx].copy()
        self.explained_variance_: np.ndarray = L[idx].copy()
        total_eigv: float = L.sum()
        self.explained_variance_ratio_: np.ndarray = L[idx] / total_eigv
        V: np.ndarray = V[:, idx]
        self.components_ = V.T
        return V

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Transform X back to its original space.

        Returns an array X_original whose transform would be X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_components)
            New data.

        Returns
        -------
        X_original : array, shape (n_samples, n_features)
            Note that this is always a dense array.
        """
        X: np.ndarray = check_array(X)
        return np.dot(X * self.eigenvalues_, self.components_)
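For context, a minimal usage sketch of the new Eigh transformer (the matrix below is made up for illustration; fit_transform comes from sklearn's TransformerMixin):

import numpy as np
from fluctmatch.decomposition.eigh import Eigh

# A small symmetric (Hermitian) matrix standing in for a covariance matrix.
cov = np.array([[4.0, 2.0, 0.6],
                [2.0, 3.0, 0.4],
                [0.6, 0.4, 1.0]])

eigh = Eigh()
V = eigh.fit_transform(cov)            # eigenvectors, one per column
print(eigh.eigenvalues_)               # sorted in descending order
print(eigh.explained_variance_ratio_)  # sums to 1 for a positive-definite input

# V @ diag(eigenvalues) @ V.T reconstructs the original Hermitian matrix,
# which is exactly what inverse_transform computes for X = V.
assert np.allclose((V * eigh.eigenvalues_) @ V.T, cov)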
src/fluctmatch/decomposition/ica.py
@@ -32,16 +32,14 @@
as_float_array, check_array, check_is_fitted, check_random_state, FLOAT_DTYPES
)

-from fluctmatch.lib.center import Center2D

logger: logging.Logger = logging.getLogger(__name__)
logging.captureWarnings(True)


def _infomax(X, l_rate: float=None, weights: np.ndarray=None, block: float=None,
w_change: float=1e-12, anneal_deg: float=60., anneal_step: float=0.9,
extended: bool=True, n_subgauss: int=1, kurt_size: int=6000,
-             ext_blocks: int=1, max_iter:int =200,
+             ext_blocks: int=1, max_iter: int=200, whiten: bool=True,
random_state: np.random.RandomState=None, blowup: float=1e4,
blowup_fac: float=0.5, n_small_angle: int=20, use_bias: bool=True,
verbose: bool=None) -> np.ndarray:
@@ -149,7 +147,27 @@ def _infomax(X, l_rate: float=None, weights: np.ndarray=None, block: float=None,

# check data shape
n_samples, n_features = X.shape
-    n_features_square = n_features ** 2
+    n_features_square = np.square(n_features)

+    if whiten:
+        # Center the columns (i.e., the variables)
+        X_mean = X.mean(axis=0)
+        X -= X_mean[np.newaxis, :]
+
+        # Whitening and preprocessing by PCA; see (6.33) p.140
+        u, d, _ = linalg.svd(X, full_matrices=False)
+        del _, d
+        # The columns of u are orthonormal, so rescaling them by
+        # sqrt(n_samples) makes the rows of X1 white (identity covariance).
+        X1 = u * np.sqrt(n_samples)
+        del u
+    else:
+        # X must be cast to floats to avoid typing issues with numpy
+        # 2.0 and the line below
+        X1 = as_float_array(X, copy=False)  # copy has been taken care of

# check input parameters
# heuristic default - may need adjustment for large or tiny data sets
@@ -204,7 +222,7 @@ def _infomax(X, l_rate: float=None, weights: np.ndarray=None, block: float=None,
# ICA training block
# loop across block samples
for t in range(0, lastt, block):
-            u = np.dot(X[permute[t:t + block], :], weights)
+            u = np.dot(X1[permute[t:t + block], :], weights)
u += np.dot(bias, onesrow).T

if extended:
@@ -581,33 +599,19 @@ def fit(self, data: np.ndarray):
self.n_samples, n_features = data.shape
random_state = check_random_state(self.random_state)

-        pca = PCA(n_components=self.n_components, whiten=True,
-                  copy=True, svd_solver='full')

# take care of ICA
if self.method == 'fastica':
-            ica = FastICA(whiten=False, random_state=random_state,
+            ica = FastICA(whiten=self.whiten, random_state=random_state,
**self.fit_params)
-            pipe_data = [pca, ica] if self.whiten else [ica,]
-            pipeline = make_pipeline(*pipe_data)
-            pipeline.fit(data)
+            ica.fit(data)
self.components_ = ica.components_
self.mixing_ = ica.mixing_
elif self.method in ('infomax', 'extended-infomax'):
-            if self.whiten:
-                data = pca.fit_transform(data)
self.components_ = _infomax(data, random_state=random_state,
+                                        whiten=self.whiten,
**self.fit_params)[:self.n_components]
-            if self.whiten:
-                self.components_ /= np.sqrt(pca.explained_variance_)[None, :]  # whitening
self.mixing_ = linalg.pinv(self.components_)

-        # the things to store for PCA
-        if self.whiten:
-            self.mean_ = pca.mean_
-            self.pca_components_ = pca.components_
-            self.pca_explained_variance_ = pca.explained_variance_

return self

def transform(self, data: np.ndarray, copy: bool=True) -> np.ndarray:
@@ -616,11 +620,6 @@ def transform(self, data: np.ndarray, copy: bool=True) -> np.ndarray:

data = check_array(data, copy=copy, dtype=FLOAT_DTYPES)

-        # Apply first PCA
-        if self.whiten:
-            data -= self.mean_
-            data = np.dot(data, self.pca_components_.T)

# Apply unmixing to low dimension PCA
sources = np.dot(data, self.components_.T)
return sources
@@ -630,9 +629,6 @@ def inverse_transform(self, data: np.ndarray, copy: bool=True) -> np.ndarray:

data: np.ndarray = check_array(data, copy=copy, dtype=FLOAT_DTYPES)
data = np.dot(data, self.mixing_.T)
-        if self.whiten:
-            data = np.dot(data, self.pca_components_)
-            data += self.mean_
return data

def copy(self) -> object:
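For reference, a standalone sketch of the SVD-based whitening that this commit moves into `_infomax` (random toy data made up for this example, not from the package): after centering the variables, the rescaled left singular vectors have identity covariance.

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(500, 4) @ rng.randn(4, 4)  # correlated toy data, shape (n_samples, n_features)

X = X - X.mean(axis=0)                   # center the columns (the variables)

# PCA whitening via SVD: with X = u @ np.diag(d) @ vt, the columns of u are
# orthonormal, so X1 = u * sqrt(n_samples) has sample covariance equal to
# the identity matrix.
u, d, vt = np.linalg.svd(X, full_matrices=False)
X1 = u * np.sqrt(X.shape[0])

assert np.allclose(X1.T @ X1 / X.shape[0], np.eye(4))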
