Added important notice in docstring, increment version

ulfaslak · Mar 26, 2016 · 53ac6e4 · 53ac6e4
1 parent 9c6d606
commit 53ac6e4
Show file tree

Hide file tree

Showing 8 changed files with 333 additions and 4 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -1 +1,2 @@
 Made very slight changes, none technical, but releasing again with incremented version number.
+Added important notice in docstrings.
diff --git a/build/lib/py_pcha/PCHA.py b/build/lib/py_pcha/PCHA.py
@@ -0,0 +1,230 @@
+"""Principal Convex Hull Analysis (PCHA) / Archetypal Analysis."""
+
+from __future__ import division
+import numpy as np
+from scipy.sparse import csr_matrix
+from datetime import datetime as dt
+import time
+
+from furthest_sum import furthest_sum
+
+
+def PCHA(X, noc, I=None, U=None, delta=0, verbose=False, conv_crit=1E-6, maxiter=500):
+    """Return archetypes of dataset.
+
+    Parameters
+    ----------
+    X : numpy.2darray
+        Data matrix in which to find archetypes
+
+    noc : int
+        Number of archetypes to find
+
+    I : 1d-array
+        Entries of X to use for dictionary in C (optional)
+
+    U : 1d-array
+        Entries of X to model in S (optional)
+
+
+    Output
+    ------
+    XC : numpy.2darray
+        I x noc feature matrix (i.e. XC=X[:,I]*C forming the archetypes)
+
+    S : numpy.2darray
+        noc x length(U) matrix, S>=0 |S_j|_1=1
+
+    C : numpy.2darray
+        noc x length(U) matrix, S>=0 |S_j|_1=1
+
+    SSE : float
+        Sum of Squared Errors
+
+    varexlp : float
+        Percent variation explained by the model
+    """
+    def S_update(S, XCtX, CtXtXC, muS, SST, SSE, niter):
+        """Update S for one iteration of the algorithm."""
+        noc, J = S.shape
+        e = np.ones((noc, 1))
+        for k in range(niter):
+            SSE_old = SSE
+            g = (np.dot(CtXtXC, S) - XCtX) / (SST / J)
+            g = g - e * np.sum(g.A * S.A, axis=0)
+
+            S_old = S
+            while True:
+                S = (S_old - g * muS).clip(min=0)
+                S = S / np.dot(e, np.sum(S, axis=0))
+                SSt = S * S.T
+                SSE = SST - 2 * np.sum(XCtX.A * S.A) + np.sum(CtXtXC.A * SSt.A)
+                if SSE <= SSE_old * (1 + 1e-9):
+                    muS = muS * 1.2
+                    break
+                else:
+                    muS = muS / 2
+
+        return S, SSE, muS, SSt
+
+    def C_update(X, XSt, XC, SSt, C, delta, muC, mualpha, SST, SSE, niter=1):
+        """Update C for one iteration of the algorithm."""
+        J, nos = C.shape
+
+        if delta != 0:
+            alphaC = np.sum(C, axis=0).A[0]
+            C = np.dot(C, np.diag(1 / alphaC))
+
+        e = np.ones((J, 1))
+        XtXSt = np.dot(X.T, XSt)
+
+        for k in range(niter):
+
+            # Update C
+            SSE_old = SSE
+            g = (np.dot(X.T, np.dot(XC, SSt)) - XtXSt) / SST
+
+            if delta != 0:
+                g = np.dot(g, np.diag(alphaC))
+            g = g.A - e * np.sum(g.A * C.A, axis=0)
+
+            C_old = C
+            while True:
+                C = (C_old - muC * g).clip(min=0)
+                nC = np.sum(C, axis=0) + np.finfo(float).eps
+                C = np.dot(C, np.diag(1 / nC.A[0]))
+
+                if delta != 0:
+                    Ct = C * np.diag(alphaC)
+                else:
+                    Ct = C
+
+                XC = np.dot(X, Ct)
+                CtXtXC = np.dot(XC.T, XC)
+                SSE = SST - 2 * np.sum(XC.A * XSt.A) + np.sum(CtXtXC.A * SSt.A)
+
+                if SSE <= SSE_old * (1 + 1e-9):
+                    muC = muC * 1.2
+                    break
+                else:
+                    muC = muC / 2
+
+            # Update alphaC
+            SSE_old = SSE
+            if delta != 0:
+                g = (np.diag(CtXtXC * SSt).T / alphaC - np.sum(C.A * XtXSt.A)) / (SST * J)
+                alphaC_old = alphaC
+                while True:
+                    alphaC = alphaC_old - mualpha * g
+                    alphaC[alphaC < 1 - delta] = 1 - delta
+                    alphaC[alphaC > 1 + delta] = 1 + delta
+
+                    XCt = np.dot(XC, np.diag(alphaC / alphaC_old))
+                    CtXtXC = np.dot(XCt.T, XCt)
+                    SSE = SST - 2 * np.sum(XCt.A * XSt.A) + np.sum(CtXtXC.A * SSt.A)
+
+                    if SSE <= SSE_old * (1 + 1e-9):
+                        mualpha = mualpha * 1.2
+                        XC = XCt
+                        break
+                    else:
+                        mualpha = mualpha / 2
+
+        if delta != 0:
+            C = C * np.diag(alphaC)
+
+        return C, SSE, muC, mualpha, CtXtXC, XC
+
+    N, M = X.shape
+
+
+    if I is None:
+        I = range(M)
+    if U is None:
+        U = range(M)
+
+    SST = np.sum(X[:, U] * X[:, U])
+
+    # Initialize C
+    try:
+        i = furthest_sum(X[:, I], noc, [int(np.ceil(len(I) * np.random.rand()))])
+    except IndexError:
+        class InitializationException(Exception): pass
+        raise InitializationException("Initialization does not converge. Too few examples in dataset.")
+
+    j = range(noc)
+    C = csr_matrix((np.ones(len(i)), (i, j)), shape=(len(I), noc)).todense()
+
+    XC = np.dot(X[:, I], C)
+
+    muS, muC, mualpha = 1, 1, 1
+
+    # Initialise S
+    XCtX = np.dot(XC.T, X[:, U])
+    CtXtXC = np.dot(XC.T, XC)
+    S = -np.log(np.random.random((noc, len(U))))
+    S = S / np.dot(np.ones((noc, 1)), np.mat(np.sum(S, axis=0)))
+    SSt = np.dot(S, S.T)
+    SSE = SST - 2 * np.sum(XCtX.A * S.A) + np.sum(CtXtXC.A * SSt.A)
+    S, SSE, muS, SSt = S_update(S, XCtX, CtXtXC, muS, SST, SSE, 25)
+
+    # Set PCHA parameters
+    iter_ = 0
+    dSSE = np.inf
+    t1 = dt.now()
+    varexpl = (SST - SSE) / SST
+
+    if verbose:
+        print '\nPrincipal Convex Hull Analysis / Archetypal Analysis'
+        print 'A ' + str(noc) + ' component model will be fitted'
+        print 'To stop algorithm press control C\n'
+
+    dheader = '%10s | %10s | %10s | %10s | %10s | %10s | %10s | %10s' % ('Iteration', 'Expl. var.', 'Cost func.', 'Delta SSEf.', 'muC', 'mualpha', 'muS', ' Time(s)   ')
+    dline = '-----------+------------+------------+-------------+------------+------------+------------+------------+'
+
+    while np.abs(dSSE) >= conv_crit * np.abs(SSE) and iter_ < maxiter and varexpl < 0.9999:
+        if verbose and iter_ % 100 == 0:
+            print dline
+            print dheader
+            print dline
+        told = t1
+        iter_ += 1
+        SSE_old = SSE
+
+        # C (and alpha) update
+        XSt = np.dot(X[:, U], S.T)
+        C, SSE, muC, mualpha, CtXtXC, XC = C_update(
+            X[:, I], XSt, XC, SSt, C, delta, muC, mualpha, SST, SSE, 10
+        )
+
+        # S update
+        XCtX = np.dot(XC.T, X[:, U])
+        S, SSE, muS, SSt = S_update(
+            S, XCtX, CtXtXC, muS, SST, SSE, 10
+        )
+
+        # Evaluate and display iteration
+        dSSE = SSE_old - SSE
+        t1 = dt.now()
+        if iter_ % 1 == 0:
+            time.sleep(0.000001)
+            varexpl = (SST - SSE) / SST
+            if verbose:
+                print '%10.0f | %10.4f | %10.4e | %10.4e | %10.4e | %10.4e | %10.4e | %10.4f \n' % (iter_, varexpl, SSE, dSSE/np.abs(SSE), muC, mualpha, muS, (t1-told).seconds)
+
+    # Display final iteration
+    varexpl = (SST - SSE) / SST
+    if verbose:
+        print dline
+        print dline
+        print '%10.0f | %10.4f | %10.4e | %10.4e | %10.4e | %10.4e | %10.4e | %10.4f \n' % (iter_, varexpl, SSE, dSSE/np.abs(SSE), muC, mualpha, muS, (t1-told).seconds)
+
+    # Sort components according to importance
+    ind, vals = zip(
+        *sorted(enumerate(np.sum(S, axis=1)), key=lambda x: x[0], reverse=1)
+    )
+    S = S[ind, :]
+    C = C[:, ind]
+    XC = XC[:, ind]
+
+    return XC, S, C, SSE, varexpl
diff --git a/build/lib/py_pcha/__init__.py b/build/lib/py_pcha/__init__.py
@@ -0,0 +1,12 @@
+"""Principal Convex Hull Analysis (PCHA) / Archetypal Analysis.
+
+This module is a Python translation of a Matlab module. Avaiable at:
+http://www.mortenmorup.dk/MMhomepageUpdated_files/Page327.htm.
+
+Translated by: Ulf Aslak Jensen (contact: ulfjensen@gmail.com)
+Original author (Matmab): Morten Morup (contact: morten.morup@gmail.com).
+Source paper: 'M. Morup and L. K. Hansen, Archetypal Analysis for Data Mining,
+Neural Computing 2011'.
+"""
+
+__version__ = "0.1.2"
diff --git a/build/lib/py_pcha/furthest_sum.py b/build/lib/py_pcha/furthest_sum.py
@@ -0,0 +1,78 @@
+"""FurthestSum algorithm to efficiently generate initial seeds/archetypes."""
+
+from __future__ import division
+import numpy as np
+from numpy.matlib import repmat
+
+
+def furthest_sum(K, noc, i, exclude=[]):
+    """Furthest sum algorithm, to efficiently generat initial seed/archetypes.
+
+    Parameters
+    ----------
+    K : numpy 2d-array
+        Either a data matrix or a kernel matrix.
+
+    noc : int
+        Number of candidate archetypes to extract.
+
+    i : int
+        inital observation used for to generate the FurthestSum.
+
+    exclude : numpy.1darray
+        Entries in K that can not be used as candidates.
+
+    Output
+    ------
+    i : int
+        The extracted candidate archetypes
+    """
+    def max_ind_val(l):
+        return max(zip(range(len(l)), l), key=lambda x: x[1])
+
+    I, J = K.shape
+    index = np.array(range(J))
+    index[exclude] = 0
+    index[i] = -1
+    ind_t = i
+    sum_dist = np.zeros((1, J), np.complex128)
+
+    if J > noc * I:
+        Kt = K
+        Kt2 = np.sum(Kt**2, axis=0)
+        for k in range(1, noc + 11):
+            if k > noc - 1:
+                Kq = np.dot(Kt[:, i[0]], Kt)
+                sum_dist -= np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[i[0]])
+                index[i[0]] = i[0]
+                i = i[1:]
+            t = np.where(index != -1)[0]
+            Kq = np.dot(Kt[:, ind_t].T, Kt)
+            sum_dist += np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[ind_t])
+            ind, val = max_ind_val(sum_dist[:, t][0].real)
+            ind_t = t[ind]
+            i.append(ind_t)
+            index[ind_t] = -1
+    else:
+        if I != J or np.sum(K - K.T) != 0:  # Generate kernel if K not one
+            Kt = K
+            K = np.dot(Kt.T, Kt)
+            K = np.lib.scimath.sqrt(
+                repmat(np.diag(K), J, 1) - 2 * K + \
+                repmat(np.mat(np.diag(K)).T, 1, J)
+            )
+
+        Kt2 = np.diag(K)  # Horizontal
+        for k in range(1, noc + 11):
+            if k > noc - 1:
+                sum_dist -= np.lib.scimath.sqrt(Kt2 - 2 * K[i[0], :] + Kt2[i[0]])
+                index[i[0]] = i[0]
+                i = i[1:]
+            t = np.where(index != -1)[0]
+            sum_dist += np.lib.scimath.sqrt(Kt2 - 2 * K[ind_t, :] + Kt2[ind_t])
+            ind, val = max_ind_val(sum_dist[:, t][0].real)
+            ind_t = t[ind]
+            i.append(ind_t)
+            index[ind_t] = -1
+
+    return i
diff --git a/py_pcha/PCHA.py b/py_pcha/PCHA.py
@@ -12,10 +12,14 @@
 def PCHA(X, noc, I=None, U=None, delta=0, verbose=False, conv_crit=1E-6, maxiter=500):
     """Return archetypes of dataset.
 
+    Note: Commonly data is formatted to have shape (examples, dimensions).
+    This function takes input and returns output of the transposed shape,
+    (dimensions, examples).
+
     Parameters
     ----------
     X : numpy.2darray
-        Data matrix in which to find archetypes
+        Data matrix in which to find archetypes.
 
     noc : int
         Number of archetypes to find
@@ -151,7 +155,7 @@ def C_update(X, XSt, XC, SSt, C, delta, muC, mualpha, SST, SSE, niter=1):
     except IndexError:
         class InitializationException(Exception): pass
         raise InitializationException("Initialization does not converge. Too few examples in dataset.")
-        
+
     j = range(noc)
     C = csr_matrix((np.ones(len(i)), (i, j)), shape=(len(I), noc)).todense()
 

diff --git a/py_pcha/__init__.py b/py_pcha/__init__.py
@@ -9,4 +9,4 @@
 Neural Computing 2011'.
 """
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
diff --git a/py_pcha/furthest_sum.py b/py_pcha/furthest_sum.py
@@ -8,6 +8,10 @@
 def furthest_sum(K, noc, i, exclude=[]):
     """Furthest sum algorithm, to efficiently generat initial seed/archetypes.
 
+    Note: Commonly data is formatted to have shape (examples, dimensions).
+    This function takes input and returns output of the transposed shape,
+    (dimensions, examples).
+    
     Parameters
     ----------
     K : numpy 2d-array

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='py_pcha',
-    version='0.1.1',
+    version='0.1.2',
     description='Python implemenation of PCHA algorithm',
     url='https://github.com/ulfaslak/py_pcha',
     download_url = 'https://github.com/ulfaslak/py_pcha/tarball/0.1',