In [1]:
import os
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
import tensorly as tl
import numpy as np

In [2]:
MVBS_path = '/Users/wu-jung/code_git/ooi_sonar/zplsc_data_2015fall/nc_MVBS_envFromFile/'
MVBS_path = '../data/reproduced_MVBS_files/'
MVBS_file = '20150817-20151017_MVBS_time_from_Sv_rangeBin5_all.nc'

In [3]:
MVBS = xr.open_dataset(os.path.join(MVBS_path, MVBS_file))
MVBS

In [4]:
#MVBS_PCP_path = '/Users/wu-jung/code_git/ooi_sonar/zplsc_data_2015fall/nc_PCP_envFromFile/'
MVBS_PCP_path = '../data/reproduced_MVBS_files/'
MVBS_rpca_file = '20150817-20151017_MVBS_time_from_Sv_rangeBin5_rpca.nc'

In [5]:
MVBS_rpca = xr.open_dataset(os.path.join(MVBS_PCP_path, MVBS_rpca_file))
MVBS_rpca

In [6]:
low_rank = MVBS_rpca['low_rank']
low_rank.shape

(62, 3, 37, 144)

In [7]:
#low_rank = low_rank.sel(frequency=[38000])

In [8]:
n_observations, n_frequencies, n_depth_levels, n_pins = low_rank.shape

In [9]:
low_rank = low_rank.values.reshape([n_observations,-1])
low_rank.shape

(62, 15984)

In [10]:
low_rank_nonneg = low_rank - low_rank.min()

In [11]:
low_rank_nonneg_scaled = (low_rank_nonneg/np.std(low_rank_nonneg.T, axis=1)).T

## Classic NMF

In [None]:
model = NMF(n_components=3, init='random', random_state=0)

In [None]:
H = model.fit_transform(low_rank_nonneg_scaled.T)
W = model.components_

In [None]:
(W.shape, H.shape)

In [None]:
hlines = plt.plot(H)

In [None]:
W_reorg = W.reshape(3,37,144)

In [None]:
fig, ax = plt.subplots(1,3,figsize=(15,6))
for icomp in range(3):
    ax[icomp].imshow(tl.unfold(tl.tensor(W_reorg[icomp,:,:].squeeze()),mode=1).T,
                     aspect='auto')

In [None]:
fig, ax = plt.subplots(1,3,figsize=(15,6))
for icomp in range(3):
    ax[icomp].imshow(tl.unfold(tl.tensor(W_reorg[icomp,:,:,:].squeeze()),mode=2).T,
                     aspect='auto')

In [None]:
model_scaled = NMF(n_components=3, init='random', random_state=0)
H = model_scaled.fit_transform(low_rank_nonneg)
W = model_scaled.components_
W_reorg = W.reshape(3,3,37,144)

In [None]:
hlines = plt.plot(H)
fig, ax = plt.subplots(1,3,figsize=(15,6))
for icomp in range(3):
    ax[icomp].imshow(tl.unfold(tl.tensor(W_reorg[icomp,:,:,:].squeeze()),mode=2).T,
                     aspect='auto')

## Check similarity between days

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Normalize the activation coefficients
k = H.T
k_norm = k.T-k.min(axis=1)
k_norm = k_norm/k_norm.max(axis=0)
D = pdist(k_norm, 'euclidean')
D_square = squareform(D)
similarity_m = 1-D_square/D_square.max()

# Check similarity between any two days within the observation period
fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111)
plt.imshow(similarity_m,cmap='RdYlBu_r')
plt.xticks(np.arange(0,62,10),fontsize=14)
plt.yticks(np.arange(0,62,10),fontsize=14)
plt.xlabel('Day',fontsize=16)
plt.ylabel('Day',fontsize=16)

cbaxes = fig.add_axes([0.8, 0.125, 0.03, 0.755]) 
cbar = plt.colorbar(cax = cbaxes)  
cbar.ax.tick_params(labelsize=14) 
cbar.ax.set_ylabel('Similarity', rotation=90, fontsize=16)
plt.show()


## Check reconstruction error

In [None]:
recon = (W.T@H.T).T

In [None]:
recon_da = xr.DataArray(np.moveaxis(recon.reshape([62, 3, 37, 144]),[0,1,2],[2,0,1]).reshape([3,37,-1]),
                        coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
rpca_da = xr.DataArray(np.moveaxis(MVBS_rpca['low_rank'].values,[0,1,2],[2,0,1]).reshape([3,37,-1])-
                       MVBS_rpca['low_rank'].values.min(),
                       coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20,6), sharex=True)
for ifreq, freq in enumerate([38000,120000,200000]):
    (recon_da-rpca_da).sel(frequency=freq).plot(ax=ax[ifreq], yincrease=False)

# Smooth NMF

Next we run the smooth NMF which imposes smoothness (in time) on the activations by adding a Tikhonov regularization term on the gradient of $H$.

In [None]:
# perform this step once to install the ssnmf package
!pip install --upgrade git+https://github.com/valentina-s/ss-nmf.git

In [14]:
!pip install --upgrade ~/projects/ss-nmf

Processing /Users/valentina/projects/ss-nmf
Building wheels for collected packages: ss-nmf
  Building wheel for ss-nmf (setup.py) ... [?25ldone
[?25h  Created wheel for ss-nmf: filename=ss_nmf-VERSION-cp37-none-any.whl size=8526378 sha256=bd7fa54bf5469b10da42975884bc93d4bf8176e430cb28731c0869a7e6e3206f
  Stored in directory: /Users/valentina/Library/Caches/pip/wheels/d7/7c/ba/b54f0f3eb5c7145fb79e786f902b02752012e20b2c512b1619
Successfully built ss-nmf
Installing collected packages: ss-nmf
  Found existing installation: ss-nmf VERSION
    Uninstalling ss-nmf-VERSION:
      Successfully uninstalled ss-nmf-VERSION
Successfully installed ss-nmf-VERSION


In [15]:
import ssnmf
model = ssnmf.smoothNMF(n_components=3, max_iter=200, smoothness=5000000)

In [16]:
!mkdir checkpoints

mkdir: checkpoints: File exists


In [28]:
%%time
model.fit(low_rank_nonneg.T, init='random', checkpoint_idx=range(200), checkpoint_dir='./checkpoints')

CPU times: user 5.49 s, sys: 1.85 s, total: 7.34 s
Wall time: 2.01 s


In [22]:
ls checkpoints

chkpt-2020-01-04-19:25:54.294395.db


In [25]:
import shelve
# note: do not use the .db extension when opening
chkpt_data = shelve.open('./checkpoints/chkpt-2020-01-04-19:25:54.294395')


In [24]:
# display H and W for iteration 0
print(chkpt_data['0'])

{'H': array([[0.74441527, 0.25883616, 0.29606357, 0.2618198 , 0.67483027,
        0.40053551, 0.1355557 , 0.48577188, 0.95044621, 0.39910981,
        0.48518362, 0.56872403, 0.7192917 , 0.63569631, 0.24062707,
        0.71286757, 0.36736188, 0.26045187, 0.33173395, 0.59913353,
        0.94599763, 0.59575765, 0.15840071, 0.77353418, 0.45169876,
        0.48229674, 0.18152523, 0.65520947, 0.27829216, 0.52834016,
        0.91931638, 0.65759791, 0.74590604, 0.96634514, 0.58748376,
        0.47264468, 0.50026384, 0.82162832, 0.25815672, 0.25719532,
        0.87404874, 0.45467883, 0.21234013, 0.67701353, 0.79115411,
        0.44803404, 0.14691335, 0.29308487, 0.73653791, 0.33428112,
        0.33738193, 0.12942535, 0.79623141, 0.67605352, 0.80261227,
        0.38149407, 0.79014231, 0.31167767, 0.48705158, 0.53753171,
        0.32835519, 0.9129617 ],
       [0.21066478, 0.90315011, 0.37459952, 0.46080242, 0.19263532,
        0.47659064, 0.27180467, 0.73546275, 0.33796336, 0.44953831,
        0

In [None]:
# set W and H
H = model.H.T
W = model.W.T

In [None]:
(W.shape, H.shape)

In [None]:
hlines = plt.plot(H)

In [None]:
low_rank_nonneg.shape

In [None]:
W_reorg = W.reshape(3,3,37,144)

In [None]:
fig, ax = plt.subplots(1,3,figsize=(15,6))
for icomp in range(3):
    ax[icomp].imshow(tl.unfold(tl.tensor(W_reorg[icomp,:,:,:].squeeze()),mode=2).T,
                     aspect='auto')

## Check similarity between days

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Normalize the activation coefficients
k = H.T
k_norm = k.T-k.min(axis=1)
k_norm = k_norm/k_norm.max(axis=0)
D = pdist(k_norm, 'euclidean')
D_square = squareform(D)
similarity_m = 1-D_square/D_square.max()

# Check similarity between any two days within the observation period
fig = plt.figure(figsize=(6,4))
ax = fig.add_subplot(111)
plt.imshow(similarity_m,cmap='RdYlBu_r')
plt.xticks(np.arange(0,62,10),fontsize=14)
plt.yticks(np.arange(0,62,10),fontsize=14)
plt.xlabel('Day',fontsize=16)
plt.ylabel('Day',fontsize=16)

cbaxes = fig.add_axes([0.8, 0.125, 0.03, 0.755]) 
cbar = plt.colorbar(cax = cbaxes)  
cbar.ax.tick_params(labelsize=14) 
cbar.ax.set_ylabel('Similarity', rotation=90, fontsize=16)
plt.show()

## Check reconstruction error

In [None]:
recon = (W.T@H.T).T


In [None]:
recon_da = xr.DataArray(np.moveaxis(recon.reshape([62, 3, 37, 144]),[0,1,2],[2,0,1]).reshape([3,37,-1]),
                        coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
rpca_da = xr.DataArray(np.moveaxis(MVBS_rpca['low_rank'].values,[0,1,2],[2,0,1]).reshape([3,37,-1])-
                       MVBS_rpca['low_rank'].values.min(),
                       coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20,6), sharex=True)
for ifreq, freq in enumerate([38000,120000,200000]):
    (recon_da-rpca_da).sel(frequency=freq).plot(ax=ax[ifreq], yincrease=False)