In [1]:
import os
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
import tensorly as tl
import numpy as np

In [2]:
# Directory (relative to this notebook) and file name of the MVBS NetCDF data.
# The original author's machine-specific copy lived under
# /Users/wu-jung/code_git/ooi_sonar/zplsc_data_2015fall/nc_MVBS_envFromFile/ ;
# that assignment was dead code (immediately overwritten) and is dropped here.
MVBS_path = '../data/reproduced_MVBS_files/'
MVBS_file = '20150817-20151017_MVBS_time_from_Sv_rangeBin5_all.nc'

In [3]:
# Open the MVBS dataset; the bare name on the last line renders its summary.
mvbs_full_path = os.path.join(MVBS_path, MVBS_file)
MVBS = xr.open_dataset(mvbs_full_path)
MVBS

In [4]:
# File name and directory (relative to this notebook) of the RPCA output.
# The author's local absolute path
# (/Users/wu-jung/code_git/ooi_sonar/zplsc_data_2015fall/nc_PCP_envFromFile/)
# is intentionally not used.
MVBS_rpca_file = '20150817-20151017_MVBS_time_from_Sv_rangeBin5_rpca.nc'
MVBS_PCP_path = '../data/reproduced_MVBS_files/'

In [5]:
# Open the RPCA result dataset and display its summary.
rpca_full_path = os.path.join(MVBS_PCP_path, MVBS_rpca_file)
MVBS_rpca = xr.open_dataset(rpca_full_path)
MVBS_rpca

In [6]:
# Pull the 'low_rank' variable out of the RPCA dataset and show its 4-D shape.
low_rank = MVBS_rpca['low_rank']
low_rank.shape

(62, 3, 37, 144)

In [7]:
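# Optional: uncomment to restrict the analysis to the frequency=38000 channel.
# Later cells that assume three frequencies would then need matching changes.
#low_rank = low_rank.sel(frequency=[38000])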

In [8]:
# Unpack the four dimensions of low_rank (must run before low_rank is flattened).
# NOTE(review): `n_pins` presumably means the number of pings per observation — confirm.
n_observations, n_frequencies, n_depth_levels, n_pins = low_rank.shape

In [9]:
# Flatten each observation into a single feature vector:
# (n_observations, n_frequencies, n_depth_levels, n_pins) -> (n_observations, n_features).
# np.asarray accepts both the xarray DataArray defined earlier and the ndarray
# this cell produces, so re-running the cell no longer fails on `.values`.
low_rank = np.asarray(low_rank).reshape([n_observations, -1])
low_rank.shape

(62, 15984)

In [10]:
# Shift by the global minimum so that every entry is non-negative.
global_min = low_rank.min()
low_rank_nonneg = low_rank - global_min

In [11]:
# Standard deviation of every feature (column) across observations.
feature_std = np.std(low_rank_nonneg, axis=0)
# Broadcasting divides each column by its std; the result is stored transposed
# (features x observations), so consumers transpose it back before fitting.
low_rank_nonneg_scaled = (low_rank_nonneg / feature_std).T

## Classic NMF

In [None]:
# Three-component NMF with random initialization; the fixed seed keeps
# the factorization reproducible between runs.
model = NMF(
    n_components=3,
    init='random',
    random_state=0,
)

In [None]:
# Transpose the scaled data back to observations x features before fitting.
nmf_input = low_rank_nonneg_scaled.T
# H: per-observation activations returned by the fit; W: the learned components.
H = model.fit_transform(nmf_input)
W = model.components_

In [None]:
# Sanity check: shapes of the components and of the activations
W.shape, H.shape

In [None]:
# Plot each column of H against the row index (one line per component);
# assigning the result keeps the list of Line2D objects out of the cell output.
hlines = plt.plot(H)

In [None]:
# Un-flatten every component back to (frequency, depth, ping).
# The model was fit on all frequencies flattened together, so each row of W has
# n_frequencies * n_depth_levels * n_pins entries; the previous reshape(3,37,144)
# dropped the frequency axis and raised ValueError.
W_reorg = W.reshape(W.shape[0], n_frequencies, n_depth_levels, n_pins)

In [None]:
# One panel per component: unfold the component along mode 1 and show the
# transposed matrix as an image.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 6))
for icomp, panel in enumerate(ax):
    component = tl.tensor(W_reorg[icomp, :, :].squeeze())
    unfolded = tl.unfold(component, mode=1)
    panel.imshow(unfolded.T, aspect='auto')

In [None]:
# One panel per component: unfold the 3-D component along mode 2 (its last
# axis) and show the transposed matrix as an image.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 6))
for icomp, panel in enumerate(ax):
    component = tl.tensor(W_reorg[icomp, :, :, :].squeeze())
    unfolded = tl.unfold(component, mode=2)
    panel.imshow(unfolded.T, aspect='auto')

In [None]:
# Second classic NMF, fitted on the shifted but *unscaled* data.
# NOTE(review): despite its name, `model_scaled` is fit on `low_rank_nonneg`,
# not on `low_rank_nonneg_scaled` — confirm the naming is intentional.
model_scaled = NMF(n_components=3, init='random', random_state=0)
H = model_scaled.fit_transform(low_rank_nonneg)
W = model_scaled.components_
# Un-flatten each component using the dimensions unpacked from the data
# rather than hard-coded literals, so the reshape follows the input's shape.
W_reorg = W.reshape(W.shape[0], n_frequencies, n_depth_levels, n_pins)

In [None]:
# Activations: one line per component, plotted against the row index of H.
hlines = plt.plot(H)

# Components: one image per component, unfolded along mode 2 and transposed.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 6))
for icomp, panel in enumerate(ax):
    component = tl.tensor(W_reorg[icomp, :, :, :].squeeze())
    panel.imshow(tl.unfold(component, mode=2).T, aspect='auto')

## Check similarity between days

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Min-max normalize the activation coefficients of each component so that
# every component spans [0, 1] across days.
k = H.T                                   # components x days
k_norm = k.T - k.min(axis=1)              # subtract each component's minimum
k_norm = k_norm / k_norm.max(axis=0)      # divide by each component's maximum

# Pairwise Euclidean distance between days, rescaled to a similarity in [0, 1]
D = pdist(k_norm, 'euclidean')
D_square = squareform(D)
similarity_m = 1 - D_square / D_square.max()

# Check similarity between any two days within the observation period.
# Tick positions follow the actual number of days instead of a hard-coded 62.
n_days = similarity_m.shape[0]
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(111)
im = ax.imshow(similarity_m, cmap='RdYlBu_r')
ax.set_xticks(np.arange(0, n_days, 10))
ax.set_yticks(np.arange(0, n_days, 10))
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Day', fontsize=16)
ax.set_ylabel('Day', fontsize=16)

# Dedicated axes for the colorbar; the image is passed explicitly rather than
# relying on pyplot's "current image" state.
cbaxes = fig.add_axes([0.8, 0.125, 0.03, 0.755])
cbar = fig.colorbar(im, cax=cbaxes)
cbar.ax.tick_params(labelsize=14)
cbar.ax.set_ylabel('Similarity', rotation=90, fontsize=16)
plt.show()


## Check reconstruction error

In [None]:
# Reconstruct the data matrix from the factors; mathematically this is H @ W,
# giving one row per observation.
recon = (W.T@H.T).T

In [None]:
# Restore the original 4-D layout of the low-rank data (taken from the dataset
# itself instead of hard-coded literals), then move the observation axis next
# to the last axis and merge the two into a single ping_time axis.
recon_4d = recon.reshape(MVBS_rpca['low_rank'].shape)
recon_moved = np.moveaxis(recon_4d, [0, 1, 2], [2, 0, 1])
recon_da = xr.DataArray(recon_moved.reshape([recon_moved.shape[0], recon_moved.shape[1], -1]),
                        coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
# Same (frequency, depth, ping_time) layout for the RPCA low-rank data, shifted
# by its global minimum exactly like the matrix that was factorized.
# Dimensions are read from the array instead of being hard-coded.
low_rank_values = MVBS_rpca['low_rank'].values
low_rank_moved = np.moveaxis(low_rank_values, [0, 1, 2], [2, 0, 1])
rpca_da = xr.DataArray(low_rank_moved.reshape([low_rank_moved.shape[0], low_rank_moved.shape[1], -1])
                       - low_rank_values.min(),
                       coords=[('frequency', MVBS_rpca['frequency']),
                               ('depth', MVBS_rpca['depth']),
                               ('ping_time', MVBS['ping_time'])])

In [None]:
# Reconstruction error (reconstruction minus shifted low-rank data),
# one row of the figure per frequency, with depth increasing downward.
residual = recon_da - rpca_da
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 6), sharex=True)
for ifreq, freq in enumerate([38000, 120000, 200000]):
    residual.sel(frequency=freq).plot(ax=ax[ifreq], yincrease=False)

# Smooth NMF

Next, we run smooth NMF, which imposes smoothness (in time) on the activations by adding a Tikhonov regularization term on the gradient of $H$.

In [None]:
# perform this step once to install the ssnmf package
!pip install --upgrade git+https://github.com/valentina-s/ss-nmf.git

In [12]:
!pip install --upgrade ~/projects/ss-nmf

Processing /Users/valentina/projects/ss-nmf
Building wheels for collected packages: ss-nmf
  Building wheel for ss-nmf (setup.py) ... [?25ldone
[?25h  Created wheel for ss-nmf: filename=ss_nmf-VERSION-cp37-none-any.whl size=8525964 sha256=bd95759ca0906143812795ecdeaf4f6b5c3ca358598359d73f264dd522d7a262
  Stored in directory: /Users/valentina/Library/Caches/pip/wheels/d7/7c/ba/b54f0f3eb5c7145fb79e786f902b02752012e20b2c512b1619
Successfully built ss-nmf
Installing collected packages: ss-nmf
  Found existing installation: ss-nmf VERSION
    Uninstalling ss-nmf-VERSION:
      Successfully uninstalled ss-nmf-VERSION
Successfully installed ss-nmf-VERSION


In [13]:
import ssnmf

# Smooth NMF with three components and at most 200 iterations.
# NOTE(review): `smoothness` is presumably the weight of the temporal
# smoothness penalty on the activations — confirm against the ssnmf docs.
model = ssnmf.smoothNMF(
    n_components=3,
    max_iter=200,
    smoothness=5_000_000,
)

In [14]:
%%time
# Fit on the shifted data transposed to features x observations.
# checkpoint_idx=range(200) lists every iteration index up to max_iter=200;
# NOTE(review): presumably a checkpoint is saved at each listed iteration — confirm.
# NOTE(review): init='random' is used without a visible seed, so results may
# differ between runs — confirm whether ssnmf exposes a random_state.
model.fit(low_rank_nonneg.T, init='random', checkpoint_idx=range(200))

CPU times: user 5.6 s, sys: 1.76 s, total: 7.35 s
Wall time: 2.07 s


In [21]:
ls

InitializationComparison.ipynb
PCP_sPCP_low-rank_data_distribution.ipynb
Untitled.ipynb
apply_PCP_sPCP_to_MVBS.ipynb
calibration_with_without_comparison.ipynb
chkpt-2020-01-04 02:53:40.871915.db
chkpt-2020-01-04-03:26:29.218704.db
chkpt-2020-01-04-03:33:05.926934.db
chkpt-2020-01-04-03:34:25.711101.db
chkpt-2020-01-04-03:42:23.327772.db
chkpt.db
cophenetic_rank_estimation_classicNMF.ipynb
cophenetic_rank_estimation_smoothNMF.ipynb
cophenetic_smoothNMF
cophenetic_smoothNMF.pkl
load_PCP_for_NMF.ipynb
load_PCP_for_smoothNMF.ipynb
load_PCP_for_tensor.ipynb
mvbs_200sec_loop_through_SvClean_files.ipynb
mvbs_200sec_loop_through_Sv_file.ipynb
mvbs_5min_chunk_filesize.ipynb
mvbs_multiple_files.ipynb
mvbs_single_file.ipynb
reproduce_noise_removal.ipynb
smoothNMF-ConvergenceStudy.ipynb


In [18]:
import shelve

# Open one checkpoint shelf from the directory listing.
# NOTE: the file name embeds the timestamp of one particular run and must be
# updated when the model is re-fitted.
# flag='r' opens the shelf read-only, so a wrong or missing file name raises an
# error instead of silently creating a new, empty shelf on disk.
# The shelf stays open because the following cell reads from it; call
# chkpt_data.close() once it is no longer needed.
chkpt_data = shelve.open('chkpt-2020-01-04-03:42:23.327772', flag='r')

In [20]:
# Show the factors stored for iteration 0 (entries are looked up by the
# iteration number given as a string).
first_checkpoint = chkpt_data['0']
print(first_checkpoint)

{'H': array([[0.36521301, 0.65084793, 0.73185693, 0.73188908, 0.9184741 ,
        0.79906472, 0.53379266, 0.16141139, 0.8532697 , 0.82371047,
        0.64515619, 0.69941631, 0.15192833, 0.21893258, 0.19288519,
        0.23106233, 0.5417792 , 0.35152098, 0.15417955, 0.4131557 ,
        0.81515252, 0.89339197, 0.79894743, 0.81430134, 0.99588875,
        0.93004175, 0.69285669, 0.17740341, 0.88851605, 0.27905588,
        0.92067194, 0.51384212, 0.65934419, 0.67851418, 0.56320372,
        0.4799584 , 0.63368933, 0.57830266, 0.27568146, 0.71028589,
        0.59657755, 0.88118811, 0.67367585, 0.2120636 , 0.47232588,
        0.7928573 , 0.41411867, 0.7903859 , 0.50327792, 0.50958114,
        0.51145969, 0.85526826, 0.28440952, 0.13935172, 0.63809407,
        0.06876922, 0.30033331, 0.3873784 , 0.36868469, 0.19252428,
        0.47946065, 0.49117439],
       [0.65719266, 0.05422065, 0.2941991 , 0.85636491, 0.32847586,
        0.54720803, 0.49760346, 0.61166733, 0.58331766, 0.72872993,
        0

In [None]:
# Take the fitted factors from the smooth NMF model, transposed so that they
# follow the same orientation as the H and W used in the classic NMF section.
H, W = model.H.T, model.W.T

In [None]:
# Sanity check: shapes of the smooth-NMF components and activations
W.shape, H.shape

In [None]:
# Plot each column of H against the row index (one line per component);
# assigning the result keeps the list of Line2D objects out of the cell output.
hlines = plt.plot(H)

In [None]:
# Shape of the matrix that was factorized (before the transpose passed to fit)
np.shape(low_rank_nonneg)

In [None]:
# Un-flatten each component to (frequency, depth, ping) using the dimensions
# unpacked from the data rather than hard-coded literals.
W_reorg = W.reshape(W.shape[0], n_frequencies, n_depth_levels, n_pins)

In [None]:
# One image per smooth-NMF component: unfold along mode 2 and transpose.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 6))
for icomp, panel in enumerate(ax):
    component = tl.tensor(W_reorg[icomp, :, :, :].squeeze())
    unfolded = tl.unfold(component, mode=2)
    panel.imshow(unfolded.T, aspect='auto')

## Check similarity between days

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Min-max normalize the activation coefficients of each component so that
# every component spans [0, 1] across days.
k = H.T                                   # components x days
k_norm = k.T - k.min(axis=1)              # subtract each component's minimum
k_norm = k_norm / k_norm.max(axis=0)      # divide by each component's maximum

# Pairwise Euclidean distance between days, rescaled to a similarity in [0, 1]
D = pdist(k_norm, 'euclidean')
D_square = squareform(D)
similarity_m = 1 - D_square / D_square.max()

# Check similarity between any two days within the observation period.
# Tick positions follow the actual number of days instead of a hard-coded 62.
n_days = similarity_m.shape[0]
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(111)
im = ax.imshow(similarity_m, cmap='RdYlBu_r')
ax.set_xticks(np.arange(0, n_days, 10))
ax.set_yticks(np.arange(0, n_days, 10))
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Day', fontsize=16)
ax.set_ylabel('Day', fontsize=16)

# Dedicated axes for the colorbar; the image is passed explicitly rather than
# relying on pyplot's "current image" state.
cbaxes = fig.add_axes([0.8, 0.125, 0.03, 0.755])
cbar = fig.colorbar(im, cax=cbaxes)
cbar.ax.tick_params(labelsize=14)
cbar.ax.set_ylabel('Similarity', rotation=90, fontsize=16)
plt.show()

## Check reconstruction error

In [None]:
# Reconstruct the data matrix from the smooth-NMF factors; mathematically this
# is H @ W, giving one row per observation.
recon = (W.T@H.T).T


In [None]:
# Restore the original 4-D layout of the low-rank data (taken from the dataset
# itself instead of hard-coded literals), then move the observation axis next
# to the last axis and merge the two into a single ping_time axis.
recon_4d = recon.reshape(MVBS_rpca['low_rank'].shape)
recon_moved = np.moveaxis(recon_4d, [0, 1, 2], [2, 0, 1])
recon_da = xr.DataArray(recon_moved.reshape([recon_moved.shape[0], recon_moved.shape[1], -1]),
                        coords=[('frequency', MVBS_rpca['frequency']),
                                ('depth', MVBS_rpca['depth']),
                                ('ping_time', MVBS['ping_time'])])

In [None]:
# Same (frequency, depth, ping_time) layout for the RPCA low-rank data, shifted
# by its global minimum exactly like the matrix that was factorized.
# Dimensions are read from the array instead of being hard-coded.
low_rank_values = MVBS_rpca['low_rank'].values
low_rank_moved = np.moveaxis(low_rank_values, [0, 1, 2], [2, 0, 1])
rpca_da = xr.DataArray(low_rank_moved.reshape([low_rank_moved.shape[0], low_rank_moved.shape[1], -1])
                       - low_rank_values.min(),
                       coords=[('frequency', MVBS_rpca['frequency']),
                               ('depth', MVBS_rpca['depth']),
                               ('ping_time', MVBS['ping_time'])])

In [None]:
# Smooth-NMF reconstruction error (reconstruction minus shifted low-rank data),
# one row of the figure per frequency, with depth increasing downward.
residual = recon_da - rpca_da
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 6), sharex=True)
for ifreq, freq in enumerate([38000, 120000, 200000]):
    residual.sel(frequency=freq).plot(ax=ax[ifreq], yincrease=False)