<a href="https://colab.research.google.com/github/yskuchi/wf_denoising/blob/master/denoising2D_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Waveform denoising 'denoising2D_tf2'
Author: Yusuke Uchiyama

A denoising convolutional autoencoder built with TensorFlow 2.x and
applied to **a set of** waveform data.  
See [Bitbucket repository](https://bitbucket.org/meg_ilc_tokyo/wf_denoising/src/master/) or 
[GitHub repository](https://github.com/yskuchi/wf_denoising)

Noise from data is added to MC signal data.
You need datasets of signal and noise, separately, in pickle format.

## Environment
As of 2021 Feb, tested with the following:

* Google Colab
* CPU, GPU, or TPU (experimental)
* Python 3.7
* TensorFlow 2.4.1
* Comet ML


Note: If you are running this in a Colab notebook, we recommend that you enable a free GPU by going to:
> Runtime   →   Change runtime type   →   Hardware Accelerator: GPU

## Setting

### Comet ML

In [None]:
! pip install typing-extensions==3.7.4 comet-ml
! pip install typing-extensions==3.7.4
#! [ ! -z "$COLAB_GPU" ] && pip install typing-extensions==3.7.4 comet-ml

In [None]:
# import comet_ml in the top of your file
from comet_ml import Experiment

### Other packages

In [None]:
import os, sys
import numpy as np
import pandas as pd
import json

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose
from tensorflow.keras.layers import Dense, Reshape, Flatten, Concatenate
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

print ('Python version: ' + str(sys.version_info))
print ('TensorFlow version: ' + str(tf.__version__))

### GPU
To use GPU on Google colab, specify GPU in runtime type. 

In [None]:
# check GPU
tf.test.gpu_device_name()
!echo $COLAB_GPU

### TPU
To use TPU on Google colab, it is not enough to specify TPU in runtime type.
See "Setup TPU".

In [None]:
# check TPU
# Show the COLAB_TPU_ADDR environment variable; the "Setup TPU" cell only
# configures a TPU strategy when this variable is set.
!echo $COLAB_TPU_ADDR

### Parameters

In [None]:
# Run options
load_weights = False  # False: train the model; True: load saved weights and history instead
plot_data = True  # False: select the non-interactive "Agg" backend and skip the final plt.show()

import matplotlib
if not plot_data:
    # The backend is selected before pyplot is imported.
    matplotlib.use("Agg") # this is necessary when using plt without display (batch)
import matplotlib.pyplot as plt

In [None]:
# What the network should output: the signal (True) or the noise (False)
extract_signal = True

# Number of channels (if 2, the two-end signals are treated as channels)
nchannels = 1

# Number of waveforms packed into one input sample (= height of the 2D data)
height = 16

# Base name used for the output files of this configuration
filename = f"denoising2D{height}_tf2" if extract_signal else f'noiseextraction2D{height}ch_tf2'

# Each waveform has 1024 sample points
npoints = 1024 # 256 # number of sample points to be used (= width of the 2D data)
scale = 1 #5
offset = 0 #0.05 # 50 mV

# Dataset files (pickle format)
signal_dataset_file = 'wf11600.pkl'
#noise_dataset_file  = 'wf328469.pkl' #2018
noise_dataset_file  = 'wf356990.pkl.bz2' #2020

#### Hyper-parameters

In [None]:
# Number of filters per convolution stage. build_model() uses entries 0-2 for
# the three encoder stages and entries 3-5 for the three decoder stages.
#nkernels = [64, 32, 32,   32, 32, 64] # default for nchannels = 1
#nkernels = [64, 64, 64,   64, 64, 64]
nkernels = [256, 128, 64,   64, 128, 256]

# Kernel width (along the sample-point axis) for each stage depth
#kernel_size = [5, 5, 5]
kernel_size = [3, 3, 3]

# basic hyper-parameters (used for compile() and fit())
params = dict(
    optimizer='adam',
    loss='mae', #'mse', #'binary_crossentropy',
    metrics=['mae', 'mse'],
    epochs=50, # 20,
    batch_size=512, #256,
)

# additional parameters (model structure and data description; logged to Comet)
params2 = dict(
    loss_type=params['loss'],
    conv_activation='relu',
    output_activation='linear', #'sigmoid',
    signal_dataset_file=signal_dataset_file,
    noise_dataset_file=noise_dataset_file,
    npoints=npoints,
    batch_size=params['batch_size'],
    scale=scale,
    offset=offset,
    nkernels=nkernels,
    skip_connection=[True, True, True], # skip connections for UNet-like structure
    nsublayers=2,
)

## Prepare datasets
On Google Colab, data is loaded via Google Drive.
The files are expected to be in `/content/drive/My Drive/ML/data`.

### Mount Google Drive

In [None]:
# Colab only: mount Google Drive so that datasets and results are reachable as files
from google.colab import drive
drive.mount('/content/drive')
data_dir = '/content/drive/My Drive/ML/data/'  # dataset pickle files are read from here
output_dir = '/content/drive/My Drive/ML/results/'  # history, weights and models are written here

### Load pickle files

In [None]:
# NOTE: unpickling can execute arbitrary code; only load dataset files you trust.
signal_path = data_dir + signal_dataset_file
noise_path = data_dir + noise_dataset_file
x_signal = pd.read_pickle(signal_path).to_numpy()
x_noise = pd.read_pickle(noise_path).to_numpy()

# Use the same number of signal and noise waveforms, rounded down to a whole
# number of 2D inputs (each input packs nchannels * height waveforms).
waveforms_per_input = nchannels * height
nsamples = min(len(x_signal), len(x_noise))
nsamples = (nsamples // waveforms_per_input) * waveforms_per_input

print(f'signal samples:{len(x_signal)}, noise samples:{len(x_noise)}, nsamples: {nsamples}')

In [None]:
# Decide which dataset is the training target (to be extracted) and which one
# is the contamination (to be removed); both are truncated to nsamples rows.
print('Extract signal' if extract_signal else 'Extract noise')
target_source, removed_source = (x_signal, x_noise) if extract_signal else (x_noise, x_signal)
x_tobe_removed = removed_source[0:nsamples]
x_tobe_extracted = target_source[0:nsamples]

### Shape data in appropriate format with adding noise

In [None]:
# Convert to float32 and keep only the last npoints samples of every waveform
# (rows are waveforms, columns are sample points).
x_tobe_removed = x_tobe_removed.astype('float32')[:, -npoints:]
x_tobe_extracted = x_tobe_extracted.astype('float32')[:, -npoints:]

# Network input: sum of the two components, built before any scaling
x_train_noisy = x_tobe_removed + x_tobe_extracted

# Adjust scale and offset of all waveforms in place: x -> (x + offset) * scale
for waveforms in (x_tobe_removed, x_tobe_extracted, x_train_noisy):
  waveforms *= scale
  waveforms += offset * scale

## Values in [0,1]
#x_tobe_extracted = np.clip(x_tobe_extracted, 0, 1);
#x_train_noisy = np.clip(x_train_noisy, 0, 1);

# To match the input shape for Conv2D with n channels:
# (ninputs, height, nchannels, npoints) -> (ninputs, height, npoints, nchannels)
ninputs = len(x_tobe_removed) // (nchannels * height)
packed_shape = (ninputs, height, nchannels, npoints)
x_tobe_removed = x_tobe_removed.reshape(packed_shape).transpose(0, 1, 3, 2)
x_tobe_extracted = x_tobe_extracted.reshape(packed_shape).transpose(0, 1, 3, 2)
x_train_noisy = x_train_noisy.reshape(packed_shape).transpose(0, 1, 3, 2)

print(x_tobe_removed.shape)

## Model

In [None]:
# Create the Comet ML experiment that records this run.
# The API key is a secret and must never be written into the notebook:
# comet_ml picks it up from the COMET_API_KEY environment variable or from a
# .comet.config file when no api_key argument is passed.
# NOTE(review): any API key that was ever committed with this notebook is
# public and should be revoked in the Comet account settings.
experiment = Experiment(workspace="yskuchi", project_name="wf_denoising")

In [None]:
# Record the run configuration (the params2 dictionary) with the Comet experiment
experiment.log_parameters(params2)

### Setup TPU
This part appears to depend on the TensorFlow version and may need to be changed for other versions.

In [None]:
# Configure a TPU distribution strategy when running on a Colab TPU runtime.
# `strategy` is only defined when the COLAB_TPU_ADDR environment variable is set.
if 'COLAB_TPU_ADDR' in os.environ:
  tpu_grpc_url = "grpc://"+os.environ["COLAB_TPU_ADDR"]
  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_grpc_url)
  tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) # for TF2.0, this line has to be added
  tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) # for TF2.0; "experimental" may be dropped from the name in the future
  strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)  # same here
  #model = tf.distribute.tpu.keras_to_tpu_model(model, strategy=strategy)

### Build model with functional API

In [None]:
def build_model():
  """Build and compile the 2D convolutional denoising autoencoder.

  The input has shape (height, npoints, nchannels). Every kernel, pooling
  window and up-sampling factor has size 1 along the height axis, so the
  layers act along the npoints axis only.

  Structure:
    * Encoder: three convolution stages, each followed by a (1, 2) max pooling.
    * Decoder: three convolution stages, each followed by a (1, 2) up-sampling
      and, if the matching entry of params2['skip_connection'] is true, a
      concatenation with the encoder stage output of the same resolution.
    * Output: convolution(s) with nchannels filters and the output activation.
  Every stage applies params2['nsublayers'] successive convolutions (at least one).

  Reads the module-level settings height, npoints, nchannels, kernel_size,
  params and params2. Prints the model summary as a side effect.

  Returns:
    The compiled Keras Model mapping noisy inputs to denoised outputs.
  """

  def conv_stack(tensor, filters, width, activation):
    """Apply the configured number of (1, width) 'same'-padded convolutions."""
    for _ in range(max(1, params2['nsublayers'])):
      tensor = Conv2D(filters, (1, width), padding='same', activation=activation)(tensor)
    return tensor

  nkernels = params2['nkernels']
  conv_activation = params2['conv_activation']
  skip_connection = params2['skip_connection']

  input_img = Input(shape=(height, npoints, nchannels))

  # Encoder: the npoints axis is halved after each stage
  conv1 = conv_stack(input_img, nkernels[0], kernel_size[0], conv_activation)
  pool1 = MaxPooling2D((1, 2), padding='same')(conv1)
  conv2 = conv_stack(pool1, nkernels[1], kernel_size[1], conv_activation)
  pool2 = MaxPooling2D((1, 2), padding='same')(conv2)
  conv3 = conv_stack(pool2, nkernels[2], kernel_size[2], conv_activation)
  encoded = MaxPooling2D((1, 2), padding='same')(conv3)

  # Decoder: the npoints axis is doubled after each stage; the kernel widths
  # mirror the encoder (kernel_size[2], [1], [0])
  conv4 = conv_stack(encoded, nkernels[3], kernel_size[2], conv_activation)
  up5 = UpSampling2D((1, 2))(conv4)
  if skip_connection[2]:
    up5 = Concatenate()([up5, conv3])
  conv5 = conv_stack(up5, nkernels[4], kernel_size[1], conv_activation)
  up6 = UpSampling2D((1, 2))(conv5)
  if skip_connection[1]:
    up6 = Concatenate()([up6, conv2])
  conv6 = conv_stack(up6, nkernels[5], kernel_size[0], conv_activation)
  up7 = UpSampling2D((1, 2))(conv6)
  if skip_connection[0]:
    up7 = Concatenate()([up7, conv1])

  # Output stage: back to nchannels feature maps with a fixed kernel width of 3
  decoded = conv_stack(up7, nchannels, 3, params2['output_activation'])

  autoencoder = Model(inputs=input_img, outputs=decoded)

  autoencoder.compile(optimizer=params['optimizer'], loss=params['loss'], metrics=params['metrics'])
  autoencoder.summary()
  return autoencoder

In [None]:
# Build the model inside the TPU distribution strategy when one was set up
# (the "Setup TPU" cell defines `strategy` only on a TPU runtime).
# An explicit existence check is used instead of catching NameError, so that a
# NameError raised inside build_model() is reported instead of silently
# triggering a second build outside the strategy scope.
if 'strategy' in globals():
  with strategy.scope():
    autoencoder = build_model()
else:
  autoencoder = build_model()

## Fit

On Google Colab, the results (trained model) are saved in Google Drive. The files are expected to be in `/content/drive/My Drive/ML/results`.

In [None]:
# Metric shown in the training-history plot; must be one of params['metrics']
plot_metric = 'mae'

# Training history: metric name -> list of per-epoch values
history = {}
if not load_weights:

    # Callback for model checkpoints
    # (it is created here but not passed to fit(); see the commented argument below)
    checkpoint = ModelCheckpoint(
        filepath = output_dir + filename + "-{epoch:02d}.h5",
        save_best_only=True,
        save_weights_only=False)

    # The training targets are the clean waveforms to be extracted
    hist = autoencoder.fit(x_train_noisy, x_tobe_extracted,
                           epochs=params['epochs'],
                           batch_size=params['batch_size'],
                           shuffle=True,
                           validation_split=0.1)#,
                           #callbacks=[checkpoint])


    # Save history
    with open(output_dir + filename + '_hist.json', 'w') as f:
        json.dump(hist.history, f)
    history = hist.history

    # Save the weights
    autoencoder.save_weights(output_dir + filename + '_weights.h5')
else:
    # Load weights
    autoencoder.load_weights(f'{output_dir}{filename}_weights.h5')

    # Load history
    with open(f'{output_dir}{filename}_hist.json', 'r') as f:
        history = json.load(f)

# Save the full model (architecture and weights, without the optimizer state)
autoencoder.save(output_dir + filename + '.h5', include_optimizer=False)

# Plot training history
#plt.plot(history['loss'], linewidth=3, label='train')
#plt.plot(history['val_loss'], linewidth=3, label='valid')
plt.plot(history[plot_metric], linewidth=3, label='train')
plt.plot(history['val_' + plot_metric], linewidth=3, label='valid')
plt.grid()
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss')
# The y-range follows the plotted metric. (params['metrics'] is a list, so
# comparing it with a string can never select the mse range.)
if plot_metric == 'mse':
  plt.ylim(1e-6 * scale, 0.1e-3 * scale) #mse
else:
  plt.ylim(0.5e-3 * scale, 0.5e-2 * scale) #mae
plt.show()

## Test

In [None]:
def _revert_scale(images):
  """Return a new array with the training scale and offset undone."""
  return (images - scale * offset) / scale


def _to_waveforms(images):
  """Unpack (ninputs, height, npoints, nchannels) images into rows of npoints samples."""
  return np.reshape(images.transpose(0, 1, 3, 2), (len(images) * height * nchannels, npoints))


# The whole training set is reused as the test set.
print(x_train_noisy.shape)

decoded_imgs = autoencoder.predict(x_train_noisy)
decoded_imgs = decoded_imgs[0:len(x_train_noisy)]

print(decoded_imgs.shape)

# Revert scale and offset.
# New arrays are created on purpose: slices such as x[0:] are views, so an
# in-place revert would also modify the training arrays and make this cell
# give different results each time it is re-run.
x_test_tobe_removed = _revert_scale(x_tobe_removed)
x_test_tobe_extracted = _revert_scale(x_tobe_extracted)
x_test_noisy = _revert_scale(x_train_noisy)
decoded_imgs = _revert_scale(decoded_imgs)
# Residual between the training target and the network output
x_subtracted = x_test_tobe_extracted - decoded_imgs

print(x_test_tobe_extracted.shape)
print(x_test_noisy.shape)

# Back to one waveform per row: (ninputs * height * nchannels, npoints)
x_test_tobe_removed = _to_waveforms(x_test_tobe_removed)
x_test_tobe_extracted = _to_waveforms(x_test_tobe_extracted)
x_test_noisy = _to_waveforms(x_test_noisy)
decoded_imgs = _to_waveforms(decoded_imgs)
x_subtracted = _to_waveforms(x_subtracted)

In [None]:
# Number of waveforms to display and index of the first one
n = 2
start = 16
fig = plt.figure(figsize=(20, 6 * n))
# Legend entry of the training target depends on what is being extracted
target_label = "signal" if extract_signal else "noise"
# One panel per waveform, stacked vertically
for panel, wf_index in enumerate(range(start, start + n), start=1):
  ax = fig.add_subplot(n, 1, panel)
  #ax.plot(x_test_tobe_removed[wf_index], label="original")
  ax.plot(x_test_noisy[wf_index], label="noisy", color='gray')
  ax.plot(x_test_tobe_extracted[wf_index], label=target_label, color='green')
  ax.plot(decoded_imgs[wf_index], label="decoded", color='magenta')
  #ax.plot(x_subtracted[wf_index], label="subtracted")
  ax.legend()

In [None]:
# Send this plot to comet
# (`fig` is the waveform comparison figure created in the previous cell)
experiment.log_figure(figure=fig)

In [None]:
# Close the Comet experiment; nothing is logged to it after this point
experiment.end()

In [None]:
# Display the figures only when plotting is enabled (plot_data is set in the run options)
if plot_data:
    plt.show()