<a href="https://colab.research.google.com/github/yskuchi/wf_denoising/blob/master/denoisingUNet1_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Waveform denoising 'denoisingUNet1_tf2'
Author: Yusuke Uchiyama

A denoising convolutional autoencoder with Tensorflow2.x
applied to waveform data.  
This version is based on the **U-Net** structure. 
See [Bitbucket repository](https://bitbucket.org/meg_ilc_tokyo/wf_denoising/src/master/) or 
[GitHub repository](https://github.com/yskuchi/wf_denoising)

Noise from data is added to MC signal data.
You need datasets of signal and noise, separately, in pickle format.

## Environment
As of 2021 Jan, tested with the following:

* Google Colab
* CPU, GPU, or TPU (experimental)
* Python 3.6
* TensorFlow 2.4.0
* Comet ML 3.2.10


Note: If you are running this in a colab notebook, we recommend you enable a free GPU by going:
> Runtime   →   Change runtime type   →   Hardware Accelerator: GPU

## Setting

### Comet ML

In [None]:
! pip install typing-extensions==3.7.4.3 comet-ml
#! pip install typing-extensions==3.7.4
#! [ ! -z "$COLAB_GPU" ] && pip install typing-extensions==3.7.4 comet-ml

In [None]:
# import comet_ml in the top of your file
from comet_ml import Experiment

### Other packages

In [None]:
import os, sys
import numpy as np
import pandas as pd
import json
import datetime

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, UpSampling1D
from tensorflow.keras.layers import Dropout, Activation, BatchNormalization, Concatenate
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Report the interpreter and TensorFlow versions this run is using
print(f'Python version: {sys.version_info}')
print(f'TensorFlow version: {tf.__version__}')

### GPU
To use a GPU on Google Colab, select GPU as the runtime type. 

In [None]:
# check GPU
tf.test.gpu_device_name()
!echo $COLAB_GPU

### TPU
To use a TPU on Google Colab, it is not enough to select TPU as the runtime type.
See "Setup TPU".

In [None]:
# check TPU
# Prints the COLAB_TPU_ADDR environment variable (an empty line when it is not set).
# The same variable decides whether the "Setup TPU" cell creates a TPU strategy.
!echo $COLAB_TPU_ADDR

### Parameters

In [None]:
# Run-time switches
# load_weights: False -> train and save weights/history; True -> load them from the files
#               named after `filename` instead of training.
# plot_data:    False -> select the non-interactive Agg backend (batch use without a display).
load_weights = False
plot_data = True

# Base name of all output files, tagged with today's date
filename = "denoisingUNet1_tf2"
filename = '_'.join([filename, str(datetime.date.today())])

# The backend has to be chosen before pyplot is imported
import matplotlib
if not plot_data:
    matplotlib.use("Agg")
import matplotlib.pyplot as plt

In [None]:
# --- Waveform pre-processing ---
# Each waveform has 1024 sample-points; the last `npoints` of them are used.
npoints = 1024  # alternative: 256
scale = 1       # alternative: 5   (multiplicative factor applied to the waveforms)
offset = 0.001  # alternative: 0.05, i.e. 50 mV   (added to the waveforms, times `scale`)

# --- Input datasets (pickle files, looked up in the data directory) ---
# signal: 'wf11600.pkl' = realistic_laser_modified   (alternative: 'wf11100.pkl')
signal_dataset_file = 'wf11600.pkl'
# noise:  'wf356990.pkl.bz2' = 2020                  (alternative: 'wf328469.pkl' = 2018)
noise_dataset_file = 'wf356990.pkl.bz2'

#### Hyper-parameters

In [None]:
# Training settings handed to compile() / fit()
params = dict(
    optimizer='adam',
    loss='msle',              # alternatives: 'mse', 'binary_crossentropy'
    metrics=['mae', 'mse'],
    epochs=50,                # alternative: 20
    batch_size=512,           # alternative: 256
)

# Architecture and dataset settings; this is the dictionary logged to Comet
params2 = dict(
    loss_type=params['loss'],
    conv_activation='relu',
    output_activation='linear',   # alternative: 'sigmoid'
    signal_dataset_file=signal_dataset_file,
    noise_dataset_file=noise_dataset_file,
    npoints=npoints,
    batch_size=params['batch_size'],
    scale=scale,
    offset=offset,
    nsublayers=2,                        # Conv1D layers per stage
    nkernels=[64, 32, 32, 32, 32, 64],   # filters of the three encoder, then three decoder-side stages
    skip_connection=[True, True, True],  # one switch per U-Net level
)

# Convolution kernel width per U-Net level
kernel_size = [3, 3, 3]

## Prepare datasets
On Google Colab, data is loaded via Google Drive.
Files are supposed to be in `/content/drive/My Drive/ML/data`.

### Mount Google Drive

In [None]:
# Mount Google Drive and define where input data is read from and results are written to.
from google.colab import drive
drive.mount('/content/drive')
data_dir = '/content/drive/My Drive/ML/data/'
output_dir = '/content/drive/My Drive/ML/results/'

# Make sure the results directory exists up front; otherwise the first write to it
# (history / weights, after the whole training has run) fails.
os.makedirs(output_dir, exist_ok=True)

### Load pickle files

In [None]:
# Read the signal and noise datasets and convert them to NumPy arrays.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
x_signal = pd.read_pickle(data_dir + signal_dataset_file).to_numpy()
x_noise = pd.read_pickle(data_dir + noise_dataset_file).to_numpy()

# Truncate both sets to a common length so every signal waveform gets one noise waveform
nsamples = min(map(len, (x_signal, x_noise)))
x_signal, x_noise = x_signal[:nsamples], x_noise[:nsamples]

### Shape data in appropriate format with adding noise

In [None]:
# Work on float32 copies and keep only the last `npoints` samples of each waveform.
# astype() returns new arrays, so the in-place arithmetic below leaves x_signal / x_noise untouched.
x_tobe_extracted = x_signal.astype('float32')[:, -npoints:]  # training target (signal)
x_tobe_removed = x_noise.astype('float32')[:, -npoints:]     # noise to be removed

# Network input = signal + noise (built before any scaling is applied)
x_train_noisy = x_tobe_extracted + x_tobe_removed

# Apply the same scale and offset to target and input, in place
for waveforms in (x_tobe_extracted, x_train_noisy):
    waveforms *= scale
    waveforms += offset * scale

## Values in [0,1]
#x_tobe_extracted = np.clip(x_tobe_extracted, 0, 1);
#x_train_noisy = np.clip(x_train_noisy, 0, 1);

# Conv1D expects (samples, points, channels); the waveforms have a single channel
x_tobe_extracted = x_tobe_extracted.reshape(len(x_tobe_extracted), npoints, 1)
x_train_noisy = x_train_noisy.reshape(len(x_train_noisy), npoints, 1)

## Model

### Start COMET session

In [None]:
# Start the Comet experiment that records this run.
# The API key is a secret and must not be stored in the notebook: it is read from the
# COMET_API_KEY environment variable. If the variable is unset, None is passed and Comet
# falls back to its own configuration lookup (e.g. a .comet.config file).
# NOTE(review): the key that used to be hard-coded here has been published together with
# the notebook and should be revoked/regenerated in the Comet account.
experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    workspace="yskuchi",
    project_name="wf_denoisingunet",
)

In [None]:
# Attach the architecture/dataset settings to the Comet experiment.
# Only `params2` is logged here; `params` entries that are not copied into it
# (optimizer, metrics, epochs) are not sent by this call.
experiment.log_parameters(params2)

### Setup TPU
This part seems to depend on the TensorFlow version and may need to be changed.

In [None]:
# Connect to the Colab TPU and create a distribution strategy.
# Runs only when COLAB_TPU_ADDR is set; otherwise `strategy` stays undefined, which the
# model-building cell relies on to fall back to a plain (non-distributed) build.
if 'COLAB_TPU_ADDR' in os.environ:
  tpu_grpc_url = "grpc://"+os.environ["COLAB_TPU_ADDR"]
  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_grpc_url)
  tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) # For TF2.0, add this line
  tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) # For TF2.0; "experimental" may be dropped from the name in the future
  strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)  # same here
  #model = tf.distribute.tpu.keras_to_tpu_model(model, strategy=strategy)

### Build model with functional API

In [None]:
def _conv_stage(tensor, filters, size):
  """Apply one stage of activated, same-padded Conv1D layers.

  The stage consists of one Conv1D followed by `params2['nsublayers'] - 1` further
  Conv1D layers, all with `filters` kernels of width `size` and the activation
  `params2['conv_activation']`.

  Returns the output tensor of the last layer.
  """
  activation = params2['conv_activation']
  tensor = Conv1D(filters, size, padding='same', activation=activation)(tensor)
  for _ in range(1, params2['nsublayers']):
    tensor = Conv1D(filters, size, padding='same', activation=activation)(tensor)
  return tensor


def build_model():
  """Build and compile the 1D U-Net denoising autoencoder.

  The network takes waveforms of shape (npoints, 1), goes through three
  conv + max-pool(2) encoder stages, a bottleneck stage and three
  upsample(2) + conv decoder stages, and ends in a single-channel Conv1D.
  `params2['skip_connection'][k]` switches the encoder-to-decoder concatenation
  of level k on or off (k = 0 is the outermost, full-resolution level).

  Configuration is read from the module-level `npoints`, `kernel_size`,
  `params` (optimizer, loss, metrics) and `params2` (architecture).

  NOTE(review): every pooling uses padding='same', so `npoints` presumably has to be a
  multiple of 8 for the upsampled tensors to line up with the skip tensors and with the
  target length -- confirm before changing `npoints`.

  Returns:
    The compiled Keras Model. Its summary is printed as a side effect.
  """
  nkernels = params2['nkernels']
  skip = params2['skip_connection']
  activation = params2['conv_activation']

  input_img = Input(shape=(npoints,1))

  # --- Encoder ---
  # The first convolution keeps its activation as a separate layer; the commented-out
  # BatchNormalization / Dropout lines show where such layers would be inserted.
  conv1 = Conv1D(nkernels[0], kernel_size[0], padding='same')(input_img)
  #conv1 = BatchNormalization()(conv1)
  #conv1 = Dropout(0.2)(conv1)
  conv1 = Activation(activation)(conv1)
  for _ in range(1, params2['nsublayers']):
    conv1 = Conv1D(nkernels[0], kernel_size[0], padding='same', activation=activation)(conv1)
  pool1 = MaxPooling1D(2, padding='same')(conv1)
  conv2 = _conv_stage(pool1, nkernels[1], kernel_size[1])
  pool2 = MaxPooling1D(2, padding='same')(conv2)
  conv3 = _conv_stage(pool2, nkernels[2], kernel_size[2])
  encoded = MaxPooling1D(2, padding='same')(conv3)

  # --- Bottleneck ---
  conv4 = _conv_stage(encoded, nkernels[3], kernel_size[2])

  # --- Decoder (each level optionally concatenated with the matching encoder output) ---
  up5 = UpSampling1D(2)(conv4)
  if skip[2]:
    up5 = Concatenate()([up5, conv3])
  conv5 = _conv_stage(up5, nkernels[4], kernel_size[1])
  up6 = UpSampling1D(2)(conv5)
  if skip[1]:
    up6 = Concatenate()([up6, conv2])
  conv6 = _conv_stage(up6, nkernels[5], kernel_size[0])
  up7 = UpSampling1D(2)(conv6)
  # Index the per-level switch: testing the list itself is always true for a
  # non-empty list, which made this skip connection impossible to turn off.
  if skip[0]:
    up7 = Concatenate()([up7, conv1])
  for _ in range(params2['nsublayers'] - 1):
    up7 = Conv1D(1, kernel_size[0], padding='same', activation=activation)(up7)
  decoded = Conv1D(1, kernel_size[0], padding='same', activation=params2['output_activation'])(up7)

  autoencoder = Model(inputs=input_img, outputs=decoded)

  autoencoder.compile(optimizer=params['optimizer'], loss=params['loss'], metrics=params['metrics'])
  autoencoder.summary()
  return autoencoder

In [None]:
# Build the model inside the TPU strategy scope when a strategy was created by the
# "Setup TPU" cell, and as a plain model otherwise.
# Only the lookup of `strategy` is guarded: a NameError raised inside build_model()
# must surface as an error instead of triggering a second build outside the scope.
try:
  strategy
except NameError:
  autoencoder = build_model()
else:
  with strategy.scope():
    autoencoder = build_model()

## Fit

On Google Colab, the results (trained model) are saved in Google Drive. Files are supposed to be in `/content/drive/My Drive/ML/results`.

In [None]:
# Train the autoencoder (or restore a previous run), save the model, and plot the
# training history.

# Metric drawn in the history plot. It must be a key of the Keras history, i.e. 'loss'
# or one of params['metrics']; the matching validation curve is 'val_<metric>'.
plot_metric = 'mae'

if not load_weights:

    # Callback for model checkpoints (full model, best epoch only).
    # It is defined but currently not passed to fit() -- see the commented-out argument.
    checkpoint = ModelCheckpoint(
        filepath = output_dir + filename + "-{epoch:02d}.h5",
        save_best_only=True,
        save_weights_only=False)

    # The noisy waveforms are the input; the clean signal waveforms are the target.
    # The last 10% of the samples are held out for validation.
    hist = autoencoder.fit(x_train_noisy, x_tobe_extracted,
                           epochs=params['epochs'],
                           batch_size=params['batch_size'],
                           shuffle=True,
                           validation_split=0.1)
                           #, callbacks=[checkpoint])
    history = hist.history

    # Save history
    with open(output_dir + filename + '_hist.json', 'w') as f:
        json.dump(history, f)

    # Save the weights
    autoencoder.save_weights(output_dir + filename + '_weights.h5')
else:
    # Load weights
    # (`filename` contains today's date, so this finds only files written today
    #  unless `filename` is adjusted)
    autoencoder.load_weights(f'{output_dir}{filename}_weights.h5')

    # Load history
    with open(f'{output_dir}{filename}_hist.json', 'r') as f:
        history = json.load(f)

# Save the full model (architecture + weights, without optimizer state)
autoencoder.save(output_dir + filename + '.h5', include_optimizer=False)

# Plot training history
plt.plot(history[plot_metric], linewidth=3, label='train')
plt.plot(history['val_' + plot_metric], linewidth=3, label='valid')
plt.grid()
plt.legend()
plt.xlabel('epoch')
plt.ylabel(plot_metric)
# The y-range depends on the metric actually plotted
if plot_metric == 'mse':
  plt.ylim(1e-6 * scale, 0.1e-3 * scale) #mse
else:
  plt.ylim(0.5e-3 * scale, 0.5e-2 * scale) #mae
plt.show()

## Test

In [None]:
# Run the trained network on the noisy waveforms.
# (The training set itself is used here; no separate test set is defined.)
decoded_imgs = autoencoder.predict(x_train_noisy)

# revert scale and offset
# New arrays are created on purpose. Slicing (`x[0:]`) only yields a view, so un-scaling
# in place would also modify x_tobe_extracted / x_train_noisy: the training data would be
# corrupted and every re-run of this cell would shift the waveforms again.
x_test = (x_tobe_extracted - scale * offset) / scale
x_test_noisy = (x_train_noisy - scale * offset) / scale
decoded_imgs = (decoded_imgs - scale * offset) / scale

In [None]:
# How many waveforms to be displayed
# Overlay noisy input, true signal and network output, one panel per waveform
n = 2      # how many waveforms to display
start = 0  # index of the first displayed waveform
fig = plt.figure(figsize=(20, 6 * n))
curves = (
    (x_test_noisy, "noisy", 'gray'),
    (x_test, "signal", 'green'),
    (decoded_imgs, "decoded", 'magenta'),
)
for j, i in enumerate(range(start, start + n), start=1):
  ax = fig.add_subplot(n, 1, j)
  for waveforms, label, color in curves:
    ax.plot(waveforms[i], label=label, color=color)
  ax.legend()

In [None]:
# Send this plot to comet
# (`fig` is the waveform-comparison figure created by the plotting cell above)
experiment.log_figure(figure=fig)

In [None]:
# Finish the Comet experiment that was started before building the model
experiment.end()

In [None]:
# Display figures only in interactive mode; with plot_data == False the
# non-display "Agg" backend was selected in the parameters cell.
if plot_data:
    plt.show()