<a href="https://colab.research.google.com/github/yskuchi/wf_denoising/blob/master/noiseextraction2ch_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Waveform denoising 'noiseextraction2ch_tf2'
Author: Yusuke Uchiyama

A denoising convolutional autoencoder with Tensorflow2.x
applied to **a set of** waveform data.  
See [Bitbucket repository](https://bitbucket.org/meg_ilc_tokyo/wf_denoising/src/master/) or 
[GitHub repository](https://github.com/yskuchi/wf_denoising)

Noise from data is added to MC signal data.
You need datasets of signal and noise, separately, in pickle format.

## Environment
As of 2020 Nov, tested with the following:

* Google Colab
* CPU, GPU, or TPU (experimental)
* Python 3.6
* TensorFlow 2.3.0
* Comet ML


Note: If you are running this in a Colab notebook, we recommend enabling a free GPU by going to:
> Runtime   →   Change runtime type   →   Hardware Accelerator: GPU

## Setting

### Comet ML

In [None]:
! pip install typing-extensions==3.7.4 comet-ml
! pip install typing-extensions==3.7.4
#! [ ! -z "$COLAB_GPU" ] && pip install typing-extensions==3.7.4 comet-ml

In [None]:
# Keep the comet_ml import at the top, ahead of the TensorFlow imports.
import os
from getpass import getpass

from comet_ml import Experiment

# Never commit an API key to the notebook.
# The key is taken from the COMET_API_KEY environment variable or from a
# .comet.config file; if neither is available it is requested interactively.
# NOTE: a key that was previously hard-coded here is public and should be revoked.
comet_api_key = os.environ.get("COMET_API_KEY")
comet_config_files = (".comet.config", os.path.expanduser("~/.comet.config"))
if not comet_api_key:
    if any(os.path.isfile(path) for path in comet_config_files):
        comet_api_key = None  # let comet_ml read the key from the config file
    else:
        comet_api_key = getpass("Comet ML API key: ")

# The workspace defaults to the original value and can be overridden with COMET_WORKSPACE.
experiment = Experiment(api_key=comet_api_key,
                        workspace=os.environ.get("COMET_WORKSPACE", "yskuchi"),
                        project_name="wf_denoising")

### Other packages

In [None]:
import os, sys
import numpy as np
import pandas as pd
import json

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPooling1D, UpSampling1D
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Report the interpreter and TensorFlow versions in use
print(f'Python version: {sys.version_info}')
print(f'TensorFlow version: {tf.__version__}')

### GPU
To use GPU on Google colab, specify GPU in runtime type. 

In [None]:
# check GPU
tf.test.gpu_device_name()
!echo $COLAB_GPU

### TPU
To use TPU on Google colab, it is not enough to specify TPU in runtime type.
See "Setup TPU".

In [None]:
# check TPU: print the COLAB_TPU_ADDR environment variable
# (the "Setup TPU" cell only runs when this variable is set)
!echo $COLAB_TPU_ADDR

### Parameters

In [None]:
# Run-time switches
load_weights = False  # False: train and save; True: load saved weights and history instead
plot_data = True  # False: select the non-interactive Agg backend (no display)
filename = "noiseextraction5_tf2"  # base name of the checkpoint/history/weights/model files

# The backend must be selected before pyplot is imported, hence the import order here
import matplotlib
if not plot_data:
    matplotlib.use("Agg") # this is necessary when using plt without display (batch)
import matplotlib.pyplot as plt

In [None]:
# Dataset files (read with pandas.read_pickle from the data directory)
signal_dataset_file = 'wf11100.pkl'
noise_dataset_file = 'wf356990.pkl'  # 2020
# noise_dataset_file = 'wf328469.pkl'  # 2018

# Number of channels (CNN channels = waveform channels)
nchannels = 2

# Each waveform has 1024 sample-points; npoints is how many of them are used
# (e.g. 256 to use only part of the waveform)
npoints = 1024

# Waveforms are multiplied by `scale` after adding `offset`
scale = 5
offset = 0.05  # 50 mV

#### Hyper-parameters

In [None]:
# basic hyper-parameters (used when compiling and fitting the model)
params = dict(
    optimizer='adam',
    loss='mse',        # alternative: 'binary_crossentropy'
    epochs=20,
    batch_size=512,    # alternative: 256
)
# additional parameters: layer activations plus the dataset / preprocessing settings
params2 = dict(
    conv_activation='relu',
    output_activation='linear',  # alternative: 'sigmoid'
    signal_dataset_file=signal_dataset_file,
    noise_dataset_file=noise_dataset_file,
    npoints=npoints,
    scale=scale,
    offset=offset,
)

In [None]:
# Record the additional parameters with the Comet experiment.
# NOTE(review): `params` is not logged here — presumably left to Comet's Keras auto-logging; confirm.
experiment.log_parameters(params2)

## Prepare datasets
On Google Colab, data is loaded via Google Drive.
Files are supposed to be in `/content/drive/My Drive/ML/data`.

### Mount Google Drive

In [None]:
# Mount Google Drive (Colab only: google.colab is not available elsewhere)
from google.colab import drive
drive.mount('/content/drive')
# Both directories keep their trailing '/' because file names are appended with '+'
data_dir = '/content/drive/My Drive/ML/data/'
output_dir = '/content/drive/My Drive/ML/results/'

### Load pickle files

In [None]:
# NOTE: unpickling can execute arbitrary code — only load dataset files from a trusted source.
signal_path = data_dir + signal_dataset_file
noise_path = data_dir + noise_dataset_file
x_original = pd.read_pickle(signal_path).to_numpy()
x_noise = pd.read_pickle(noise_path).to_numpy()

n_signal, n_noise = len(x_original), len(x_noise)

# Use the same number of rows from both sets, rounded down to a multiple of nchannels
nsamples = (min(n_signal, n_noise) // nchannels) * nchannels

print(f'signal samples:{n_signal}, noise samples:{n_noise}, nsamples: {nsamples}')

x_original = x_original[:nsamples]
x_noise = x_noise[:nsamples]

### Shape data in appropriate format with adding noise

In [None]:
def _last_points(waveforms):
    """Cast to float32 and keep the last `npoints` columns of every row."""
    return waveforms.astype('float32')[:, -npoints:]


def _scale_offset_clip(waveforms):
    """Multiply by `scale`, add `offset * scale`, then clip the values to [0, 1]."""
    return np.clip(waveforms * scale + offset * scale, 0, 1)


def _to_channels_last(waveforms):
    """Group every `nchannels` consecutive rows into one sample of shape (npoints, nchannels)."""
    grouped = np.reshape(waveforms, (len(waveforms) // nchannels, nchannels, npoints))
    return grouped.transpose(0, 2, 1)


x_original = _last_points(x_original)
x_noise = _last_points(x_noise)

# Add noise to the signal (before any scaling)
x_train_noisy = x_original + x_noise

# Adjust scale and offset (50 mV) of the noise and noisy waveforms, values in [0,1];
# x_original is left unscaled
x_noise = _scale_offset_clip(x_noise)
x_train_noisy = _scale_offset_clip(x_train_noisy)

# Match the input shape for Conv1D: (samples, npoints, nchannels)
x_original = _to_channels_last(x_original)
x_noise = _to_channels_last(x_noise)
x_train_noisy = _to_channels_last(x_train_noisy)

## Model

### Setup TPU
This part appears to depend on the TensorFlow version and may need to be changed.

In [None]:
# Connect to the Colab TPU and create a distribution strategy.
# `strategy` is only assigned inside this branch, i.e. when COLAB_TPU_ADDR is set.
if 'COLAB_TPU_ADDR' in os.environ:
  tpu_grpc_url = "grpc://"+os.environ["COLAB_TPU_ADDR"]
  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_grpc_url)
  tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) # for TF2.0, add this line
  tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) # for TF2.0; "experimental" may be dropped in a future release
  strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)  # same here
  #model = tf.distribute.tpu.keras_to_tpu_model(model, strategy=strategy)

### Build model with functional API

In [None]:
def build_model():
  """Build and compile the 1D convolutional autoencoder.

  The input has shape (npoints, nchannels). The encoder applies three
  Conv1D + MaxPooling1D(2) stages (64, 32, 32 filters); the decoder applies
  three Conv1D + UpSampling1D(2) stages (32, 32, 64 filters) followed by a
  final Conv1D with `nchannels` filters. All convolutions use kernel size 5
  and 'same' padding. Activations are read from `params2`; optimizer and
  loss are read from `params`.

  Returns:
    The compiled Keras Model (its summary is printed as a side effect).
  """
  kernel_size = 5
  hidden_activation = params2['conv_activation']

  input_img = Input(shape=(npoints,nchannels))

  # Encoder: convolution followed by pooling, three times
  x = input_img
  for n_filters in (64, 32, 32):
    x = Conv1D(n_filters, kernel_size, padding='same', activation=hidden_activation)(x)
    x = MaxPooling1D(2, padding='same')(x)
  encoded = x

  # Decoder: convolution followed by upsampling, three times
  x = encoded
  for n_filters in (32, 32, 64):
    x = Conv1D(n_filters, kernel_size, padding='same', activation=hidden_activation)(x)
    x = UpSampling1D(2)(x)

  # Output layer: one filter per waveform channel
  decoded = Conv1D(nchannels, kernel_size, padding='same',
                   activation=params2['output_activation'])(x)

  autoencoder = Model(inputs=input_img, outputs=decoded)
  autoencoder.compile(optimizer=params['optimizer'], loss=params['loss'])
  autoencoder.summary()
  return autoencoder

In [None]:
# Build the model inside the distribution strategy scope when a strategy has
# been defined, otherwise build it normally.
# An explicit existence check is used instead of catching NameError, so that a
# NameError raised inside build_model() is not mistaken for "no strategy" and
# silently retried outside the scope.
if 'strategy' in globals():
  with strategy.scope():
    autoencoder = build_model()
else:
  autoencoder = build_model()

## Fit

On Google Colab, the results (trained model) are saved in Google Drive. Files are supposed to be in `/content/drive/My Drive/ML/results`.

In [None]:
# Training history: a dict mapping metric name -> list of per-epoch values
history = {}
if not load_weights:

    # Callback for model checkpoints; with save_best_only=True a file is
    # written only when the monitored quantity improves.
    # The keyword is 'save_weights_only' (it was misspelled 'save_weight_only').
    checkpoint = ModelCheckpoint(
        filepath = output_dir + filename + "-{epoch:02d}.h5",
        save_best_only=True,
        save_weights_only=False)

    # Input: noisy waveforms; target: the noise waveforms
    # (the network is trained to extract the noise from the noisy input)
    hist = autoencoder.fit(x_train_noisy, x_noise,
                           epochs=params['epochs'],
                           batch_size=params['batch_size'],
                           shuffle=True,
                           validation_split=0.1,
                           callbacks=[checkpoint])

    # Save history
    with open(output_dir + filename + '_hist.json', 'w') as f:
        json.dump(hist.history, f)
    history = hist.history

    # Save the weights
    autoencoder.save_weights(output_dir + filename + '_weights.h5')
else:
    # Load weights
    autoencoder.load_weights(f'{output_dir}{filename}_weights.h5')

    # Load history
    with open(f'{output_dir}{filename}_hist.json', 'r') as f:
        history = json.load(f)

# Save the whole model (without optimizer state) in both cases
autoencoder.save(output_dir + filename + '.h5', include_optimizer=False)

# Plot training history
plt.plot(history['loss'], linewidth=3, label='train')
plt.plot(history['val_loss'], linewidth=3, label='valid')
plt.grid()
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss')
# y-range for the mse loss; a previous plt.ylim(1e-2, 0.1) call was always
# overridden by this one and has been dropped
plt.ylim(1e-5, 1e-3) #mse
plt.show()

## Test

In [None]:
# NOTE(review): these samples are slices of the arrays passed to fit()
# (everything except the first two), not an independent test set — confirm intent.
x_test = x_original[2:]
x_noise_test = x_noise[2:]
x_test_noisy = x_train_noisy[2:]
decoded_imgs = autoencoder.predict(x_test_noisy)

# revert scale and offset
# The slices above are views of x_noise / x_train_noisy. In-place operators
# (-=, /=) would modify those arrays too, so re-running this cell or the fit
# cell would use corrupted data. Out-of-place arithmetic creates new arrays.
x_noise_test = (x_noise_test - scale * offset) / scale
x_test_noisy = (x_test_noisy - scale * offset) / scale
decoded_imgs = (decoded_imgs - scale * offset) / scale
x_subtracted = x_noise_test - decoded_imgs

# How many waveforms to be displayed
n = 1
plt.figure(figsize=(20, 6))
for i in range(n):
    plt.plot(x_test[i], label="original")
    plt.plot(x_test_noisy[i], label="noisy")
    plt.plot(decoded_imgs[i], label="decoded noise")
    #plt.plot(x_noise_test[i], label="noise")
    #plt.plot(x_subtracted[i], label="subtracted")
    plt.legend()

In [None]:
# Send this plot to comet
# NOTE(review): with the inline notebook backend, figures are normally closed at the
# end of the cell that drew them, so this may log an empty figure — confirm.
experiment.log_figure(figure=plt)

In [None]:
# Display the figure only when plotting is enabled (plot_data is False in batch mode)
if plot_data:
    plt.show()