In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Install TensorFlow
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
import numpy as np

import pandas as pd

import time
import librosa
import librosa.display

import matplotlib.pyplot as plt
from scipy.io import wavfile as wav
from scipy import signal
from sklearn.preprocessing import MinMaxScaler

import gc
import sklearn

In [None]:
# Configure GPU visibility and memory growth; errors raised by TensorFlow here are only printed.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first listed GPU (index 0)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
# Training hyper-parameters shared by the dataset builders and the training loop.
BATCH_SIZE = 16  # batch size used by every tf.data pipeline below
EPOCHS = 200  # upper bound of the epoch loop; also used as max_to_keep for checkpoints
BUFFER_SIZE = 500  # shuffle buffer size of the training pipeline

## 設定路徑

In [None]:
# Base directory: CSV index files and audio paths are resolved against it,
# and the final prediction CSV is written into it.
PATH_TO_DATA = "../Comp4/dataset_kaggle/"
# Directory in which checkpoints are saved and looked up.
PATH_CKPTS = "../Comp4/dataset_kaggle/test2_3/"

# File-name prefix handed to the CheckpointManager.
checkpoint_name = "test2_3"

# Features

為了擷取重要的音檔資訊我們取幾個重要的features : 
(1) melspectrogram : 因為clip的音頻會較為特殊，因此把音檔轉換成頻譜圖，並取它在頻率域上的資訊。
(2) spectral centroid : 以頻譜的中心代表聲音的質心；頻譜中心越低，就代表越多的頻譜能量集中在低頻範圍內。
(3) spectral_bandwidth : 計算所對應到的頻譜寬。 
(4) spectral rolloff : 頻譜能量由低頻累積到總能量的85%時所對應的頻率，也就是低於該頻率的能量佔整體能量的85%。
(5) zero_crossing_rate : 為過零率，音檔的語音信息通過零點的次數。
(6) tonnetz : 計算它的音調質心。

In [None]:
def mel(wave_file):
    """Return the mel spectrogram of an audio file.

    The file is loaded at its native sampling rate (``sr=None``) and converted
    with ``n_mels=128`` and ``fmax=8000``.
    """
    samples, rate = librosa.load(wave_file, sr=None)
    return librosa.feature.melspectrogram(y=samples, sr=rate, n_mels=128, fmax=8000)

def centroid(wave_file):
    """Return librosa's spectral centroid of an audio file loaded at its native rate."""
    samples, rate = librosa.load(wave_file, sr=None)
    return librosa.feature.spectral_centroid(y=samples, sr=rate)

def bandwidth(wave_file):
    """Return librosa's spectral bandwidth of an audio file loaded at its native rate."""
    samples, rate = librosa.load(wave_file, sr=None)
    return librosa.feature.spectral_bandwidth(y=samples, sr=rate)

def rolloff(wave_file):
    """Return librosa's spectral roll-off of an audio file loaded at its native rate."""
    samples, rate = librosa.load(wave_file, sr=None)
    return librosa.feature.spectral_rolloff(y=samples, sr=rate)

def zcr(wave_file):
    """Return librosa's zero-crossing rate of an audio file loaded at its native rate."""
    # The sampling rate is returned by the loader but not needed for this feature.
    samples, _ = librosa.load(wave_file, sr=None)
    return librosa.feature.zero_crossing_rate(y=samples)

def tonnetz(wave_file):
    """Return librosa's tonnetz features of an audio file.

    The signal is loaded at its native rate and reduced to its harmonic
    component before the tonnetz features are computed.
    """
    samples, rate = librosa.load(wave_file, sr=None)
    harmonic_part = librosa.effects.harmonic(samples)
    return librosa.feature.tonnetz(y=harmonic_part, sr=rate)

In [None]:
def get_data(dname):
    """Read the CSV ``dname`` from ``PATH_TO_DATA`` and trim its ``mix_fname`` column.

    The first 10 characters of every ``mix_fname`` value are dropped; all
    other columns are returned as read.
    """
    raw = pd.read_csv(PATH_TO_DATA + dname)
    trimmed_names = raw["mix_fname"].apply(lambda name: name[10:])
    return raw.assign(mix_fname=trimmed_names)

def _minmax_column(feature):
    """Min-max scale the first row of ``feature`` and return it as a float32 column."""
    scaled = sklearn.preprocessing.minmax_scale(feature[0], axis=0)
    return np.asarray(scaled, dtype=np.float32).reshape(-1, 1)


def get_features(ds, idx):
    """Build the feature matrix of one audio clip.

    Args:
        ds: table with a ``mix_fname`` column holding paths relative to
            ``PATH_TO_DATA``.
        idx: label used to pick the row from ``ds["mix_fname"]``.

    Returns:
        2-D array whose columns are, in order: the dB-scaled mel spectrogram,
        spectral centroid, spectral bandwidth, spectral roll-off,
        zero-crossing rate and tonnetz. Rows are the transposed second axis
        of the librosa outputs (presumably time frames).
    """
    # Resolve the path once instead of rebuilding it for every feature.
    wav_path = PATH_TO_DATA + ds["mix_fname"][idx]

    # Power spectrogram -> dB relative to its maximum, then shifted and divided by 40.
    mel_db = librosa.power_to_db(mel(wav_path), ref=np.max)
    mel_scaled = (mel_db + 40).T / 40

    centroid_col = _minmax_column(centroid(wav_path))
    bandwidth_col = _minmax_column(bandwidth(wav_path))
    rolloff_col = _minmax_column(rolloff(wav_path))
    # Fix: this column was previously computed from rolloff(), duplicating the
    # roll-off feature instead of providing the zero-crossing rate.
    zcr_col = _minmax_column(zcr(wav_path))
    tonnetz_rows = tonnetz(wav_path).T

    return np.concatenate(
        (mel_scaled, centroid_col, bandwidth_col, rolloff_col, zcr_col, tonnetz_rows),
        axis=1,
    )


def get_wav_ds(ds):
    """Extract features for every row of ``ds`` and stack them into one array.

    Rows are visited by position ``0 .. ds.shape[0] - 1`` and each one is
    passed to ``get_features``.
    """
    per_clip = [get_features(ds, row) for row in range(ds.shape[0])]
    return np.asarray(per_clip)

將response的1、2、3轉換成三維的由01組成的陣列。

In [None]:
def get_label_ds(ds):
    """Convert the ``n_cues`` column of ``ds`` into one-hot label lists.

    1 -> [1, 0, 0], 2 -> [0, 1, 0], 3 -> [0, 0, 1]. Any other value produces
    no entry, so the result can be shorter than ``ds``.
    """
    one_hot = {1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 1]}
    # list(...) gives every row its own list object, like the original literals.
    return [list(one_hot[cues]) for cues in ds["n_cues"].tolist() if cues in one_hot]

分別從train和validation以及要預測的sample取出上面所彙整的features，以及把各個response轉換成我們所要對應的三維陣列形式。

In [None]:
# Load the index tables for training, validation and the samples to predict.
tr = get_data("train.csv")
# NOTE: the name `val` is later rebound by the function `def val(dataset)`.
val = get_data("val.csv")
te = get_data("sample.csv")

In [None]:
# One-hot label lists derived from each table's `n_cues` column.
tr_label = get_label_ds(tr)
val_label = get_label_ds(val)
# NOTE(review): te_label is not referenced anywhere else in the visible notebook.
te_label = get_label_ds(te)

# Generate Dataset

分別將train和validation以及要預測的sample我們需要用到資料轉成tf的形式。

In [None]:
class ds_train_generator(object):
    """Builds the shuffled, batched training ``tf.data.Dataset``.

    Features for every row of ``dataset`` are extracted in the constructor
    through ``get_wav_ds``; ``label`` is stored unchanged.
    """

    def __init__(self, dataset, label):
        self.dataset = dataset

        self.dataset_wav = get_wav_ds(dataset)
        self.labels = label

    def preprocessing(self, wav_file, label):
        """Cast the features to float32 and pass the label through untouched."""
        return tf.cast(wav_file, tf.float32), label

    def generate(self):
        """Return the pipeline: map -> shuffle -> batch -> prefetch."""
        autotune = tf.data.experimental.AUTOTUNE
        pairs = tf.data.Dataset.from_tensor_slices((self.dataset_wav, self.labels))
        return (
            pairs.map(self.preprocessing, num_parallel_calls=autotune)
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=autotune)
        )

In [None]:
class ds_val_generator(object):
    """Builds the batched (unshuffled) validation ``tf.data.Dataset``.

    Features for every row of ``dataset`` are extracted in the constructor
    through ``get_wav_ds``; ``label`` is stored unchanged.
    """

    def __init__(self, dataset, label):
        self.dataset = dataset

        self.dataset_wav = get_wav_ds(dataset)
        self.labels = label

    def preprocessing(self, wav_file, label):
        """Cast the features to float32 and pass the label through untouched."""
        return tf.cast(wav_file, tf.float32), label

    def generate(self):
        """Return the pipeline: map -> batch -> prefetch (order is preserved)."""
        autotune = tf.data.experimental.AUTOTUNE
        pairs = tf.data.Dataset.from_tensor_slices((self.dataset_wav, self.labels))
        return (
            pairs.map(self.preprocessing, num_parallel_calls=autotune)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=autotune)
        )

In [None]:
class ds_te_generator(object):
    """Builds the batched, label-free ``tf.data.Dataset`` used for prediction.

    Features for every row of ``dataset`` are extracted in the constructor
    through ``get_wav_ds``.
    """

    def __init__(self, dataset):
        self.dataset = dataset

        self.dataset_wav = get_wav_ds(dataset)

    def preprocessing(self, wav_file):
        """Cast the features to float32."""
        return tf.cast(wav_file, tf.float32)

    def generate(self):
        """Return the pipeline: map -> batch -> prefetch (order is preserved)."""
        autotune = tf.data.experimental.AUTOTUNE
        features = tf.data.Dataset.from_tensor_slices((self.dataset_wav))
        return (
            features.map(self.preprocessing, num_parallel_calls=autotune)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=autotune)
        )

In [None]:
# Feature extraction for all training/validation clips happens inside these constructors.
train_gen = ds_train_generator(tr,tr_label)
# `val` must still be the validation DataFrame here (it is rebound to a function further down).
val_gen = ds_val_generator(val,val_label)

In [None]:
# Build the tf.data pipelines consumed by the training loop.
ds_train = train_gen.generate()
ds_val = val_gen.generate()

In [None]:
# Features and pipeline for the clips to be predicted (no labels attached).
te_gen = ds_te_generator(te)
ds_te = te_gen.generate()

In [None]:
# Display the dataset object as the cell output.
ds_te

# Training net

這裡都是用keras裡的1D Convolutional Neural Networks去疊我們所需要的架構，它比較適用於time series的data或特定訊號在固定的時間區間中。卷積層padding的方式為same，代表卷積輸出的時間長度與輸入相同，長度的縮減交由MaxPool1D處理。activation function取的是leaky relu：因為我們這裡取的音檔資訊會有很多高低起伏，若特徵值未達一定高度，可以合理懷疑它是不重要的聲音資訊，輸出就會被大幅壓低（與relu直接輸出0不同，leaky relu在負值區仍保留一個很小的斜率）。最後一層我們用sigmoid使每個類別的輸出為0到1的數值，我們會挑選在三個中數值最大的為它的預測。

In [None]:
class net(tf.keras.models.Model):
    """1-D convolutional classifier producing three per-class scores.

    Three stages of two ``Conv1D`` layers (512, 256 and 128 filters with
    kernel sizes 9, 6 and 3, ``padding='same'``) are each followed by
    ``MaxPool1D(3)``; the second and third stage apply ``Dropout(0.3)``
    before pooling. The result is flattened and passed through
    ``Dense(1024)`` and ``Dense(256)``, each followed by ``Dropout(0.5)``,
    and a final ``Dense(3)``. Hidden layers use leaky ReLU and the output
    goes through a sigmoid, so ``call`` returns activations between 0 and 1,
    not logits.
    """

    def __init__(self):
        # Zero-argument super() keeps working even after the module-level
        # name `net` has been rebound to an instance of this class.
        super().__init__()
        self.net0_conv1 = tf.keras.layers.Conv1D(512, 9, padding='same')
        self.net0_conv2 = tf.keras.layers.Conv1D(512, 9, padding='same')
        self.net1_conv1 = tf.keras.layers.Conv1D(256, 6, padding='same')
        self.net1_conv2 = tf.keras.layers.Conv1D(256, 6, padding='same')
        self.net2_conv1 = tf.keras.layers.Conv1D(128, 3, padding='same')
        self.net2_conv2 = tf.keras.layers.Conv1D(128, 3, padding='same')

        self.net0_mp = tf.keras.layers.MaxPool1D(3)
        self.net1_mp = tf.keras.layers.MaxPool1D(3)
        self.net2_mp = tf.keras.layers.MaxPool1D(3)

        self.net1_do = tf.keras.layers.Dropout(0.3)
        self.net2_do = tf.keras.layers.Dropout(0.3)
        self.net4_do = tf.keras.layers.Dropout(0.5)
        self.net5_do = tf.keras.layers.Dropout(0.5)

        self.net4_flat = tf.keras.layers.Flatten()
        self.net4_den = tf.keras.layers.Dense(1024)
        self.net5_den = tf.keras.layers.Dense(256)
        self.net6_den = tf.keras.layers.Dense(3)

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of feature sequences accepted by ``Conv1D``.
            training: forwarded to every ``Dropout`` layer. ``True`` enables
                dropout, ``False`` disables it, ``None`` (default) leaves the
                decision to Keras, which matches the previous behaviour.

        Returns:
            Tensor with 3 values per example, each passed through a sigmoid.
        """
        net0 = self.net0_conv1(inputs)
        net0 = tf.nn.leaky_relu(net0)
        net0 = self.net0_conv2(net0)
        net0 = tf.nn.leaky_relu(net0)
        net0 = self.net0_mp(net0)

        net1 = self.net1_conv1(net0)
        net1 = tf.nn.leaky_relu(net1)
        net1 = self.net1_conv2(net1)
        net1 = tf.nn.leaky_relu(net1)
        net1 = self.net1_do(net1, training=training)
        net1 = self.net1_mp(net1)

        net2 = self.net2_conv1(net1)
        net2 = tf.nn.leaky_relu(net2)
        net2 = self.net2_conv2(net2)
        net2 = tf.nn.leaky_relu(net2)
        net2 = self.net2_do(net2, training=training)
        net2 = self.net2_mp(net2)

        net4 = self.net4_flat(net2)
        net4 = self.net4_den(net4)
        net4 = tf.nn.leaky_relu(net4)
        net4 = self.net4_do(net4, training=training)

        net5 = self.net5_den(net4)
        net5 = tf.nn.leaky_relu(net5)
        net5 = self.net5_do(net5, training=training)

        net6 = self.net6_den(net5)
        outputs = tf.nn.sigmoid(net6)

        return outputs

因為我們是考慮多類別的分類問題，因此在損失函數上也是考慮多類別的。

In [None]:
# The model's output is already passed through a sigmoid, i.e. it is not a
# logits tensor. With from_logits=True the loss would apply a softmax on top of
# values confined to (0, 1), which caps how confident a prediction can become.
# With from_logits=False Keras treats the outputs as (unnormalised)
# probabilities and rescales each row to sum to 1 before taking the log.
cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
def total_loss(true_answer,pred_answer):
    """Categorical cross-entropy between one-hot labels and model outputs.

    Args:
        true_answer: one-hot labels; cast to float32 before use.
        pred_answer: per-class scores between 0 and 1 as returned by the model.

    Returns:
        Scalar loss tensor (the loss object's default reduction over the batch).
    """
    true_answer = tf.cast(true_answer,dtype = tf.float32)
    return cross_entropy(true_answer, pred_answer)

In [None]:
# NOTE: this rebinds the name `net` from the class to an instance, so this cell
# cannot be re-run without first re-running the class definition cell.
net = net()
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-04)

In [None]:
# Resume from the most recent checkpoint in PATH_CKPTS, if there is one.
last_ckp = tf.train.latest_checkpoint(PATH_CKPTS)
start_epoch = 0

if last_ckp:
    ckpt = tf.train.Checkpoint(optimizer=optimizer, net=net)
    ckpt.restore(last_ckp)
    # The text after the last "-" in the checkpoint path is read as the epoch number.
    start_epoch = int(last_ckp.split("-")[-1])

print(f'Resume training from epoch {start_epoch}') 

In [None]:
# Checkpoint object tracking both the optimizer and the model.
ckpt = tf.train.Checkpoint(optimizer=optimizer, net=net)

# max_to_keep=EPOCHS, so a checkpoint from every epoch of a full run is retained.
manager = tf.train.CheckpointManager(ckpt, PATH_CKPTS, max_to_keep=EPOCHS,
                                     checkpoint_name=checkpoint_name)

In [None]:
@tf.function
def train_step(audio, label):
    """Run one optimisation step on a batch and return its loss.

    Args:
        audio: batch of input features.
        label: batch of one-hot labels.

    Returns:
        The loss tensor computed by ``total_loss`` for this batch.
    """
    with tf.GradientTape() as tape:
        # training=True is what switches the model's Dropout layers on; in a
        # hand-written training loop Keras otherwise runs them in inference mode.
        pred_label = net(audio, training=True)
        # loss
        loss = total_loss(label, pred_label)

    grads = tape.gradient(loss, net.trainable_variables)
    optimizer.apply_gradients(zip(grads, net.trainable_variables))

    return loss

In [None]:
@tf.function
def test_step(audio):
    """Forward one batch through the global ``net`` and return its output."""
    return net(audio)

In [None]:
def val(dataset):
    """Collect model outputs for every batch of a labelled dataset.

    The labels yielded by ``dataset`` are ignored. Per-batch outputs are
    converted to Python lists and concatenated, then returned as one array.

    NOTE: defining this function rebinds the module-level name ``val``, which
    earlier in the notebook holds the validation DataFrame.
    """
    rows = [
        row
        for audio, _ in dataset
        for row in test_step(audio).numpy().tolist()
    ]
    return np.asarray(rows)

def cut_thres(y_pred):
    """Turn each row of scores into a one-hot row marking its largest entry.

    When several entries share the maximum, only the first one is marked, so
    every row contains exactly one 1. This is the same tie rule that
    ``pred_value`` applies, and it keeps tied rows comparable with one-hot
    labels (a row with several 1s could never equal a label).

    Args:
        y_pred: sequence of 1-D score rows (array or nested lists).

    Returns:
        Integer array with one one-hot row per input row.
    """
    pred = []
    for row in y_pred:
        scores = np.asarray(row)
        one_hot = np.zeros(scores.shape, dtype=int)
        one_hot[np.argmax(scores)] = 1
        pred.append(one_hot)
    return np.asarray(pred)

def cal_acc(y_true, y_pred):
    """Return the fraction of predictions whose row equals the label row.

    ``y_pred[i]`` is converted with ``.tolist()`` and compared for equality
    with ``y_true[i]``; the mean of the resulting 0/1 hits is returned.
    """
    hits = [
        int(y_true[position] == predicted.tolist())
        for position, predicted in enumerate(y_pred)
    ]
    return sum(hits) / len(hits)

# Train

In [None]:
# Per-epoch history: mean training loss and validation accuracy.
loss_hist = []
val_acc = []
# Continue from the restored epoch (0 when no checkpoint was found).
for i in range(start_epoch, EPOCHS):
    epoch = i + 1
    t_loss = []
    start = time.time()
    
    # One pass over the training set, collecting the loss of every batch.
    for audio, label in ds_train:
        tmp_loss = train_step(audio, label)
        t_loss.append(tmp_loss)
        
    print("\nLoss: {}".format(np.mean(t_loss)))
    
    # `val` is the prediction function here, not the validation DataFrame.
    val_pred = val(ds_val)
    pred = cut_thres(val_pred)
    val_accuracy = cal_acc(val_label, pred)
    val_acc.append(val_accuracy)
    print("Validation Acc is {:.4f} ".format(val_accuracy))
    
    print('Time for epoch {} is {:.4f} sec'.format(epoch, time.time()-start))
    
    # Save a checkpoint numbered with the 1-based epoch.
    manager.save(checkpoint_number=epoch)
    loss_hist.append(np.mean(t_loss))

# Test

In [None]:
# Position of the highest validation accuracy within val_acc, as a plain Python int.
# NOTE(review): val_acc only covers epochs run in this session, so after resuming
# from a checkpoint this position is not the absolute epoch number.
index = np.array(tf.math.argmax(val_acc)).tolist()

In [None]:
# Display the best validation accuracy as the cell output.
val_acc[index]

In [None]:
# Restore one specific, hard-coded checkpoint for prediction; the trailing comment
# shows the alternative of deriving the name from `index`.
last_ckp =  PATH_CKPTS + "test2_3-124" # PATH_CKPTS + checkpoint_name + '-' + str(index+1)
start_epoch = 0

# last_ckp is a non-empty string here, so this branch always runs.
if last_ckp:
    ckpt = tf.train.Checkpoint(optimizer=optimizer, net=net)
    ckpt.restore(last_ckp)
    # The text after the last "-" in the checkpoint path is read as the epoch number.
    start_epoch = int(last_ckp.split("-")[-1])

# NOTE(review): the message says "Resume training" although this cell sits in the Test section.
print(f'Resume training from epoch {start_epoch}') 

In [None]:
def testing(dataset):
    """Collect model outputs for every batch of an unlabelled dataset.

    Per-batch outputs are converted to Python lists and concatenated, then
    returned as one array.
    """
    rows = [
        row
        for audio in dataset
        for row in test_step(audio).numpy().tolist()
    ]
    return np.asarray(rows)

def pred_value(y_pred):
    """Map each row of scores to a 1-based class number.

    The class is the position of the first entry equal to the row maximum,
    plus one.
    """
    classes = [
        np.where(y_pred[row] == np.max(y_pred[row]))[0][0] + 1
        for row in range(len(y_pred))
    ]
    return np.asarray(classes)

def output(pred):
    """Wrap predictions in a DataFrame with columns ``id`` and ``n_cues``.

    ``id`` counts from 0 in the order of ``pred``; ``n_cues`` holds ``pred``.
    """
    columns = {"id": range(len(pred)), "n_cues": pred}
    return pd.DataFrame(columns)

In [None]:
# Model outputs for the clips to predict, then converted to class numbers 1..3.
te_pred = testing(ds_te)
predict = pred_value(te_pred)

In [None]:
# Write the predictions (columns id, n_cues) to a CSV inside PATH_TO_DATA, without the index column.
output(predict).to_csv(PATH_TO_DATA + "test2_3.csv", index=False)