# 載入函式庫 & 掛載雲端硬碟

In [None]:
import os
from google.colab import drive

import copy
import numpy
import numpy as np
import pandas as pd
import librosa

import tensorflow as tf
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, LayerNormalization
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import datasets, layers, models

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

import scipy.io.wavfile
from scipy.fftpack import dct

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')

# Switch to the project folder; the relative paths used later ('./data/...',
# the saved '.h5' models, the submission CSV) resolve against it.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Voice-Diease/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 載入訓練資料

In [None]:
# Read the training data list (CSV) into a DataFrame.
source_df = pd.read_csv('./data/training_datalist.csv')

print("source_df.shape :", source_df.shape)
print("source_df.columns :", source_df.columns)

source_df.shape : (1000, 28)
source_df.columns : Index(['ID', 'Sex', 'Age', 'Disease category', 'Narrow pitch range',
       'Decreased volume', 'Fatigue', 'Dryness', 'Lumping', 'heartburn',
       'Choking', 'Eye dryness', 'PND', 'Smoking', 'PPD', 'Drinking',
       'frequency', 'Diurnal pattern', 'Onset of dysphonia ', 'Noise at work',
       'Occupational vocal demand', 'Diabetes', 'Hypertension', 'CAD',
       'Head and Neck Cancer', 'Head injury', 'CVA',
       'Voice handicap index - 10'],
      dtype='object')


## 切分訓練與驗證資料

In [None]:
# Per-class weights keyed by zero-based class index; passed to both the
# logistic regression and the Keras `fit` call.
# NOTE(review): the values look like inverse class-frequency ratios relative
# to class 0 -- confirm against the label distribution.
class_weight = {
    0: 1.0,
    1: 2.436,
    2: 3.19,
    3: 12.182,
    4: 16.75,
}

In [None]:
# Hold out 15% of the rows for validation; the fixed random_state keeps the
# split reproducible.
# NOTE(review): the split is not stratified on 'Disease category' -- confirm
# that is intended given the very unequal class weights.
training_df, validate_df = train_test_split(source_df, test_size=0.15, random_state=333)

# The second label reads "test_df" but the value printed is validate_df.shape.
print("training_df shape :", training_df.shape, ", test_df shape :", validate_df.shape)

training_df shape : (850, 28) , test_df shape : (150, 28)


# 資料處理

## 文字資料前處理

In [None]:
def medical_data_proccessing(input_df):
    """Return a rescaled copy of the questionnaire table.

    The input frame is left untouched. On the copy, 'Sex' is shifted down by
    one, missing 'PPD' and 'Voice handicap index - 10' values become 0, and a
    fixed set of columns is divided by a hard-coded constant.

    Args:
        input_df: DataFrame containing at least the columns touched below.

    Returns:
        A new DataFrame with the same columns, rescaled.
    """
    df = copy.deepcopy(input_df)

    # Shift the sex code down by one (the original comment describes this as
    # encoding sex as 0/1).
    df['Sex'] = df['Sex'] - 1

    # Missing answers in these two columns are treated as zero.
    for column in ('PPD', 'Voice handicap index - 10'):
        df[column] = df[column].fillna(0)

    # Fixed divisors used to shrink the larger-valued columns.
    # NOTE(review): the constants are hard-coded rather than derived from the
    # training data (the original author flagged this as an open problem), and
    # the PPD divisor was marked as temporary.
    divisors = {
        'Age': 100,
        'Voice handicap index - 10': 40,
        'Occupational vocal demand': 4,
        'Diurnal pattern': 4,
        'Noise at work': 3,
        'Onset of dysphonia ': 5,
        'frequency': 3,
        'Drinking': 2,
        'Smoking': 3,
        'PPD': 3,
    }
    for column, divisor in divisors.items():
        df[column] = df[column] / divisor

    return df

## 音訊資料前處理

In [None]:
def MFCCs(filename, second = 1):
  """Compute mean-normalised log Mel filter-bank energies for a WAV file.

  Despite the name, no DCT is applied, so the result is the log filter-bank
  matrix rather than cepstral coefficients.

  Args:
    filename: Path of the WAV file to read.
    second: Clip length in seconds. Shorter recordings are zero-padded at the
      end, longer ones are truncated.

  Returns:
    Array of shape (num_frames, 40): filter-bank energies in dB with the
    per-filter mean over frames subtracted.
  """
  sample_rate, raw_signal = scipy.io.wavfile.read(filename)

  # Multi-channel files come back as (samples, channels); average the channels
  # so the rest of the pipeline always sees a 1-D signal.
  if raw_signal.ndim > 1:
    raw_signal = raw_signal.mean(axis=1)

  # Fix the clip to `second` seconds using the file's own sample rate. The
  # length used to be hard-coded as second * 44100, which is only correct for
  # 44.1 kHz files and made the frame count depend on the sample rate.
  target_length = int(second * sample_rate)
  signal = np.zeros(target_length)
  usable = min(len(raw_signal), target_length)
  signal[:usable] = raw_signal[:usable]

  # Framing: 25 ms windows every 10 ms, converted from seconds to samples.
  frame_size = 0.025
  frame_stride = 0.01
  frame_length = int(round(frame_size * sample_rate))
  frame_step = int(round(frame_stride * sample_rate))
  signal_length = len(signal)
  num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))

  # Zero-pad the tail so every frame has frame_length samples and no sample of
  # the clip is dropped.
  pad_signal_length = num_frames * frame_step + frame_length
  pad_signal = numpy.append(signal, numpy.zeros(pad_signal_length - signal_length))

  # Row i of `indices` holds the sample positions of frame i.
  indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
  frames = pad_signal[indices.astype(numpy.int32, copy=False)]

  # Hamming window applied to every frame.
  frames *= numpy.hamming(frame_length)

  # Magnitude spectrum and power spectrum.
  # NOTE(review): when frame_length > NFFT (about 1102 samples at 44.1 kHz)
  # rfft crops each frame to its first NFFT samples. Kept as-is because
  # changing it alters the features; confirm before raising NFFT.
  NFFT = 512
  mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))
  pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))

  # Triangular Mel filter bank: 40 filters equally spaced on the Mel scale
  # between 0 Hz and the Nyquist frequency.
  nfilt = 40
  low_freq_mel = 0
  high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))  # Hz -> Mel
  mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
  hz_points = (700 * (10**(mel_points / 2595) - 1))  # Mel -> Hz
  # FFT bin index of every filter edge (named bin_edges so the builtin `bin`
  # is not shadowed).
  bin_edges = numpy.floor((NFFT + 1) * hz_points / sample_rate)

  fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
  for m in range(1, nfilt + 1):
    left = int(bin_edges[m - 1])
    center = int(bin_edges[m])
    right = int(bin_edges[m + 1])

    # Rising slope from the left edge to the centre.
    for k in range(left, center):
      fbank[m - 1, k] = (k - bin_edges[m - 1]) / (bin_edges[m] - bin_edges[m - 1])
    # Falling slope from the centre to the right edge.
    for k in range(center, right):
      fbank[m - 1, k] = (bin_edges[m + 1] - k) / (bin_edges[m + 1] - bin_edges[m])

  filter_banks = numpy.dot(pow_frames, fbank.T)
  # Replace exact zeros so the logarithm below stays finite.
  filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)
  filter_banks = 20 * numpy.log10(filter_banks)  # dB
  # Subtract each filter's mean over all frames.
  filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8)

  return filter_banks

In [None]:
def audioData_preprocessing(input_df, file_path, second=1, self=0):
  """Extract audio features for every sample listed in a table.

  Args:
    input_df: DataFrame with an `ID` column; each ID names a WAV file.
    file_path: Prefix prepended to "<ID>.wav" to build each file name.
    second: Clip length forwarded to `MFCCs`.
    self: Not used by this function; kept so existing calls keep working.

  Returns:
    `np.array` built from the per-file feature matrices, in row order.
  """
  features = [
      MFCCs(file_path + "{}.wav".format(sample_id), second)
      for sample_id in input_df.ID.tolist()
  ]
  return np.array(features)

## 訓練資料讀取

In [None]:
# Questionnaire features: rescale the columns, then drop the label and the sample ID.
x_train_text = medical_data_proccessing(training_df).drop(['Disease category','ID'], axis=1).to_numpy()
x_val_text = medical_data_proccessing(validate_df).drop(['Disease category','ID'], axis=1).to_numpy()

# Audio features extracted with the default 1-second clip length.
x_train_audio = audioData_preprocessing(training_df, file_path = "./data/training_data/")
x_val_audio = audioData_preprocessing(validate_df, file_path = "./data/training_data/")

# One-hot targets indexed directly by class number (labels run 1..5, hence the
# "- 1"). This keeps the column order fixed even when a split happens to miss a
# class; slicing the last five columns of get_dummies() would silently pick up
# a feature column in that case.
y_train = np.eye(5, dtype='float32')[training_df['Disease category'].to_numpy().astype(int) - 1]
y_val = np.eye(5, dtype='float32')[validate_df['Disease category'].to_numpy().astype(int) - 1]

print(x_train_text.shape, x_train_audio.shape, y_train.shape)
print(x_val_text.shape, x_val_audio.shape, y_val.shape)

(850, 26) (850, 101, 39) (850, 5)
(150, 26) (150, 101, 39) (150, 5)


In [None]:
# Early-fusion input: questionnaire features followed by the flattened 1-second
# audio features. The row count comes from the array itself instead of the
# hard-coded 850 / 150, so the cell keeps working if the split size changes.
x_train_multi = np.concatenate((x_train_text, x_train_audio.reshape(len(x_train_audio), -1)), axis = 1)
x_val_multi = np.concatenate((x_val_text, x_val_audio.reshape(len(x_val_audio), -1)), axis = 1)

# Re-extract the audio features with 3.5-second clips. This overwrites the
# 1-second arrays used above. The `self` argument is accepted but not used by
# audioData_preprocessing.
x_train_audio = audioData_preprocessing(training_df, file_path = "./data/training_data/", second = 3.5, self=1)
x_val_audio = audioData_preprocessing(validate_df, file_path = "./data/training_data/", second = 3.5, self=1)

# Traditional Model

In [None]:
# Validation labels shifted by -1 so they line up with the zero-based class
# indices the models predict.
y_true = validate_df['Disease category'] - 1

In [None]:
# Class-weighted logistic regression trained on the questionnaire features only.
clf = LogisticRegression(class_weight=class_weight, max_iter=1000).fit(
    x_train_text, y_train.argmax(axis=1))
y_pred = clf.predict(x_val_text)

# Fraction of correct predictions on the training and validation splits.
print(np.mean(clf.predict(x_train_text) == y_train.argmax(axis=1)))
print(np.mean(y_pred == y_val.argmax(axis=1)))

# Per-class recall on the validation split; its mean is reported as the UAR.
results_recall = recall_score(y_true, y_pred, average=None)
print("Test UAR(Unweighted Average Recall) :", results_recall.mean())

# Intermediate Fusion

In [None]:
# Load previously saved Keras models from the working directory.
# NOTE(review): the numbers in the file names presumably record each model's
# validation score -- confirm.
audio_model = load_model("audio_model(0.53).h5")
# text_model is only referenced in commented-out code further down; the text
# predictions used below come from the logistic regression `clf`.
text_model = load_model("text_model.h5")
multi_model = load_model("multi_model_600*50(0.33).h5")

In [None]:
# Zero-based validation labels.
y_true = validate_df['Disease category'] - 1
# Turn each model's predicted class index into a one-hot row (np.eye(5)[idx]),
# so the three predictions can later be combined as weighted votes.
text_pred = np.eye(5)[clf.predict(x_val_text)]
audio_pred = np.eye(5)[audio_model.predict(x_val_audio).argmax(axis=1)]
multi_pred = np.eye(5)[multi_model.predict(x_val_multi).argmax(axis=1)]

# Mean of the per-class recalls (UAR) of each base model on the validation split.
text_recall = recall_score(y_true, text_pred.argmax(axis=1), average=None).mean()
audio_recall = recall_score(y_true, audio_pred.argmax(axis=1), average=None).mean()
multi_recall = recall_score(y_true, multi_pred.argmax(axis=1), average=None).mean()
print(text_recall)
print(audio_recall)
print(multi_recall)

0.6846882660836149
0.5308619890015238
0.33450606241303915


In [None]:
def create_finalModel(input_shape, neurons = 20, hidden_layers = 3, learning_rate = 0.0001, verbose=0):
    """Build and compile a small dense classifier with five softmax outputs.

    Args:
        input_shape: Number of input features (passed as `input_dim`).
        neurons: Width of every hidden Dense layer.
        hidden_layers: Total number of hidden Dense blocks, including the
            first one.
        learning_rate: Learning rate of the Adam optimizer.
        verbose: When truthy, print the model summary.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()

    # First hidden block: Dense -> Dropout -> LayerNormalization.
    first_block = (
        Dense(neurons, input_dim=input_shape, activation='relu'),
        layers.Dropout(0.5),
        LayerNormalization(),
    )
    for layer in first_block:
        model.add(layer)

    # Remaining hidden blocks: Dense -> Dropout -> BatchNormalization.
    # NOTE(review): the first block normalises with LayerNormalization and the
    # others with BatchNormalization -- confirm the mix is intentional.
    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation='relu'))
        model.add(layers.Dropout(0.5))
        model.add(BatchNormalization())

    # Five-way softmax output.
    model.add(Dense(5, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )

    if verbose:
        model.summary()

    return model

# NOTE(review): final_train_x is assigned in a later cell (the stacked
# base-model predictions). On a fresh kernel that cell must run first,
# otherwise this line raises NameError.
final_model = create_finalModel(input_shape = final_train_x.shape[1])
print(final_train_x.shape)

(850, 3)


In [None]:

# Stacking features: one column per base model, each holding that model's
# predicted class index (logistic regression, audio model, early-fusion model).
final_train_x = np.column_stack((
    clf.predict(x_train_text),
    audio_model.predict(x_train_audio).argmax(axis=1),
    multi_model.predict(x_train_multi).argmax(axis=1),
))
final_val_x = np.column_stack((
    clf.predict(x_val_text),
    audio_model.predict(x_val_audio).argmax(axis=1),
    multi_model.predict(x_val_multi).argmax(axis=1),
))




In [None]:
# Train the stacking network on the base-model predictions with the per-class
# weights. The checkpoint callback writes "final_model.h5" only when
# val_accuracy improves (save_best_only=True).
final_model.fit(final_train_x, y_train, batch_size=256, epochs=3000, class_weight = class_weight,
                callbacks=[ModelCheckpoint("final_model.h5", save_best_only=True, monitor='val_accuracy')],
                validation_data=(final_val_x, y_val)
                )

In [None]:
# Late fusion by weighted hard voting: each base model contributes its one-hot
# prediction scaled by a fixed weight, and the class with the largest total wins.
# NOTE(review): the weights 4.9 / 2.4 / 2.6 look hand-tuned -- confirm their origin.
#
# Alternatives tried earlier and not used for the reported result:
#   - the stacking network: final_model.predict(final_val_x).argmax(axis=1)
#   - summing the soft outputs of text_model, audio_model and multi_model
#   - votes weighted by each model's validation UAR (text_recall, audio_recall,
#     multi_recall); this one used to be computed here and was immediately
#     overwritten, so the dead assignment has been removed.
y_pred = (text_pred * 4.9 + audio_pred * 2.4 + multi_pred * 2.6).argmax(axis=1)

# Per-class recall of the fused prediction on the validation split; the mean is the UAR.
results_recall = recall_score(y_true, y_pred, average=None)
print("Test UAR(Unweighted Average Recall) :", results_recall.mean())
ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred)).plot(cmap='Blues')


# Submission

In [None]:
# Public test set: read the data list, extract 1-second audio features and
# rescale the questionnaire columns. Only 'ID' is dropped here.
# NOTE(review): presumably the test lists carry no 'Disease category' column -- confirm.
test_df_pb = pd.read_csv('./data/Public Testing Dataset/test_datalist_public.csv')
x_test_audio_pb = audioData_preprocessing(test_df_pb, file_path = "./data/Public Testing Dataset/test_data_public/")
x_test_text_pb = medical_data_proccessing(test_df_pb).drop(['ID'], axis=1).to_numpy()

In [None]:
# Private test set: same steps as for the public set (data list, 1-second audio
# features, rescaled questionnaire columns with 'ID' dropped).
test_df_pv = pd.read_csv('./data/Private Testing Dataset/test_datalist_private.csv')
x_test_audio_pv = audioData_preprocessing(test_df_pv, file_path = "./data/Private Testing Dataset/test_data_private/")
x_test_text_pv = medical_data_proccessing(test_df_pv).drop(['ID'], axis=1).to_numpy()

In [None]:
# Stack the public rows first, then the private rows, for the table and both
# feature arrays so that row i refers to the same sample everywhere.
# The concatenated frame keeps each part's original index labels.
test_df = pd.concat([test_df_pb, test_df_pv])
x_test_audio = np.concatenate((x_test_audio_pb, x_test_audio_pv))
x_test_text = np.concatenate((x_test_text_pb, x_test_text_pv)) 

In [None]:
# Early-fusion test input: questionnaire features followed by the flattened
# 1-second audio features. The row count is taken from the array instead of the
# hard-coded 1000, so a different test-set size no longer breaks the reshape.
x_test_multi = np.concatenate((x_test_text, x_test_audio.reshape(len(x_test_audio), -1)), axis = 1)

# Re-extract the audio features with 3.5-second clips and rebuild x_test_audio
# in public-then-private order. The `self` argument is accepted but not used by
# audioData_preprocessing.
x_test_audio_pv = audioData_preprocessing(test_df_pv, file_path = "./data/Private Testing Dataset/test_data_private/", second = 3.5, self=1)
x_test_audio_pb = audioData_preprocessing(test_df_pb, file_path = "./data/Public Testing Dataset/test_data_public/", second = 3.5, self=1)
x_test_audio = np.concatenate((x_test_audio_pb, x_test_audio_pv))

In [None]:
# NOTE(review): y_true is rebuilt from the validation split but is not used in
# this cell; the test rows are only predicted, not scored.
y_true = validate_df['Disease category'] - 1
# One-hot encoded class predictions of each base model on the test set.
text_pred = np.eye(5)[clf.predict(x_test_text)]
audio_pred = np.eye(5)[audio_model.predict(x_test_audio).argmax(axis=1)]
multi_pred = np.eye(5)[multi_model.predict(x_test_multi).argmax(axis=1)]

# Submit the logistic-regression (questionnaire) prediction only: the audio and
# early-fusion votes are multiplied by 0, so they do not affect the argmax.
y_pred = (text_pred * text_recall + audio_pred * 0 + multi_pred * 0).argmax(axis=1)

# Convert back to 1-based class labels for the submission file.
my_submission = pd.DataFrame({'Id': test_df.ID, 'category': y_pred + 1})
# Any file name works; the CSV is written without index and without header row.
my_submission.to_csv('submission8.csv', index=False, header=False)

