[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/v-artur/Golden_Oreos/blob/main/Modeling2.ipynb)

Install some of the dependencies

In [1]:
# if mcd does not work, fastdtw and pystpk are not needed
!pip install fastdtw
!pip install pysptk
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pysptk
  Downloading pysptk-0.2.0.tar.gz (276 kB)
[K     |████████████████████████████████| 276 kB 5.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pysptk
  Building wheel for pysptk (PEP 517) ... [?25l[?25hdone
  Created wheel for pysptk: filename=pysptk-0.2.0-cp37-cp37m-linux_x86_64.whl size=931381 sha256=ca8ac1c9f4d2620607ba2562aa93adc777171929c877293246dd4b48ca6e83eb
  Stored in directory: /root/.cache/pip/wheels/9a/70/a1/757bd6c0017f384831e6260a784f10ff6d7998a805719f9a2d
Successfully built pysptk
Installing collected packages: pysptk
Successfully installed pysptk-0.2.0
Looking in indexes: https://pypi.org/s

Import files from Google drive

In [4]:
#the data
!gdown https://drive.google.com/uc?id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
#reconstruction module
!gdown https://drive.google.com/u/0/uc?id=1KYJD7INBZJ4Wip_ok6nGaJNGHZOCl8ra
#Mel filterbank applier
!gdown https://drive.google.com/u/0/uc?id=16XwS13GUAa7HuJv1Yi1XP6lQm0Gz42Ho

#extract the zip
import zipfile
zip_ref = zipfile.ZipFile("/content/features.zip", 'r')
zip_ref.extractall("/content/features")
zip_ref.close()

#creating a folder for the synthesized audio
!mkdir synth_audio

Downloading...
From: https://drive.google.com/uc?id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
To: /content/features.zip
100% 2.14G/2.14G [00:18<00:00, 117MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?id=1KYJD7INBZJ4Wip_ok6nGaJNGHZOCl8ra
To: /content/reconstructWave.py
100% 3.02k/3.02k [00:00<00:00, 4.78MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?id=16XwS13GUAa7HuJv1Yi1XP6lQm0Gz42Ho
To: /content/MelFilterBank.py
100% 2.87k/2.87k [00:00<00:00, 4.71MB/s]


Necessary models and functions

In [63]:
import tensorflow as tf
from tensorflow.keras import regularizers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, GRU, Conv1D, MaxPooling1D, Flatten, TimeDistributed
import numpy as np


#Bottleneck FC-DNN

def create_bottleneck_model(inputsize, outputsize):
    """Build the (uncompiled) bottleneck fully-connected network.

    The network consists of ReLU Dense layers with 128, 64, 16, 4 and 16 units
    (He-normal initialised), each followed by Dropout(0.2), and a final Dense
    layer without activation.

    Args:
        inputsize: number of input features per sample.
        outputsize: number of units of the output layer.

    Returns:
        The assembled tf.keras.Sequential model.
    """
    model = tf.keras.Sequential()
    # shape has to be a tuple: "(inputsize)" is a plain int, "(inputsize,)" is a 1-tuple
    model.add(tf.keras.layers.Input(shape=(inputsize,)))
    # hidden widths narrow down to the 4-unit bottleneck and widen again
    for units in (128, 64, 16, 4, 16):
        model.add(tf.keras.layers.Dense(units, activation="relu", kernel_initializer='HeNormal'))
        model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(outputsize))
    return model

#Normal FC-DNN

def create_dnn_model(inputsize, outputsize):
    """Build the (uncompiled) plain fully-connected network.

    The network consists of ReLU Dense layers with 256, 128 and 64 units
    (He-normal initialised), each followed by Dropout(0.2), and a final Dense
    layer without activation.

    Args:
        inputsize: number of input features per sample.
        outputsize: number of units of the output layer.

    Returns:
        The assembled tf.keras.Sequential model.
    """
    model = tf.keras.Sequential()
    # shape has to be a tuple: "(inputsize)" is a plain int, "(inputsize,)" is a 1-tuple
    model.add(tf.keras.layers.Input(shape=(inputsize,)))
    for units in (256, 128, 64):
        model.add(tf.keras.layers.Dense(units, activation="relu", kernel_initializer='HeNormal'))
        model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(outputsize))
    return model


#LSTM/GRU Network

def create_LSTM_model(inputsize, outputsize):
  """Build the (uncompiled) two-layer LSTM network.

  Two stacked LSTM layers (300 units, dropout 0.2, full sequences returned)
  are followed by Flatten and a Dense output layer without activation.

  Args:
    inputsize: length of the input sequence; every step carries one value.
    outputsize: number of units of the output layer.

  Returns:
    The assembled tf.keras.Sequential model.
  """
  model = tf.keras.Sequential()
  model.add(LSTM(units=300, dropout=0.2, return_sequences=True, input_shape=(inputsize,1)))
  # only the first layer declares the input shape; this one gets its input
  # from the layer above
  model.add(LSTM(units=300, dropout=0.2, return_sequences=True))
  model.add(Flatten())
  model.add(Dense(outputsize))
  return model


#CNN network

def create_cnn_model(rows_per_feature, cols_per_feature, outputsize):
  """Build the (uncompiled) 1-D convolutional network.

  Two Conv1D/MaxPooling1D stages (10 filters each, kernel sizes 8 and 20,
  pool size 2) are followed by Flatten and a linear Dense output layer.

  Args:
    rows_per_feature: first dimension of one input sample.
    cols_per_feature: second dimension of one input sample.
    outputsize: number of units of the output layer.

  Returns:
    The assembled tf.keras.Sequential model.
  """
  sample_shape = (rows_per_feature, cols_per_feature)
  layer_stack = [
      Conv1D(10, 8, activation="relu", input_shape=sample_shape),
      MaxPooling1D(pool_size=2),
      Conv1D(10, 20, activation="relu"),
      MaxPooling1D(pool_size=2),
      Flatten(),
      Dense(units=outputsize, activation='linear'),
  ]
  return tf.keras.Sequential(layer_stack)


#Function to make a 2D object out of the EEG featurevector with sliding window
#windowsize is minimum 10 and multiple of 2.
def cnnize(data, window):
  """Cut every feature vector into overlapping windows with a stride of 2.

  Args:
    data: 2-D array-like with one feature vector per row.
    window: number of consecutive features in one window.

  Returns:
    float64 array of shape
    (n_samples, (n_features - window) // 2 + 1, window), where entry [s, j]
    equals data[s, 2*j : 2*j + window].

  Raises:
    ValueError: if window is larger than the number of features.
  """
  data = np.asarray(data)
  n_features = data.shape[1]
  if window > n_features:
    raise ValueError(
        "window (%d) must not exceed the number of features (%d)" % (window, n_features))
  # start offset of every window: 0, 2, 4, ...
  starts = np.arange(0, n_features - window + 1, 2)
  # column_index[j, k] is the feature index of element k in window j
  column_index = starts[:, None] + np.arange(window)[None, :]
  # one gather over all samples instead of a Python loop per sample
  return data[:, column_index].astype(np.float64, copy=False)


#Reconstructing the WAV file from the predicted mel-log spectrogram (Griffin-Lim method)

def createAudio(spectrogram, audiosr=16000, winLength=0.05, frameshift=0.01):
    """Synthesize an int16 waveform from a predicted log-mel spectrogram.

    The spectrogram is mapped back with MelFilterBank.fromLogMels, then
    reconstructed in roughly ten consecutive chunks with
    reconstructWavFromSpectrogram. The chunks are concatenated and the result
    is peak-normalised to the int16 range.

    Args:
        spectrogram: 2-D array, frames along axis 0 and mel bins along axis 1.
        audiosr: audio sampling rate (presumably in Hz).
        winLength: analysis window length (presumably in seconds).
        frameshift: shift between frames (presumably in seconds).

    Returns:
        1-D np.int16 array with the waveform; all zeros if the reconstruction
        is silent.
    """
    mfb = mel.MelFilterBank(int((audiosr*winLength)/2+1), spectrogram.shape[1], audiosr)
    nfolds = 10
    # at least one frame per chunk, otherwise range() below would get a zero step
    hop = max(1, int(spectrogram.shape[0]/nfolds))
    for_reconstruction = mfb.fromLogMels(spectrogram)
    # start with an empty float64 array so the joined signal is float64
    # whatever dtype the reconstruction returns
    chunks = [np.array([])]
    for w in range(0,spectrogram.shape[0],hop):
        spec = for_reconstruction[w:min(w+hop,for_reconstruction.shape[0]),:]
        rec = rW.reconstructWavFromSpectrogram(spec,spec.shape[0]*spec.shape[1],fftsize=int(audiosr*winLength),overlap=int(winLength/frameshift))
        chunks.append(np.ravel(rec))
    rec_audio = np.concatenate(chunks)
    peak = np.max(np.abs(rec_audio)) if rec_audio.size else 0.0
    if peak == 0:
        # nothing to normalise; avoid dividing by zero
        return np.zeros(rec_audio.shape, dtype=np.int16)
    scaled = np.int16(rec_audio/peak * 32767)
    return scaled



In [35]:
#For audio reconstruction and MCD measure 
#dependencies
import reconstructWave as rW
import MelFilterBank as mel
import numpy as np
import librosa
from scipy.io import wavfile
import pysptk
from scipy.spatial.distance import euclidean
import os
from fastdtw import fastdtw

# functions for MCD calculations 
# (source: https://github.com/ttslr/python-MCD?fbclid=IwAR2OFaz3-8kTfhJXC7F-cmTTHkY-egEzZdSYHsC0agwPw58N2G3hqhfdVNY)

natural_folder = '/content/features/'
synth_folder = '/content/synth_audio/' 


def readmgc(filename):
    """Load a 16 kHz WAV file and return the coefficients computed by pysptk.mgcep.

    The signal is cut into frames of 1024 samples (hop 256), every frame is
    multiplied by a Blackman window and passed to pysptk.mgcep.

    Args:
        filename: path of the WAV file; its sampling rate must be 16000.

    Returns:
        Array with one row per frame and order + 1 (= 26) columns.
    """
    # analysis settings
    frame_length = 1024
    hop_length = 256
    order = 25
    alpha = 0.41
    stage = 5
    gamma = -1.0 / stage

    sr, samples = wavfile.read(filename)
    assert sr == 16000
    samples = samples.astype(np.float64)

    # transposed so that every row holds one frame (checked by the assert below)
    framed = librosa.util.frame(samples, frame_length=frame_length, hop_length=hop_length)
    frames = framed.astype(np.float64).T
    frames *= pysptk.blackman(frame_length)
    assert frames.shape[1] == frame_length

    mgc = pysptk.mgcep(frames, order, alpha, gamma).reshape(-1, order + 1)
    print("mgc of {} is ok!".format(filename))
    return mgc


def compute_mcd():
  """Compute, print and return the MCD over all synthesized recordings.

  For every file in synth_folder whose name ends with 'predicted.wav', the
  matching '<prefix>orig_audio.wav' is read from natural_folder. Both
  coefficient sequences are aligned with fastdtw and the frame-wise Euclidean
  distances along the alignment path are accumulated over all files.

  Returns:
    The MCD value as a float, or nan if there was nothing to compare.
  """

  #computational parameters
  _logdb_const = 10.0 / np.log(10.0) * np.sqrt(2.0)
  s = 0.0
  framesTot = 0

  # only the synthesized recordings are processed; other entries of the folder
  # (e.g. checkpoint directories) have no natural counterpart
  files = sorted(name for name in os.listdir(synth_folder) if name.endswith('predicted.wav'))
  for subject in files:
    print("Processing -----------{}".format(subject))

    # the first 7 characters are the subject prefix, e.g. "sub-01_"
    subject_ID = subject[0:7]

    x = readmgc(natural_folder + subject_ID + 'orig_audio.wav')
    y = readmgc(synth_folder + subject_ID + 'predicted.wav')

    # only the alignment path is needed, the DTW distance itself is not used
    _, path = fastdtw(x, y, dist=euclidean)
    pathx = [step[0] for step in path]
    pathy = [step[1] for step in path]
    x, y = x[pathx], y[pathy]

    framesTot += x.shape[0]

    z = x - y
    s += np.sqrt((z * z).sum(-1)).sum()

  if framesTot == 0:
    # avoid the division by zero when the folder holds no synthesized audio
    print("No synthesized audio found in {}".format(synth_folder))
    return float('nan')

  MCD_value = _logdb_const * float(s) / float(framesTot)

  print("MCD = : {:f}".format(MCD_value))
  return MCD_value


In [15]:
#Setting a fixed random seed through Keras so that repeated runs are comparable
tf.keras.utils.set_random_seed(1234)

# 1.) One person model

### Optimizing the bottleneck FC-DNN structure for one person



In [None]:
from scipy.stats import pearsonr
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wavfile



# One-subject experiment (sub-01): bottleneck FC-DNN evaluated with unshuffled
# k-fold cross-validation. The per-fold predictions are collected in rec_spec
# and turned into audio at the end.
data = np.load(r'/content/features/sub-01_feat.npy')
spectrogram = np.load(r'/content/features/sub-01_spec.npy')

#Initial parameters
nfolds = 10
kf = KFold(nfolds,shuffle=False)
pca = PCA()
numComps = 200
val_split = 0.2

#Audio Reconstruction parameters
winLength = 0.05
frameshift = 0.01
audiosr = 16000

#Initialize an empty spectrogram to save the reconstruction to
rec_spec = np.zeros(spectrogram.shape)
#Save the correlation coefficients for each fold (one row per fold, one column per spectral bin)
rs = np.zeros((nfolds,spectrogram.shape[1]))
for k,(train, test) in enumerate(kf.split(data)):
          
    #Train, validation and test data (the validation part is split off the training fold)
    X_train_temp = data[train,:]
    y_train_temp = spectrogram[train,:]
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_split, random_state=0)
    X_test = data[test,:]
    y_test = spectrogram[test,:] # not used below: the fold is scored against spectrogram[test] directly
    
    #Normalization (statistics come from the training part only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train[:] = scaler.transform(X_train)
    X_val[:] = scaler.transform(X_val)
    X_test[:] = scaler.transform(X_test)

    #Fit PCA to training data
    pca.fit(X_train)
    #Transform data: project onto the first numComps principal components
    X_train = np.dot(X_train, pca.components_[:numComps,:].T)
    X_val = np.dot(X_val, pca.components_[:numComps,:].T)
    X_test = np.dot(X_test, pca.components_[:numComps,:].T)

    # Bottleneck model; the checkpoint callback keeps the best weights in weights1.hdf5
    early_stopping=EarlyStopping(patience=25, verbose=1, min_delta=1e-5)
    checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

    model = create_bottleneck_model(numComps, spectrogram.shape[1])
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(X_train, y_train, batch_size=64, 
              epochs=100, verbose=0, validation_data=(X_val, y_val), shuffle=True,
              callbacks=[checkpointer, early_stopping])
            
    #predict with the checkpointed weights of the bottleneck model
    model.load_weights('weights1.hdf5')
    rec_spec[test, :] = model.predict(X_test, verbose=0)

    #Evaluate reconstruction of this fold: Pearson correlation per spectral bin
    for specBin in range(spectrogram.shape[1]):
         r, p = pearsonr(spectrogram[test, specBin], rec_spec[test, specBin])
         rs[k,specBin] = r


#Show evaluation result (mean over all folds and bins)
print('mean correlation', np.mean(rs))

#Make and save the synthesized audio
reconstructedWav = createAudio(rec_spec,audiosr=audiosr,winLength=winLength,frameshift=frameshift)
wavfile.write(os.path.join('/content/synth_audio/sub-01_predicted.wav'),int(audiosr),reconstructedWav)


Epoch 1: val_loss improved from inf to 5.08743, saving model to weights1.hdf5

Epoch 2: val_loss improved from 5.08743 to 3.29884, saving model to weights1.hdf5

Epoch 3: val_loss improved from 3.29884 to 2.55712, saving model to weights1.hdf5

Epoch 4: val_loss improved from 2.55712 to 2.20429, saving model to weights1.hdf5

Epoch 5: val_loss improved from 2.20429 to 2.04845, saving model to weights1.hdf5

Epoch 6: val_loss improved from 2.04845 to 1.91544, saving model to weights1.hdf5

Epoch 7: val_loss improved from 1.91544 to 1.71705, saving model to weights1.hdf5

Epoch 8: val_loss improved from 1.71705 to 1.50765, saving model to weights1.hdf5

Epoch 9: val_loss improved from 1.50765 to 1.50368, saving model to weights1.hdf5

Epoch 10: val_loss did not improve from 1.50368

Epoch 11: val_loss did not improve from 1.50368

Epoch 12: val_loss improved from 1.50368 to 1.47402, saving model to weights1.hdf5

Epoch 13: val_loss did not improve from 1.47402

Epoch 14: val_loss improv

KeyboardInterrupt: ignored

### Optimizing the normal FC-DNN structure for one person


In [64]:
from scipy.stats import pearsonr
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wavfile


# One-subject experiment (sub-01): plain FC-DNN evaluated with unshuffled
# k-fold cross-validation. The per-fold predictions are collected in rec_spec
# and turned into audio at the end.
data = np.load(r'/content/features/sub-01_feat.npy')
spectrogram = np.load(r'/content/features/sub-01_spec.npy')

#Initial parameters for prediction
nfolds = 10
kf = KFold(nfolds,shuffle=False)
pca = PCA()
numComps = 300
val_split = 0.2

#Audio Reconstruction parameters
winLength = 0.05
frameshift = 0.01
audiosr = 16000


#Initialize an empty spectrogram to save the reconstruction to
rec_spec = np.zeros(spectrogram.shape)
#Save the correlation coefficients for each fold (one row per fold, one column per spectral bin)
rs = np.zeros((nfolds,spectrogram.shape[1]))
for k,(train, test) in enumerate(kf.split(data)):
          
    #Train, validation and test data (the validation part is split off the training fold)
    X_train_temp = data[train,:]
    y_train_temp = spectrogram[train,:]
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_split, random_state=0)
    X_test = data[test,:]
    y_test = spectrogram[test,:] # not used below: the fold is scored against spectrogram[test] directly
    
    #Normalization (statistics come from the training part only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train[:] = scaler.transform(X_train)
    X_val[:] = scaler.transform(X_val)
    X_test[:] = scaler.transform(X_test)

    #Fit PCA to training data
    pca.fit(X_train)
    #Transform data: project onto the first numComps principal components
    X_train = np.dot(X_train, pca.components_[:numComps,:].T)
    X_val = np.dot(X_val, pca.components_[:numComps,:].T)
    X_test = np.dot(X_test, pca.components_[:numComps,:].T)

    # normal DNN model; the checkpoint callback keeps the best weights in weights1.hdf5
    early_stopping=EarlyStopping(patience=25, verbose=1, min_delta=1e-5)
    checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

    model = create_dnn_model(numComps, spectrogram.shape[1])
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(X_train, y_train, batch_size=64, 
              epochs=100, verbose=0, validation_data=(X_val, y_val), shuffle=True,
              callbacks=[checkpointer, early_stopping])
            
    #predict with the checkpointed weights of the DNN
    model.load_weights('weights1.hdf5')
    rec_spec[test, :] = model.predict(X_test, verbose=0)

    #Evaluate reconstruction of this fold: Pearson correlation per spectral bin
    for specBin in range(spectrogram.shape[1]):
         r, p = pearsonr(spectrogram[test, specBin], rec_spec[test, specBin])
         rs[k,specBin] = r


#Show evaluation result (mean over all folds and bins)
print('mean correlation', np.mean(rs))

#Make and save the synthesized audio
reconstructedWav = createAudio(rec_spec,audiosr=audiosr,winLength=winLength,frameshift=frameshift)
wavfile.write(os.path.join('/content/synth_audio/','sub-01_predicted.wav'),int(audiosr),reconstructedWav)


Epoch 1: val_loss improved from inf to 2.51148, saving model to weights1.hdf5

Epoch 2: val_loss improved from 2.51148 to 1.70454, saving model to weights1.hdf5

Epoch 3: val_loss did not improve from 1.70454

Epoch 4: val_loss improved from 1.70454 to 1.59680, saving model to weights1.hdf5

Epoch 5: val_loss improved from 1.59680 to 1.27844, saving model to weights1.hdf5

Epoch 6: val_loss improved from 1.27844 to 0.98948, saving model to weights1.hdf5

Epoch 7: val_loss did not improve from 0.98948

Epoch 8: val_loss improved from 0.98948 to 0.86860, saving model to weights1.hdf5

Epoch 9: val_loss did not improve from 0.86860

Epoch 10: val_loss improved from 0.86860 to 0.82983, saving model to weights1.hdf5

Epoch 11: val_loss improved from 0.82983 to 0.77544, saving model to weights1.hdf5

Epoch 12: val_loss did not improve from 0.77544

Epoch 13: val_loss improved from 0.77544 to 0.72470, saving model to weights1.hdf5

Epoch 14: val_loss improved from 0.72470 to 0.69793, saving 

### Optimizing the LSTM structure for one person

In [None]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wavfile


# One-subject experiment (sub-01): LSTM network evaluated with unshuffled
# k-fold cross-validation. The per-fold predictions are collected in rec_spec
# and turned into audio at the end.
data = np.load(r'/content/features/sub-01_feat.npy')
spectrogram = np.load(r'/content/features/sub-01_spec.npy')

#Initial parameters
nfolds = 10
kf = KFold(nfolds,shuffle=False)
pca = PCA()
numComps = 300
val_split = 0.2

#Audio Reconstruction parameters
winLength = 0.05
frameshift = 0.01
audiosr = 16000

#Initialize an empty spectrogram to save the reconstruction to
rec_spec = np.zeros(spectrogram.shape)
#Save the correlation coefficients for each fold (one row per fold, one column per spectral bin)
rs = np.zeros((nfolds,spectrogram.shape[1]))
for k,(train, test) in enumerate(kf.split(data)):         
    #Train, validation and test data (here the train/validation split is not shuffled)
    X_train_temp = data[train,:]
    y_train_temp = spectrogram[train,:]
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_split, random_state=0, shuffle=False)
    X_test = data[test,:]
    y_test = spectrogram[test,:] # not used below: the fold is scored against spectrogram[test] directly
    
    #Normalization (statistics come from the training part only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train[:] = scaler.transform(X_train)
    X_val[:] = scaler.transform(X_val)
    X_test[:] = scaler.transform(X_test)

    #Fit PCA to training data
    pca.fit(X_train)
    #Transform data: project onto the first numComps principal components
    X_train = np.dot(X_train, pca.components_[:numComps,:].T)
    X_val = np.dot(X_val, pca.components_[:numComps,:].T)
    X_test = np.dot(X_test, pca.components_[:numComps,:].T)

    # LSTM model; the checkpoint callback keeps the best weights in weights1.hdf5
    # NOTE(review): X_* are 2-D here while create_LSTM_model declares
    # input_shape=(inputsize,1) - presumably Keras adds the trailing axis; confirm
    early_stopping=EarlyStopping(patience=25, verbose=1, min_delta=1e-5)
    checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

    model = create_LSTM_model(numComps, spectrogram.shape[1])
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(X_train, y_train, batch_size=64, 
              epochs=100, verbose=0, validation_data=(X_val, y_val), shuffle=True,
              callbacks=[checkpointer, early_stopping])
            
    #predict with the checkpointed weights of the LSTM
    model.load_weights('weights1.hdf5')
    rec_spec[test, :] = model.predict(X_test, verbose=0)

    #Evaluate reconstruction of this fold: Pearson correlation per spectral bin
    for specBin in range(spectrogram.shape[1]):
         r, p = pearsonr(spectrogram[test, specBin], rec_spec[test, specBin])
         rs[k,specBin] = r


#Show evaluation result (mean over all folds and bins)
print('mean correlation', np.mean(rs))

#Make and save the synthesized audio
reconstructedWav = createAudio(rec_spec,audiosr=audiosr,winLength=winLength,frameshift=frameshift)
wavfile.write(os.path.join('/content/synth_audio/sub-01_predicted.wav'),int(audiosr),reconstructedWav)


Epoch 1: val_loss improved from inf to 2.36893, saving model to weights1.hdf5

Epoch 2: val_loss did not improve from 2.36893

Epoch 3: val_loss did not improve from 2.36893

Epoch 4: val_loss did not improve from 2.36893

Epoch 5: val_loss did not improve from 2.36893

Epoch 6: val_loss did not improve from 2.36893

Epoch 7: val_loss did not improve from 2.36893

Epoch 8: val_loss did not improve from 2.36893

Epoch 9: val_loss did not improve from 2.36893

Epoch 10: val_loss did not improve from 2.36893

Epoch 11: val_loss did not improve from 2.36893

Epoch 12: val_loss did not improve from 2.36893

Epoch 13: val_loss did not improve from 2.36893

Epoch 14: val_loss did not improve from 2.36893

Epoch 15: val_loss did not improve from 2.36893

Epoch 16: val_loss did not improve from 2.36893

Epoch 17: val_loss did not improve from 2.36893

Epoch 18: val_loss did not improve from 2.36893

Epoch 19: val_loss did not improve from 2.36893

Epoch 20: val_loss did not improve from 2.3689

### Optimizing the CNN structure for one person

In [None]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wavfile


# One-subject experiment (sub-01): 1-D CNN on windowed PCA features, evaluated
# with unshuffled k-fold cross-validation. The per-fold predictions are
# collected in rec_spec and turned into audio at the end.
data = np.load(r'/content/features/sub-01_feat.npy')
spectrogram = np.load(r'/content/features/sub-01_spec.npy')

#Initial parameters
nfolds = 10
kf = KFold(nfolds,shuffle=False)
pca = PCA()
numComps = 200
val_split = 0.2
# length of the sliding window that cnnize cuts the PCA features into
window = 20

#Audio Reconstruction parameters
winLength = 0.05
frameshift = 0.01
audiosr = 16000

#Initialize an empty spectrogram to save the reconstruction to
rec_spec = np.zeros(spectrogram.shape)
#Save the correlation coefficients for each fold (one row per fold, one column per spectral bin)
rs = np.zeros((nfolds,spectrogram.shape[1]))
for k,(train, test) in enumerate(kf.split(data)):         
    #Train, validation and test data (here the train/validation split is not shuffled)
    X_train_temp = data[train,:]
    y_train_temp = spectrogram[train,:]
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_split, random_state=0, shuffle=False)
    X_test = data[test,:]
    y_test = spectrogram[test,:] # not used below: the fold is scored against spectrogram[test] directly
    
    #Normalization (statistics come from the training part only)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train[:] = scaler.transform(X_train)
    X_val[:] = scaler.transform(X_val)
    X_test[:] = scaler.transform(X_test)

    #Fit PCA to training data
    pca.fit(X_train)
    #Transform data: project onto the first numComps principal components
    X_train = np.dot(X_train, pca.components_[:numComps,:].T)
    X_val = np.dot(X_val, pca.components_[:numComps,:].T)
    X_test = np.dot(X_test, pca.components_[:numComps,:].T)

    #reshaping the data: every sample becomes a 2-D array of sliding windows
    X_train = cnnize(X_train, window)
    X_val = cnnize(X_val, window)
    X_test = cnnize(X_test, window)

    #CNN model; the checkpoint callback keeps the best weights in weights1.hdf5
    early_stopping=EarlyStopping(patience=25, verbose=1, min_delta=1e-5)
    checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

    # the first argument is the number of windows per sample, matching the shape cnnize produces
    model = create_cnn_model(int((numComps-window)/2)+1, window, spectrogram.shape[1])
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    model.fit(X_train, y_train, batch_size=64, 
              epochs=100, verbose=0, validation_data=(X_val, y_val), shuffle=True,
              callbacks=[checkpointer, early_stopping])
            
    #predict with the checkpointed weights of the CNN
    model.load_weights('weights1.hdf5')
    rec_spec[test, :] = model.predict(X_test, verbose=0)

    #Evaluate reconstruction of this fold: Pearson correlation per spectral bin
    for specBin in range(spectrogram.shape[1]):
         r, p = pearsonr(spectrogram[test, specBin], rec_spec[test, specBin])
         rs[k,specBin] = r


#Show evaluation result (mean over all folds and bins)
print('mean correlation', np.mean(rs))

#Make and save the synthesized audio
reconstructedWav = createAudio(rec_spec,audiosr=audiosr,winLength=winLength,frameshift=frameshift)
wavfile.write(os.path.join('/content/synth_audio/sub-01_predicted.wav'),int(audiosr),reconstructedWav)


Epoch 1: val_loss improved from inf to 3.65074, saving model to weights1.hdf5

Epoch 2: val_loss improved from 3.65074 to 3.44781, saving model to weights1.hdf5

Epoch 3: val_loss improved from 3.44781 to 3.12265, saving model to weights1.hdf5

Epoch 4: val_loss did not improve from 3.12265

Epoch 5: val_loss improved from 3.12265 to 3.09337, saving model to weights1.hdf5

Epoch 6: val_loss improved from 3.09337 to 2.97692, saving model to weights1.hdf5

Epoch 7: val_loss improved from 2.97692 to 2.92251, saving model to weights1.hdf5

Epoch 8: val_loss did not improve from 2.92251

Epoch 9: val_loss did not improve from 2.92251

Epoch 10: val_loss did not improve from 2.92251

Epoch 11: val_loss improved from 2.92251 to 2.87664, saving model to weights1.hdf5

Epoch 12: val_loss did not improve from 2.87664

Epoch 13: val_loss did not improve from 2.87664

Epoch 14: val_loss did not improve from 2.87664

Epoch 15: val_loss did not improve from 2.87664

Epoch 16: val_loss did not impro

KeyboardInterrupt: ignored

# 2.) Trying out the best configuration for every subject

In [None]:
##### MODELING ########

# So far, the best performance came from the normal DNN model

# Switch for this cell. It has its own name so that it does not collide with
# the Keras model that is stored in `model` inside the loop.
run_modeling = True

# mean correlation per subject, in the order of pts
scores= []

if run_modeling:
    feat_path = r'/content/features'
    result_path = r'/content/synth_audio'
    # make sure the output folder exists before anything is written to it
    os.makedirs(result_path, exist_ok=True)
    pts = ['sub-%02d'%i for i in range(1,11)]

    winLength = 0.05
    frameshift = 0.01
    audiosr = 16000

    nfolds = 15
    kf = KFold(nfolds,shuffle=False)
    pca = PCA()
    numComps = 400
    # defined here so that the cell does not rely on a value left over from another cell
    val_split = 0.2

    #Correlation results of all subjects (subject x fold x spectral bin);
    #allocated once the number of spectral bins is known from the first subject
    allRes = None

    for pNr, pt in enumerate(pts):

        #Load the data
        #Dimensions of these data vary depending on the subject
        spectrogram = np.load(os.path.join(feat_path,f'{pt}_spec.npy'))
        data = np.load(os.path.join(feat_path,f'{pt}_feat.npy'))

        if allRes is None:
            allRes = np.zeros((len(pts),nfolds,spectrogram.shape[1]))

        #Initialize an empty spectrogram to save the reconstruction to
        rec_spec = np.zeros(spectrogram.shape)
        #Save the correlation coefficients for each fold
        rs = np.zeros((nfolds,spectrogram.shape[1]))
        for k,(train, test) in enumerate(kf.split(data)):

            #Train, validation and test data
            X_train_temp = data[train,:]
            y_train_temp = spectrogram[train,:]
            X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_split, random_state=0)
            X_test = data[test,:]
            y_test = spectrogram[test,:] # not used below: the fold is scored against spectrogram[test] directly

            #Normalization (statistics come from the training part only)
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train[:] = scaler.transform(X_train)
            X_val[:] = scaler.transform(X_val)
            X_test[:] = scaler.transform(X_test)

            #Fit PCA to training data
            pca.fit(X_train)
            #Transform data: project onto the first numComps principal components
            X_train = np.dot(X_train, pca.components_[:numComps,:].T)
            X_val = np.dot(X_val, pca.components_[:numComps,:].T)
            X_test = np.dot(X_test, pca.components_[:numComps,:].T)

            # normal DNN model; the checkpoint callback keeps the best weights in weights1.hdf5
            early_stopping=EarlyStopping(patience=25, verbose=0, min_delta=1e-5)
            checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=0)

            model = create_dnn_model(numComps, spectrogram.shape[1])
            model.compile(loss='mse', optimizer='adam', metrics=['mse'])
            model.fit(X_train, y_train, batch_size=64,
                      epochs=100, verbose=0, validation_data=(X_val, y_val), shuffle=True,
                      callbacks=[checkpointer, early_stopping])

            #predict with the checkpointed weights of the DNN
            model.load_weights('weights1.hdf5')
            rec_spec[test, :] = model.predict(X_test, verbose=0)

            #Report broken predictions once per fold (not once per spectral bin)
            if np.any(np.isnan(rec_spec)):
                print('%s has %d broken samples in reconstruction' % (pt, np.sum(np.isnan(rec_spec))))

            #Evaluate reconstruction of this fold: Pearson correlation per spectral bin
            for specBin in range(spectrogram.shape[1]):
                r, p = pearsonr(spectrogram[test, specBin], rec_spec[test, specBin])
                rs[k,specBin] = r

        #Show evaluation result
        print('%s has mean correlation of %f' % (pt, np.mean(rs)))
        allRes[pNr,:,:]=rs
        scores.append(np.mean(rs))


        #Make and save the synthesized audio, one file per subject
        reconstructedWav = createAudio(rec_spec,audiosr=audiosr,winLength=winLength,frameshift=frameshift)
        wavfile.write(os.path.join(result_path,f'{pt}_predicted.wav'),int(audiosr),reconstructedWav)
        

In [None]:
scores

[0.5629875535958715,
 0.615317710223286,
 0.8574981748428402,
 0.816490556136459,
 0.5712122543230624,
 0.8819938942481294,
 0.7771201475020989,
 0.6878816278290169,
 0.7386449014724298,
 0.7146618900721041]