<a href="https://colab.research.google.com/github/v-artur/Golden_Oreos/blob/main/speaker_indep_data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h2>Obtaining the data</h2>

In [1]:
#the data
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp" -O features.zip && rm -rf /tmp/cookies.txt
#reconstruction module
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1_eeG0d_r-RqazUkr-ZRPNC6L13sHYwIP' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1_eeG0d_r-RqazUkr-ZRPNC6L13sHYwIP" -O reconstructWave.py && rm -rf /tmp/cookies.txt
#Melfiltebank applier
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Bjf3ncRe8CcWHl3i0HxRo4unRYkz2fog' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Bjf3ncRe8CcWHl3i0HxRo4unRYkz2fog" -O MelFilterBank.py && rm -rf /tmp/cookies.txt


#extracting it
import zipfile
zip_ref = zipfile.ZipFile("/content/features.zip", 'r')
zip_ref.extractall("/content/features")
zip_ref.close()

--2022-12-08 11:23:42--  https://docs.google.com/uc?export=download&confirm=t&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
Resolving docs.google.com (docs.google.com)... 108.177.97.139, 108.177.97.138, 108.177.97.101, ...
Connecting to docs.google.com (docs.google.com)|108.177.97.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/32vnvbrhcbgc1ce0mqmp7qngpv1vffl1/1670498550000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=0565305a-3b47-4bc1-b251-0a252362d176 [following]
--2022-12-08 11:23:43--  https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/32vnvbrhcbgc1ce0mqmp7qngpv1vffl1/1670498550000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=0565305a-3b47-4bc1-b251-0a252362d176
Resolving doc-08-9o-docs.googleusercontent.com (doc-08-9o-docs.googleusercontent.com)... 74.125.204.132, 

<h2>Preparations and needed functions</h2>

In [1]:
import numpy as np
import os

#setting the path
feat_path = r'/content/features'

# Counting how many different features there are
all_electrodes = set()
for subject in ['01','02','03','04','05','06','07','08','09','10']:
  list1 = set(np.load(os.path.join(feat_path,f'sub-{subject}_feat_names.npy')).tolist())
  all_electrodes = all_electrodes.union(list1)  

print('Number of different features:', len(all_electrodes))

#we will use this list's indexes to correspond to the feature matrices
all_electrodes = list(all_electrodes) 

Number of different features: 4860


In [2]:
# Function for making every feature vector into a 4860 dimensional vector

def dim_adjust(data, feature_names):
  #creating an empty array with appropriate length
  feat_matrix = np.empty((0,len(all_electrodes)))

  #create a new matrix with zeros, insert the values into the columns
  #which correspond to the subject's feature names, then concatenate them
  new_matrix = np.zeros((data.shape[0],len(all_electrodes)))
  for column in range(data.shape[1]):
    insert_index = all_electrodes.index(feature_names[column])
    new_matrix[:,insert_index] = data[:,column]
  feat_matrix = np.concatenate((feat_matrix, new_matrix), axis=0)
    
  return feat_matrix


<h2>Making the iterated test, validation and test sets</h2>

In [3]:
def generate_features_and_spec():
  #Base arrays
  train_feat = np.empty((0, len(all_electrodes)))
  val_feat = np.empty((0, len(all_electrodes)))
  test_feat = np.empty((0, len(all_electrodes)))

  train_spec = np.empty((0, 23))
  val_spec = np.empty((0, 23))
  test_spec = np.empty((0, 23))

  for index, subject in enumerate(['01','02','03','04','05','06','07','08','09','10']):
    #loading the features, feature names and mel spectrogram of the subject
    data = np.load(os.path.join(feat_path,f'sub-{subject}_feat.npy'))
    feature_names = np.load(os.path.join(feat_path,f'sub-{subject}_feat_names.npy'))
    spectrogram = np.load(os.path.join(feat_path,f'sub-{subject}_spec.npy'))

    #splittig the features and the labels into 5 parts
    feat_splits = np.array_split(data, 5)
    spec_splits = np.array_split(spectrogram, 5)

    #making the train, val or test arrays for the subject using a 60-20-20 ratio
    #because of the "index" changing, the splitting position will iterate with each subject as well
    subject_train_feat = np.vstack((feat_splits[index % 5],feat_splits[(index+1) % 5],feat_splits[(index+2) % 5]))
    subject_train_spec = np.vstack((spec_splits[index % 5],spec_splits[(index+1) % 5],spec_splits[(index+2) % 5]))

    subject_val_feat = feat_splits[(index+3) % 5]
    subject_val_spec = spec_splits[(index+3) % 5]

    subject_test_feat = feat_splits[(index+4) % 5]
    subject_test_spec = spec_splits[(index+4) % 5]

    #concatenating the dimensionality-adjusted features with the pre-existing feature set
    train_feat = np.concatenate((train_feat, dim_adjust(subject_train_feat, feature_names)))
    val_feat = np.concatenate((val_feat, dim_adjust(subject_val_feat, feature_names)))
    test_feat = np.concatenate((test_feat, dim_adjust(subject_test_feat, feature_names)))

    #concatenating the appropriate arrays with the pre-existing train, val or test labels
    train_spec = np.concatenate((train_spec, subject_train_spec))
    val_spec = np.concatenate((val_spec, subject_val_spec))
    test_spec = np.concatenate((test_spec, subject_test_spec))

  return train_feat, train_spec, val_feat, val_spec, test_feat, test_spec 

# Generating the data
train_feat, train_spec, val_feat, val_spec, test_feat, test_spec = generate_features_and_spec()

In [4]:
from sklearn.preprocessing import StandardScaler
# Scaling the data 

scaler = StandardScaler()
scaler.fit(train_feat)
train_feat = scaler.transform(train_feat)
val_feat = scaler.transform(val_feat)
test_feat = scaler.transform(test_feat)

<h2>Tuning the AutoEncoder for dimensionality reduction</h2>

In [5]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization

def create_ae_model(inputsize):
    model = tf.keras.Sequential()
    model.add(Input(shape=(inputsize)))
    model.add(Dropout(0.5))
    model.add(Dense(2000, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(500, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(2000, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(inputsize))
    return model

In [6]:
# Defining the data generator
from tensorflow.keras.utils import Sequence, set_random_seed

set_random_seed(1234)

class DataGenerator(Sequence):
    # Initialization
    def __init__(self, data, batch_size=32, dim=len(all_electrodes), shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Shows the number of batches per epoch
        return int(np.floor(self.data.shape[0] / self.batch_size))

    def __getitem__(self, index):
        # Generate one batch of data
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        batch = np.empty((self.batch_size, self.dim))
        for index1, elem in enumerate(indexes):
          batch[index1] = self.data[index1]

        return batch, batch

    def on_epoch_end(self):
        # Updating the index after each epoch
        self.indexes = np.arange(self.data.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indexes)


# Creating the generators
train_gen = DataGenerator(train_feat, 256)
val_gen = DataGenerator(val_feat, 256)
test_gen = DataGenerator(test_feat, 256)

In [7]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Creating and training the model
model = create_ae_model(train_feat.shape[1])
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

early_stopping=EarlyStopping(patience=20, verbose=1, min_delta=1e-5)
checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

model.fit(train_gen, epochs=500, verbose=1, validation_data=val_gen, callbacks=[checkpointer, early_stopping])

Epoch 1/500
Epoch 1: val_loss improved from inf to 0.37256, saving model to weights1.hdf5
Epoch 2/500
Epoch 2: val_loss did not improve from 0.37256
Epoch 3/500
Epoch 3: val_loss did not improve from 0.37256
Epoch 4/500
Epoch 4: val_loss did not improve from 0.37256
Epoch 5/500
Epoch 5: val_loss did not improve from 0.37256
Epoch 6/500
Epoch 6: val_loss did not improve from 0.37256
Epoch 7/500
Epoch 7: val_loss did not improve from 0.37256
Epoch 8/500
Epoch 8: val_loss did not improve from 0.37256
Epoch 9/500
Epoch 9: val_loss did not improve from 0.37256
Epoch 10/500
Epoch 10: val_loss did not improve from 0.37256
Epoch 11/500
Epoch 11: val_loss did not improve from 0.37256
Epoch 12/500
Epoch 12: val_loss did not improve from 0.37256
Epoch 13/500
Epoch 13: val_loss did not improve from 0.37256
Epoch 14/500
Epoch 14: val_loss did not improve from 0.37256
Epoch 15/500
Epoch 15: val_loss did not improve from 0.37256
Epoch 16/500
Epoch 16: val_loss did not improve from 0.37256
Epoch 17/50

<keras.callbacks.History at 0x7f500a12ea60>

In [8]:
# Evaluation 
model.load_weights('weights1.hdf5')
model.evaluate(test_gen)



[0.3897421658039093, 0.3897421658039093]

<h2>Generating and exporting the lower dimensional data</h2>

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 4860)              0         
                                                                 
 dense (Dense)               (None, 2000)              9722000   
                                                                 
 dropout_1 (Dropout)         (None, 2000)              0         
                                                                 
 dense_1 (Dense)             (None, 1000)              2001000   
                                                                 
 dropout_2 (Dropout)         (None, 1000)              0         
                                                                 
 dense_2 (Dense)             (None, 500)               500500    
                                                                 
 batch_normalization (BatchN  (None, 500)              2

In [10]:
from keras.models import Model
model2= Model(inputs=model.input, outputs=model.layers[-7].output)
model2.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4860)]            0         
                                                                 
 dropout (Dropout)           (None, 4860)              0         
                                                                 
 dense (Dense)               (None, 2000)              9722000   
                                                                 
 dropout_1 (Dropout)         (None, 2000)              0         
                                                                 
 dense_1 (Dense)             (None, 1000)              2001000   
                                                                 
 dropout_2 (Dropout)         (None, 1000)              0         
                                                                 
 dense_2 (Dense)             (None, 500)               500500

In [11]:
# Generating the new data using the output of the bottleneck layer
train_new = model2.predict(train_gen)
val_new = model2.predict(val_gen)
test_new = model2.predict(test_gen)



In [13]:
# Exporting the feature-label set pairs as pickles onto Google Drive
import pickle

with open('train.pkl', 'wb') as f:  
    pickle.dump([train_new, train_spec], f)

with open('val.pkl', 'wb') as f:  
    pickle.dump([val_new, val_spec], f)

with open('test.pkl', 'wb') as f:  
    pickle.dump([test_new, test_spec], f)

In [15]:
from google.colab import drive
drive.mount('/content/drive')

!cp train.pkl drive/MyDrive/DeepLearning/train.pkl
!cp val.pkl drive/MyDrive/DeepLearning/val.pkl
!cp test.pkl drive/MyDrive/DeepLearning/test.pkl

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
