<a href="https://colab.research.google.com/github/v-artur/Golden_Oreos/blob/main/speaker_indep_dim_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Obtaining the data

In [1]:
#features
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp" -O features.zip && rm -rf /tmp/cookies.txt
#original electrode names
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80" -O subject_channels.zip && rm -rf /tmp/cookies.txt

# Data extraction
import zipfile
zip_ref = zipfile.ZipFile("/content/features.zip", 'r')
zip_ref.extractall("/content/features")
zip_ref.close()

# Electrode name extraction
zip_ref = zipfile.ZipFile("/content/subject_channels.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()


--2022-12-09 23:19:16--  https://docs.google.com/uc?export=download&confirm=t&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
Resolving docs.google.com (docs.google.com)... 173.194.79.138, 173.194.79.139, 173.194.79.113, ...
Connecting to docs.google.com (docs.google.com)|173.194.79.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/d1dp7p4k7u5vc6o49g18ces9nrl0bppm/1670627925000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=fdc33b18-48bf-4c73-9a2b-5926e754534b [following]
--2022-12-09 23:19:17--  https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/d1dp7p4k7u5vc6o49g18ces9nrl0bppm/1670627925000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=fdc33b18-48bf-4c73-9a2b-5926e754534b
Resolving doc-08-9o-docs.googleusercontent.com (doc-08-9o-docs.googleusercontent.com)... 142.251.18.132, 

<h3>Preparations and needed functions</h3>



Our plan is to transform every feature vector into a larger dimensional feature vector,  which contains all the different electrode names across all the subjects.

In [2]:
import numpy as np
import os

# Getting the different electrode names
all_electrodes = set()

feat_path = "/content/features"

for subject in ['01','02','03','04','05','06','07','08','09','10']:
  elecs = np.load(os.path.join(feat_path,f'sub-{subject}_feat_names.npy'))
  elecs = set(elecs)
  all_electrodes = all_electrodes.union(elecs)

# We will use this list's indexes to correspond to the feature matrices gained from the subjects
all_electrodes = list(all_electrodes) 

print('Number of different electrodes:', len(all_electrodes))

Number of different electrodes: 4860


In [3]:
# Function for making every feature vector into a 4860 dimensional vector

def dim_adjust(data, feature_names):
  #create a new matrix with zeros, insert the values into the columns
  #which correspond to the subject's feature names, then concatenate them

  new_matrix = np.zeros((data.shape[0],len(all_electrodes)))
  for column in range(data.shape[1]):
    insert_index = all_electrodes.index(feature_names[column])
    new_matrix[:,insert_index] = data[:,column]
    
  return new_matrix


## Making the iterated test, validation and test sets

In [4]:
# Function to generate the train, val and test features and mel spectrograms

def generate_features_and_spec():
  #Base arrays
  train_feat = np.empty((0, len(all_electrodes)))
  val_feat = np.empty((0, len(all_electrodes)))
  test_feat = np.empty((0, len(all_electrodes)))

  train_spec = np.empty((0, 23))
  val_spec = np.empty((0, 23))
  test_spec = np.empty((0, 23))

  for index, subject in enumerate(['01','02','03','04','05','06','07','08','09','10']):
    #loading the features, feature names and mel spectrogram of the subject
    data = np.load(os.path.join(feat_path,f'sub-{subject}_feat.npy'))
    feature_names = np.load(os.path.join(feat_path,f'sub-{subject}_feat_names.npy'))
    spectrogram = np.load(os.path.join(feat_path,f'sub-{subject}_spec.npy'))

    #splittig the features and the labels into 5 parts
    feat_splits = np.array_split(data, 5)
    spec_splits = np.array_split(spectrogram, 5)

    #making the train, val or test arrays for the subject using a 60-20-20 ratio
    #because of the "index" changing, the splitting position will iterate with each subject as well
    subject_train_feat = np.vstack((feat_splits[index % 5],feat_splits[(index+1) % 5],feat_splits[(index+2) % 5]))
    subject_train_spec = np.vstack((spec_splits[index % 5],spec_splits[(index+1) % 5],spec_splits[(index+2) % 5]))

    subject_val_feat = feat_splits[(index+3) % 5]
    subject_val_spec = spec_splits[(index+3) % 5]

    subject_test_feat = feat_splits[(index+4) % 5]
    subject_test_spec = spec_splits[(index+4) % 5]

    #concatenating the dimensionality-adjusted features with the pre-existing feature set
    train_feat = np.concatenate((train_feat, dim_adjust(subject_train_feat, feature_names)))
    val_feat = np.concatenate((val_feat, dim_adjust(subject_val_feat, feature_names)))
    test_feat = np.concatenate((test_feat, dim_adjust(subject_test_feat, feature_names)))

    #concatenating the appropriate arrays with the pre-existing train, val or test labels
    train_spec = np.concatenate((train_spec, subject_train_spec))
    val_spec = np.concatenate((val_spec, subject_val_spec))
    test_spec = np.concatenate((test_spec, subject_test_spec))

  return train_feat, train_spec, val_feat, val_spec, test_feat, test_spec 

# Generating the data
train_feat, train_spec, val_feat, val_spec, test_feat, test_spec = generate_features_and_spec()

In [5]:
from sklearn.preprocessing import StandardScaler
# Scaling the data 

scaler = StandardScaler()
scaler.fit(train_feat)
train_feat = scaler.transform(train_feat)
val_feat = scaler.transform(val_feat)
test_feat = scaler.transform(test_feat)

## Dimensionality reduction

<h3>Option 1: Tuned AutoEncoder</h3>

In [6]:
%%capture
!pip install keras-tuner

In [7]:
# Defining the data generator
from tensorflow.keras.utils import Sequence, set_random_seed

set_random_seed(1234)

class DataGenerator(Sequence):
    # Initialization
    def __init__(self, data, batch_size=32, dim=len(all_electrodes), shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Shows the number of batches per epoch
        return int(np.floor(self.data.shape[0] / self.batch_size))

    def __getitem__(self, index):
        # Generate one batch of data
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        batch = np.empty((self.batch_size, self.dim))
        for index1, elem in enumerate(indexes):
          batch[index1] = self.data[index1]

        return batch, batch

    def on_epoch_end(self):
        # Updating the index after each epoch
        self.indexes = np.arange(self.data.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indexes)


# Creating the generators for training
# According to our early experiments, batch sizes of 256 or 512 proved to be the best,
# but to shorten the amount time for hyperparameter tuning, we set it for 512
train_gen = DataGenerator(train_feat, 512)
val_gen = DataGenerator(val_feat, 512)
test_gen = DataGenerator(test_feat, 512)

In [8]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Dropout
import keras_tuner as kt
from keras.models import Model


# Creating the hyperparameter tuner function
# The AE has 5 Dense layers besides the output, and the 3rd one is the bottleneck layer
def create_ae_optimal(hp):
  input = Input(shape=(4860))
  
  # Layer 1
  hp_units_1 = hp.Int('units_1', min_value=1500, max_value=2500, step=250)
  encoded = Dense(units=hp_units_1, activation="relu", kernel_initializer='HeNormal')(input)
  encoded = Dropout(rate=hp.Float('dropout_1', min_value = 0.0, max_value = 0.5, default = 0.2, step = 0.1))(encoded)

  # Layer 2
  hp_units_2 = hp.Int('units_2', min_value=750, max_value=1250, step=125)
  encoded = Dense(units=hp_units_2, activation="relu", kernel_initializer='HeNormal')(encoded)
  encoded = Dropout(rate=hp.Float('dropout_2', min_value = 0.0, max_value = 0.5, default = 0.2, step = 0.1))(encoded)

  # Layer 3
  hp_units_3 = hp.Int('units_3', min_value=350, max_value=650, step=50)
  encoded = Dense(units=hp_units_3, activation="relu", kernel_initializer='HeNormal')(encoded)
  encoded = Dropout(rate=hp.Float('dropout_3', min_value = 0.0, max_value = 0.5, default = 0.2, step = 0.1))(encoded)

  # Layer 4
  hp_units_4 = hp.Int('units_4', min_value=750, max_value=1250, step=125)
  decoded = Dense(units=hp_units_4, activation="relu", kernel_initializer='HeNormal')(encoded)
  decoded = Dropout(rate=hp.Float('dropout_4', min_value = 0.0, max_value = 0.5, default = 0.2, step = 0.1))(decoded)

  # Layer 5
  hp_units_5 = hp.Int('units_5', min_value=1500, max_value=2500, step=250)
  decoded = Dense(units=hp_units_5, activation="relu", kernel_initializer='HeNormal')(decoded)
  decoded = Dropout(rate=hp.Float('dropout_5', min_value = 0.0, max_value = 0.5, default = 0.2, step = 0.1))(decoded)

  # Output layer
  output = Dense(4860, activation='linear')(decoded)

  # Optimizer parameters
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model = Model(input, output)
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='mse',
                metrics=['mse'])
  
  return model

In [9]:
# Setting up the tuner
tuner = kt.Hyperband(create_ae_optimal,
                     objective='val_loss',
                     max_epochs=10,
                     factor=3,
                     directory='/content/ae_opt',
                     project_name='ae_opt1')

In [10]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#note: takes about 20 minutes to optimize
stop_early = EarlyStopping(monitor='val_loss', patience=10)
tuner.search(train_gen, epochs=100, validation_data=val_gen, verbose=0, shuffle=True, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]



In [11]:
# Fitting and retraining the model with the best params
hypermodel = tuner.hypermodel.build(best_hps)

early_stopping=EarlyStopping(patience=10, verbose=1, min_delta=1e-5)
checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

hypermodel.fit(train_gen, epochs=100, verbose=1, validation_data=val_gen, callbacks=[checkpointer, early_stopping])

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.46640, saving model to weights1.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.46640 to 0.30801, saving model to weights1.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.30801 to 0.28483, saving model to weights1.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.28483 to 0.27579, saving model to weights1.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.27579 to 0.26971, saving model to weights1.hdf5
Epoch 6/100
Epoch 6: val_loss did not improve from 0.26971
Epoch 7/100
Epoch 7: val_loss did not improve from 0.26971
Epoch 8/100
Epoch 8: val_loss did not improve from 0.26971
Epoch 9/100
Epoch 9: val_loss did not improve from 0.26971
Epoch 10/100
Epoch 10: val_loss did not improve from 0.26971
Epoch 11/100
Epoch 11: val_loss did not improve from 0.26971
Epoch 12/100
Epoch 12: val_loss did not improve from 0.26971
Epoch 13/100
Epoch 13: val_loss did not improve from 0.26971
Epoch 14/100
Epoch 14: val_loss did not improve from 0

<keras.callbacks.History at 0x7fb490079040>

In [12]:
# Checking the performance on the test set
hypermodel.load_weights('weights1.hdf5')
hypermodel.evaluate(test_gen)



[0.276035338640213, 0.276035338640213]

In [13]:
# Loading back the best weights and checking the layers
hypermodel.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4860)]            0         
                                                                 
 dense_6 (Dense)             (None, 2000)              9722000   
                                                                 
 dropout_5 (Dropout)         (None, 2000)              0         
                                                                 
 dense_7 (Dense)             (None, 1000)              2001000   
                                                                 
 dropout_6 (Dropout)         (None, 1000)              0         
                                                                 
 dense_8 (Dense)             (None, 550)               550550    
                                                                 
 dropout_7 (Dropout)         (None, 550)               0   

In [14]:
# Keeping only the encoder
model2 = Model(inputs=hypermodel.input, outputs=hypermodel.layers[-7].output)
model2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4860)]            0         
                                                                 
 dense_6 (Dense)             (None, 2000)              9722000   
                                                                 
 dropout_5 (Dropout)         (None, 2000)              0         
                                                                 
 dense_7 (Dense)             (None, 1000)              2001000   
                                                                 
 dropout_6 (Dropout)         (None, 1000)              0         
                                                                 
 dense_8 (Dense)             (None, 550)               550550    
                                                                 
Total params: 12,273,550
Trainable params: 12,273,550
Non-t

In [15]:
# sizes of the new arrays
print('Train:', train_feat.shape) 
print('Validation:', val_feat.shape)
print('Test:', test_feat.shape)

Train: (179011, 4860)
Validation: (59672, 4860)
Test: (59672, 4860)


We need to redefine the generators for the predictions in order to retain every feature vector in every set, otherwise some of them would be left out because of the batch size.

Since the memory can't fit all the data, and using 1 as batch size would be slow, we need to use prime factorization to determine the appropriate batch sizes.

The factorizations are:
- 179011 = 7 x 107 x 239
- 59672 = 2 * 2 * 2 * 7459  

In [16]:
train_gen = DataGenerator(train_feat, 107*239)
val_gen = DataGenerator(val_feat, 7459)
test_gen = DataGenerator(test_feat, 7459)

# Generating the new data using the output of the bottleneck layer
train_new = model2.predict(train_gen)
val_new = model2.predict(val_gen)
test_new = model2.predict(test_gen)



In [17]:
# Checking if the new sets really has the same amount of vectors
print(train_new.shape[0] == train_feat.shape[0])
print(val_new.shape[0] == val_feat.shape[0])
print(test_new.shape[0] == test_feat.shape[0])

True
True
True


In [18]:
# Exporting the feature-label set pairs as pickle files onto Google Drive
import pickle

with open('train.pkl', 'wb') as f:  
    pickle.dump([train_new, train_spec], f)

with open('val.pkl', 'wb') as f:  
    pickle.dump([val_new, val_spec], f)

with open('test.pkl', 'wb') as f:  
    pickle.dump([test_new, test_spec], f)

In [19]:
from google.colab import drive
drive.mount('/content/drive')

!cp train.pkl drive/MyDrive/DeepLearning/train.pkl
!cp val.pkl drive/MyDrive/DeepLearning/val.pkl
!cp test.pkl drive/MyDrive/DeepLearning/test.pkl

Mounted at /content/drive


<h3>Option 2: Incremental PCA</h3>

In [20]:
from sklearn.decomposition import IncrementalPCA

n_comp = 250

# note: takes about 10 minutes to run
pca = IncrementalPCA(n_components=n_comp, batch_size=1024)
pca.fit(train_feat)
train_feat = pca.transform(train_feat)
val_feat = pca.transform(val_feat)
test_feat = pca.transform(test_feat)

In [21]:
# Exporting the new data

with open('train_v2.pkl', 'wb') as f:  
    pickle.dump([train_feat, train_spec], f)

with open('val_v2.pkl', 'wb') as f:  
    pickle.dump([val_feat, val_spec], f)

with open('test_v2.pkl', 'wb') as f:  
    pickle.dump([test_feat, test_spec], f)

!cp train_v2.pkl drive/MyDrive/DeepLearning/train_v2.pkl
!cp val_v2.pkl drive/MyDrive/DeepLearning/val_v2.pkl
!cp test_v2.pkl drive/MyDrive/DeepLearning/test_v2.pkl