<a href="https://colab.research.google.com/github/v-artur/Golden_Oreos/blob/main/speaker_indep_data_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Obtaining the data

In [1]:
#features
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp" -O features.zip && rm -rf /tmp/cookies.txt
#original electrode names
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80" -O subject_channels.zip && rm -rf /tmp/cookies.txt

# Data extraction
import zipfile
zip_ref = zipfile.ZipFile("/content/features.zip", 'r')
zip_ref.extractall("/content/features")
zip_ref.close()

# Electrode name extraction
zip_ref = zipfile.ZipFile("/content/subject_channels.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()


--2022-12-09 07:21:10--  https://docs.google.com/uc?export=download&confirm=t&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
Resolving docs.google.com (docs.google.com)... 108.177.125.102, 108.177.125.139, 108.177.125.113, ...
Connecting to docs.google.com (docs.google.com)|108.177.125.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/usphsnju7cn0bgk6897l018i9lc74i2u/1670570400000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=2190d4b1-aec8-4159-a55a-5ddb1b93a41a [following]
--2022-12-09 07:21:11--  https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/usphsnju7cn0bgk6897l018i9lc74i2u/1670570400000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=2190d4b1-aec8-4159-a55a-5ddb1b93a41a
Resolving doc-08-9o-docs.googleusercontent.com (doc-08-9o-docs.googleusercontent.com)... 64.233.188.1

<h3>Preparations and needed functions</h3>

In [2]:
import pandas as pd
import numpy as np

# Our goal is to put the feature names in order to preserve the sequentiality within the feature vectors

# Getting the original electrode names
original_electrodes = set()

for subject in ['01','02','03','04','05','06','07','08','09','10']:
  table = pd.read_csv(f'/content/subject_channels/sub-{subject}_task-wordProduction_channels.tsv', sep='\t')
  elecs = set(table['name'])
  original_electrodes = original_electrodes.union(elecs)

# Now indexing them from -4 to 4 (9 in total)
all_electrodes = []
for i in range(9):
  for elec in original_electrodes:
    all_electrodes.append(elec + "T" + str(i-4))

print('Number of different features:', len(all_electrodes))

#we will use this list's indexes to correspond to the feature matrices
all_electrodes = list(all_electrodes) 

Number of different features: 4860


In [3]:
# Example
print(all_electrodes[:10])

['LH1T-4', 'LU10T-4', 'LY7T-4', 'RK8T-4', 'LK4T-4', 'RA1T-4', 'RC11T-4', 'LG13T-4', 'RM3T-4', 'LD7T-4']


In [4]:
# Function for making every feature vector into a 4860 dimensional vector

def dim_adjust(data, feature_names):
  #create a new matrix with zeros, insert the values into the columns
  #which correspond to the subject's feature names, then concatenate them

  new_matrix = np.zeros((data.shape[0],len(all_electrodes)))
  for column in range(data.shape[1]):
    insert_index = all_electrodes.index(feature_names[column])
    new_matrix[:,insert_index] = data[:,column]
    
  return new_matrix


## Making the iterated test, validation and test sets

In [5]:
import os

feat_path = "/content/features"

# Function to generate the train, val and test features and mel spectrograms
def generate_features_and_spec():
  #Base arrays
  train_feat = np.empty((0, len(all_electrodes)))
  val_feat = np.empty((0, len(all_electrodes)))
  test_feat = np.empty((0, len(all_electrodes)))

  train_spec = np.empty((0, 23))
  val_spec = np.empty((0, 23))
  test_spec = np.empty((0, 23))

  for index, subject in enumerate(['01','02','03','04','05','06','07','08','09','10']):
    #loading the features, feature names and mel spectrogram of the subject
    data = np.load(os.path.join(feat_path,f'sub-{subject}_feat.npy'))
    feature_names = np.load(os.path.join(feat_path,f'sub-{subject}_feat_names.npy'))
    spectrogram = np.load(os.path.join(feat_path,f'sub-{subject}_spec.npy'))

    #splittig the features and the labels into 5 parts
    feat_splits = np.array_split(data, 5)
    spec_splits = np.array_split(spectrogram, 5)

    #making the train, val or test arrays for the subject using a 60-20-20 ratio
    #because of the "index" changing, the splitting position will iterate with each subject as well
    subject_train_feat = np.vstack((feat_splits[index % 5],feat_splits[(index+1) % 5],feat_splits[(index+2) % 5]))
    subject_train_spec = np.vstack((spec_splits[index % 5],spec_splits[(index+1) % 5],spec_splits[(index+2) % 5]))

    subject_val_feat = feat_splits[(index+3) % 5]
    subject_val_spec = spec_splits[(index+3) % 5]

    subject_test_feat = feat_splits[(index+4) % 5]
    subject_test_spec = spec_splits[(index+4) % 5]

    #concatenating the dimensionality-adjusted features with the pre-existing feature set
    train_feat = np.concatenate((train_feat, dim_adjust(subject_train_feat, feature_names)))
    val_feat = np.concatenate((val_feat, dim_adjust(subject_val_feat, feature_names)))
    test_feat = np.concatenate((test_feat, dim_adjust(subject_test_feat, feature_names)))

    #concatenating the appropriate arrays with the pre-existing train, val or test labels
    train_spec = np.concatenate((train_spec, subject_train_spec))
    val_spec = np.concatenate((val_spec, subject_val_spec))
    test_spec = np.concatenate((test_spec, subject_test_spec))

  return train_feat, train_spec, val_feat, val_spec, test_feat, test_spec 

# Generating the data
train_feat, train_spec, val_feat, val_spec, test_feat, test_spec = generate_features_and_spec()

In [6]:
from sklearn.preprocessing import StandardScaler
# Scaling the data 

scaler = StandardScaler()
scaler.fit(train_feat)
train_feat = scaler.transform(train_feat)
val_feat = scaler.transform(val_feat)
test_feat = scaler.transform(test_feat)

## Dimensionality reduction

<h3>Option 1: Tuned AutoEncoder</h3>

In [7]:
%%capture
!pip install keras-tuner

In [8]:
# Defining the data generator
from tensorflow.keras.utils import Sequence, set_random_seed

set_random_seed(1234)

class DataGenerator(Sequence):
    # Initialization
    def __init__(self, data, batch_size=32, dim=len(all_electrodes), shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Shows the number of batches per epoch
        return int(np.floor(self.data.shape[0] / self.batch_size))

    def __getitem__(self, index):
        # Generate one batch of data
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        batch = np.empty((self.batch_size, self.dim))
        for index1, elem in enumerate(indexes):
          batch[index1] = self.data[index1]

        return batch, batch

    def on_epoch_end(self):
        # Updating the index after each epoch
        self.indexes = np.arange(self.data.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indexes)


# Creating the generators
train_gen = DataGenerator(train_feat, 256)
val_gen = DataGenerator(val_feat, 256)
test_gen = DataGenerator(test_feat, 256)

In [9]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Dropout
import keras_tuner as kt
from keras.models import Model


# Creating the hyperparameter tuner function
# The AE has 5 Dense layers besides the output, and the 3rd one is the bottleneck layer
def create_ae_optimal(hp):
  input = Input(shape=(4860))
  
  hp_units_1 = hp.Int('units_1', min_value=1500, max_value=2500, step=125)
  encoded = Dense(units=hp_units_1, activation="relu", kernel_initializer='HeNormal')(input)
  encoded = Dropout(rate=hp.Float('dropout_1', min_value = 0.0, max_value = 0.5, default = 0.25, step = 0.05))(encoded)

  hp_units_2 = hp.Int('units_2', min_value=750, max_value=1250, step=50)
  encoded = Dense(units=hp_units_2, activation="relu", kernel_initializer='HeNormal')(encoded)
  encoded = Dropout(rate=hp.Float('dropout_2', min_value = 0.0, max_value = 0.5, default = 0.25, step = 0.05))(encoded)

  hp_units_3 = hp.Int('units_3', min_value=400, max_value=600, step=25)
  encoded = Dense(units=hp_units_3, activation="relu", kernel_initializer='HeNormal')(encoded)
  encoded = Dropout(rate=hp.Float('dropout_3', min_value = 0.0, max_value = 0.5, default = 0.25, step = 0.05))(encoded)

  hp_units_4 = hp.Int('units_4', min_value=750, max_value=1250, step=50)
  decoded = Dense(units=hp_units_4, activation="relu", kernel_initializer='HeNormal')(encoded)
  decoded = Dropout(rate=hp.Float('dropout_4', min_value = 0.0, max_value = 0.5, default = 0.25, step = 0.05))(decoded)

  hp_units_5 = hp.Int('units_5', min_value=1500, max_value=2500, step=125)
  decoded = Dense(units=hp_units_5, activation="relu", kernel_initializer='HeNormal')(decoded)
  decoded = Dropout(rate=hp.Float('dropout_5', min_value = 0.0, max_value = 0.5, default = 0.25, step = 0.05))(decoded)

  output = Dense(4860)(decoded)

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  hp_momentum = hp.Choice('momentum', values=[0.9, 0.95, 0.99])

  model = Model(input, output)

  # Our early experiments showed that SGD is slightly better here than ADAM
  model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=hp_learning_rate, momentum=hp_momentum),
                loss='mse',
                metrics=['mse'])
  
  return model

In [10]:
# Setting up the tuner
tuner = kt.Hyperband(create_ae_optimal,
                     objective='val_loss',
                     max_epochs=10,
                     factor=3,
                     directory='/content/ae_opt',
                     project_name='ae_opt1')

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#note: takes about 20-25 minutes to optimize
stop_early = EarlyStopping(monitor='val_loss', patience=10)
tuner.search(train_gen, epochs=100, validation_data=val_gen, verbose=0, shuffle=True, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]



In [12]:
# Fitting and retraining the model with the best params
hypermodel = tuner.hypermodel.build(best_hps)

early_stopping=EarlyStopping(patience=20, verbose=1, min_delta=1e-5)
checkpointer=ModelCheckpoint(filepath='weights1.hdf5', save_best_only=True, verbose=1)

hypermodel.fit(train_gen, epochs=500, verbose=1, validation_data=val_gen, callbacks=[checkpointer, early_stopping])

Epoch 1/500
Epoch 1: val_loss improved from inf to 0.56570, saving model to weights1.hdf5
Epoch 2/500
Epoch 2: val_loss improved from 0.56570 to 0.42814, saving model to weights1.hdf5
Epoch 3/500
Epoch 3: val_loss improved from 0.42814 to 0.36683, saving model to weights1.hdf5
Epoch 4/500
Epoch 4: val_loss improved from 0.36683 to 0.34375, saving model to weights1.hdf5
Epoch 5/500
Epoch 5: val_loss improved from 0.34375 to 0.33557, saving model to weights1.hdf5
Epoch 6/500
Epoch 6: val_loss improved from 0.33557 to 0.33057, saving model to weights1.hdf5
Epoch 7/500
Epoch 7: val_loss did not improve from 0.33057
Epoch 8/500
Epoch 8: val_loss did not improve from 0.33057
Epoch 9/500
Epoch 9: val_loss improved from 0.33057 to 0.32984, saving model to weights1.hdf5
Epoch 10/500
Epoch 10: val_loss did not improve from 0.32984
Epoch 11/500
Epoch 11: val_loss did not improve from 0.32984
Epoch 12/500
Epoch 12: val_loss did not improve from 0.32984
Epoch 13/500
Epoch 13: val_loss did not impro

<keras.callbacks.History at 0x7f696c4bf520>

In [13]:
# Loading back the best weights and checking the layers
hypermodel.load_weights('weights1.hdf5')
hypermodel.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4860)]            0         
                                                                 
 dense_6 (Dense)             (None, 2000)              9722000   
                                                                 
 dropout_5 (Dropout)         (None, 2000)              0         
                                                                 
 dense_7 (Dense)             (None, 800)               1600800   
                                                                 
 dropout_6 (Dropout)         (None, 800)               0         
                                                                 
 dense_8 (Dense)             (None, 575)               460575    
                                                                 
 dropout_7 (Dropout)         (None, 575)               0   

In [14]:
# Keeping only the encoder
model2 = Model(inputs=hypermodel.input, outputs=hypermodel.layers[-7].output)
model2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4860)]            0         
                                                                 
 dense_6 (Dense)             (None, 2000)              9722000   
                                                                 
 dropout_5 (Dropout)         (None, 2000)              0         
                                                                 
 dense_7 (Dense)             (None, 800)               1600800   
                                                                 
 dropout_6 (Dropout)         (None, 800)               0         
                                                                 
 dense_8 (Dense)             (None, 575)               460575    
                                                                 
Total params: 11,783,375
Trainable params: 11,783,375
Non-t

In [15]:
# Generating the new data using the output of the bottleneck layer
train_new = model2.predict(train_gen)
val_new = model2.predict(val_gen)
test_new = model2.predict(test_gen)



In [None]:
# Exporting the feature-label set pairs as pickle files onto Google Drive
import pickle

with open('train.pkl', 'wb') as f:  
    pickle.dump([train_new, train_spec], f)

with open('val.pkl', 'wb') as f:  
    pickle.dump([val_new, val_spec], f)

with open('test.pkl', 'wb') as f:  
    pickle.dump([test_new, test_spec], f)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp train.pkl drive/MyDrive/DeepLearning/train.pkl
!cp val.pkl drive/MyDrive/DeepLearning/val.pkl
!cp test.pkl drive/MyDrive/DeepLearning/test.pkl

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<h3>Option 2: Incremental PCA</h3>

In [None]:
from sklearn.decomposition import IncrementalPCA

n_comp = 200

# note: takes about 10 minutes to run
pca = IncrementalPCA(n_components=n_comp, batch_size=1024)
pca.fit(train_feat)
train_feat = pca.transform(train_feat)
val_feat = pca.transform(val_feat)
test_feat = pca.transform(test_feat)

In [None]:
# Exporting the new data

with open('train_v2.pkl', 'wb') as f:  
    pickle.dump([train_feat, train_spec], f)

with open('val_v2.pkl', 'wb') as f:  
    pickle.dump([val_feat, val_spec], f)

with open('test_v2.pkl', 'wb') as f:  
    pickle.dump([test_feat, test_spec], f)

# Copying to drive
from google.colab import drive
drive.mount('/content/drive')

!cp train_v2.pkl drive/MyDrive/DeepLearning/train_v2.pkl
!cp val_v2.pkl drive/MyDrive/DeepLearning/val_v2.pkl
!cp test_v2.pkl drive/MyDrive/DeepLearning/test_v2.pkl

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
