<a href="https://colab.research.google.com/github/v-artur/Golden_Oreos/blob/main/speak_indep_bigru_conv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Obtaining the data

In [None]:
# Features archive: the inner wget stores the session cookies and extracts the Google Drive
# "confirm" token with sed; the outer wget reuses both to save the file as features.zip,
# after which the temporary cookie file is removed.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp" -O features.zip && rm -rf /tmp/cookies.txt
# Original electrode names: same download procedure, saved as subject_channels.zip.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1A2CMLYAMOjET7Bdwt8bjRt8YLQeoVP80" -O subject_channels.zip && rm -rf /tmp/cookies.txt

# Data extraction
import zipfile

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile("/content/features.zip", 'r') as features_zip:
  features_zip.extractall("/content/features")

# Electrode name extraction
with zipfile.ZipFile("/content/subject_channels.zip", 'r') as channels_zip:
  channels_zip.extractall("/content")


--2022-12-09 07:21:10--  https://docs.google.com/uc?export=download&confirm=t&id=1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp
Resolving docs.google.com (docs.google.com)... 108.177.125.102, 108.177.125.139, 108.177.125.113, ...
Connecting to docs.google.com (docs.google.com)|108.177.125.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/usphsnju7cn0bgk6897l018i9lc74i2u/1670570400000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=2190d4b1-aec8-4159-a55a-5ddb1b93a41a [following]
--2022-12-09 07:21:11--  https://doc-08-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/usphsnju7cn0bgk6897l018i9lc74i2u/1670570400000/17895932938140350971/*/1vtZchVzl424pSQBXQ8EBxvVOzcEQPKIp?e=download&uuid=2190d4b1-aec8-4159-a55a-5ddb1b93a41a
Resolving doc-08-9o-docs.googleusercontent.com (doc-08-9o-docs.googleusercontent.com)... 64.233.188.1

<h3>Preparations and needed functions</h3>

In [None]:
import pandas as pd
import numpy as np

# Our goal is to put the feature names in a fixed order to preserve the sequentiality within the feature vectors

# Getting the original electrode names (union over all subjects)
original_electrodes = set()

for subject in ['01','02','03','04','05','06','07','08','09','10']:
  table = pd.read_csv(f'/content/subject_channels/sub-{subject}_task-wordProduction_channels.tsv', sep='\t')
  original_electrodes.update(table['name'])

# Iterating a set of strings yields a different order in every interpreter session
# (string hash randomization). Sorting pins the column layout, so a scaler or model
# weights saved in one session still match the features built in another.
electrode_order = sorted(original_electrodes)

# Now indexing them from -4 to 4 (9 in total); the offset is the outer loop, so the
# list holds all electrodes at T-4 first, then all electrodes at T-3, and so on.
all_electrodes = [elec + "T" + str(offset) for offset in range(-4, 5) for elec in electrode_order]

print('Number of different features:', len(all_electrodes))

#we will use this list's indexes to correspond to the feature matrices

Number of different features: 4860


In [None]:
# Example: the first ten feature names (electrode name + "T" + temporal offset)
print(all_electrodes[:10])

['LH1T-4', 'LU10T-4', 'LY7T-4', 'RK8T-4', 'LK4T-4', 'RA1T-4', 'RC11T-4', 'LG13T-4', 'RM3T-4', 'LD7T-4']


In [None]:
# Function for making every feature vector into a len(all_electrodes)-dimensional vector

def dim_adjust(data, feature_names, electrodes=None):
  """Scatter one subject's feature columns into the shared feature layout.

  Args:
    data: 2-D array of shape (n_samples, n_subject_features).
    feature_names: name of every column of `data`, in column order.
    electrodes: ordered sequence of all feature names; position i names output
      column i. Defaults to the module-level `all_electrodes`.

  Returns:
    Array of shape (n_samples, len(electrodes)). Columns whose name does not
    occur in `feature_names` stay zero.

  Raises:
    ValueError: if a name in `feature_names` is missing from `electrodes`.
  """
  if electrodes is None:
    electrodes = all_electrodes

  # Build the name -> column lookup once instead of scanning the list with
  # list.index for every column of `data`.
  column_of = {}
  for position, name in enumerate(electrodes):
    column_of.setdefault(str(name), position)  # first occurrence wins, like list.index

  try:
    targets = [column_of[str(feature_names[column])] for column in range(data.shape[1])]
  except KeyError as err:
    # Same exception type list.index raised for an unknown name.
    raise ValueError(f"{err.args[0]!r} is not in list") from None

  new_matrix = np.zeros((data.shape[0], len(electrodes)))
  # Explicit integer dtype keeps the assignment valid when `data` has no columns.
  new_matrix[:, np.asarray(targets, dtype=np.intp)] = data

  return new_matrix


## Making the iterated train, validation and test sets

In [None]:
import os

feat_path = "/content/features"

# Function to generate the train, val and test features and mel spectrograms
def generate_features_and_spec(subjects=('01','02','03','04','05','06','07','08','09','10'), feature_dir=None):
  """Build pooled train/val/test feature matrices and spectrogram targets.

  Each subject's recording is cut into 5 consecutive parts; 3 parts go to
  train, 1 to validation and 1 to test (60-20-20). The starting part rotates
  with the subject's position in `subjects`.

  Args:
    subjects: subject identifiers used in the `sub-{id}_*.npy` file names.
    feature_dir: directory holding the .npy files; defaults to the
      module-level `feat_path`.

  Returns:
    Tuple (train_feat, train_spec, val_feat, val_spec, test_feat, test_spec).
    Feature arrays have len(all_electrodes) columns, spectrogram arrays 23.
  """
  if feature_dir is None:
    feature_dir = feat_path

  split_names = ('train', 'val', 'test')
  # Every list starts with an empty float64 array, so the results have the same
  # dtype and width as before, even for an empty subject list. Parts are joined
  # once at the end rather than re-copying the growing arrays for each subject.
  feat_parts = {name: [np.empty((0, len(all_electrodes)))] for name in split_names}
  spec_parts = {name: [np.empty((0, 23))] for name in split_names}

  for index, subject in enumerate(subjects):
    #loading the features, feature names and mel spectrogram of the subject
    data = np.load(os.path.join(feature_dir, f'sub-{subject}_feat.npy'))
    feature_names = np.load(os.path.join(feature_dir, f'sub-{subject}_feat_names.npy'))
    spectrogram = np.load(os.path.join(feature_dir, f'sub-{subject}_spec.npy'))

    #splitting the features and the labels into 5 parts
    # assumes `data` and `spectrogram` have the same number of rows, so the
    # splits stay aligned — TODO confirm
    feat_splits = np.array_split(data, 5)
    spec_splits = np.array_split(spectrogram, 5)

    #choosing the parts for train, val and test using a 60-20-20 ratio
    #because of the "index" changing, the splitting position will iterate with each subject as well
    train_ids = [(index + shift) % 5 for shift in range(3)]
    val_id = (index + 3) % 5
    test_id = (index + 4) % 5

    subject_feats = {
      'train': np.vstack([feat_splits[part] for part in train_ids]),
      'val': feat_splits[val_id],
      'test': feat_splits[test_id],
    }
    subject_specs = {
      'train': np.vstack([spec_splits[part] for part in train_ids]),
      'val': spec_splits[val_id],
      'test': spec_splits[test_id],
    }

    for name in split_names:
      #features are first mapped onto the shared column layout
      feat_parts[name].append(dim_adjust(subject_feats[name], feature_names))
      spec_parts[name].append(subject_specs[name])

  train_feat, val_feat, test_feat = (np.concatenate(feat_parts[name]) for name in split_names)
  train_spec, val_spec, test_spec = (np.concatenate(spec_parts[name]) for name in split_names)

  return train_feat, train_spec, val_feat, val_spec, test_feat, test_spec

# Generating the data
train_feat, train_spec, val_feat, val_spec, test_feat, test_spec = generate_features_and_spec()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardizing the features: the scaler is fitted on the training set only and
# then applied to all three splits
scaler = StandardScaler().fit(train_feat)
train_feat, val_feat, test_feat = (
  scaler.transform(split) for split in (train_feat, val_feat, test_feat)
)

In [None]:
# Reshaping the data for the sequential models: (samples, 9 temporal offsets, electrodes).
# The feature list is ordered offset-first, so a row-major reshape puts the
# temporal offset on axis 1.
# The results get their own names, which the generator cell reads (`*_reshaped`).
# The 2-D arrays are left untouched, so this cell can be re-run safely.
N_TIMESTEPS = 9

train_reshaped = train_feat.reshape(train_feat.shape[0], N_TIMESTEPS, -1)
val_reshaped = val_feat.reshape(val_feat.shape[0], N_TIMESTEPS, -1)
test_reshaped = test_feat.reshape(test_feat.shape[0], N_TIMESTEPS, -1)

# Modeling

In [None]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input, Dropout, Bidirectional, GRU, Flatten

def create_bigru_model(channels, outputsize):
  """Stack three bidirectional GRU layers on a (9, channels) input.

  The GRU layers use 256, 128 and 64 units, all return full sequences and
  apply a dropout of 0.2. Their output is flattened and mapped to
  `outputsize` values by a linear Dense layer.
  """
  model = tf.keras.Sequential()
  model.add(Input(shape=(9, channels)))

  # The three recurrent layers differ only in width.
  for width in (256, 128, 64):
    recurrent = GRU(units=width, return_sequences=True, dropout=0.2)
    model.add(Bidirectional(recurrent))

  model.add(Flatten())
  model.add(Dense(outputsize))
  return model

In [None]:
# Defining the data generator
from tensorflow.keras.utils import Sequence, set_random_seed

set_random_seed(1234)

class DataGenerator(Sequence):
    """Keras `Sequence` serving (features, spectrogram) mini-batches.

    Args:
        data: feature array; the first axis indexes samples and each sample
            has shape `dim`.
        spec: target array; row i is the target for sample i of `data`.
        batch_size: number of samples per batch.
        dim: shape of a single sample.
        shuffle: whether the sample order is re-drawn after every epoch.

    Only full batches are served; the final `len(data) % batch_size` samples
    of the current order are dropped.
    """

    # Initialization
    def __init__(self, data, spec, batch_size=32, dim=(9, 540), shuffle=True):
        # Newer Keras versions expect the base class to be initialized.
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.spec = spec
        self.on_epoch_end()

    def __len__(self):
        # Shows the number of (full) batches per epoch
        return self.data.shape[0] // self.batch_size

    def __getitem__(self, index):
        # Generate one batch of data
        batch_ids = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Select rows by their sample index from the (possibly shuffled) order,
        # not by their position inside the batch. Indexing by position returned
        # the first `batch_size` rows for every batch.
        X_batch = np.empty((len(batch_ids), self.dim[0], self.dim[1]))
        X_batch[:] = self.data[batch_ids]
        y_batch = np.asarray(self.spec[batch_ids], dtype=np.float64)

        return X_batch, y_batch

    def on_epoch_end(self):
        # Updating the index after each epoch
        self.indexes = np.arange(self.data.shape[0])
        if self.shuffle:
            np.random.shuffle(self.indexes)


# Creating the generators
# Every split is paired with its own spectrogram targets. Validation and test
# are not shuffled so each evaluation covers the same samples in the same order.
train_gen = DataGenerator(train_reshaped, train_spec, 256)
val_gen = DataGenerator(val_reshaped, val_spec, 256, shuffle=False)
test_gen = DataGenerator(test_reshaped, test_spec, 256, shuffle=False)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop after 20 epochs without progress; keep only the best weights on disk
early_stopping = EarlyStopping(min_delta=1e-5, patience=20, verbose=1)
checkpointer = ModelCheckpoint(filepath='weights2.hdf5', verbose=1, save_best_only=True)

# 540 input channels per time step, 23 output values
model = create_bigru_model(540, 23)
sgd = tf.keras.optimizers.SGD(learning_rate=1e-4, momentum=0.9)
model.compile(loss='mse', optimizer=sgd, metrics=['mse'])

model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=500,
    callbacks=[checkpointer, early_stopping],
    verbose=1,
)

Epoch 1/500


  super(SGD, self).__init__(name, **kwargs)


Epoch 1: val_loss improved from inf to 3.76635, saving model to weights1.hdf5
Epoch 2/500
Epoch 2: val_loss improved from 3.76635 to 3.66678, saving model to weights1.hdf5
Epoch 3/500
Epoch 3: val_loss improved from 3.66678 to 3.56214, saving model to weights1.hdf5
Epoch 4/500
Epoch 4: val_loss did not improve from 3.56214
Epoch 5/500
Epoch 5: val_loss improved from 3.56214 to 3.53116, saving model to weights1.hdf5
Epoch 6/500
Epoch 6: val_loss did not improve from 3.53116
Epoch 7/500
Epoch 7: val_loss did not improve from 3.53116
Epoch 8/500

KeyboardInterrupt: ignored

In [None]:
# Restore the checkpoint written by ModelCheckpoint (save_best_only=True) and
# report [loss, mse] on the test generator
model.load_weights('weights2.hdf5')
model.evaluate(test_gen)



[5.404033184051514, 5.404033184051514]