In [61]:
import tensorflow as tf
# Switch off TensorFloat-32 execution (the flag passed to TensorFlow's TF32 toggle is False).
tf.config.experimental.enable_tensor_float_32_execution(False)

In [62]:


# Index of the physical GPU this notebook prefers, and the memory cap (in MB)
# applied to the single logical device created on it.
PREFERRED_GPU_INDEX = 1
GPU_MEMORY_LIMIT_MB = 32768

gpus = tf.config.list_physical_devices('GPU')

if gpus:
    # Fall back to the last detected GPU when the preferred index does not exist
    # (e.g. a single-GPU machine) instead of failing with an uncaught IndexError.
    selected_gpu = gpus[min(PREFERRED_GPU_INDEX, len(gpus) - 1)]
    try:
        # Restrict TensorFlow to the selected device only.
        tf.config.set_visible_devices(selected_gpu, 'GPU')

        # Create one logical device on it with a fixed memory limit.
        tf.config.experimental.set_virtual_device_configuration(
            selected_gpu,
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)]
        )

        # The message reports the limit that was actually applied; memory growth
        # is not enabled by this cell, so it is no longer claimed.
        print(f"Successfully configured GPU {selected_gpu} with memory limit of {GPU_MEMORY_LIMIT_MB}MB.")

    except (RuntimeError, ValueError) as e:
        # Device configuration can be rejected, e.g. once the runtime is already initialized.
        print("Error during GPU configuration:", e)
else:
    print("No GPU found.")

No GPU found.


In [63]:
# Print the physical GPUs TensorFlow reports (an empty list means none were detected).
print(tf.config.list_physical_devices('GPU'))

[]


In [64]:
# Re-query the physical GPUs and mark every one of them as visible to TensorFlow.
# NOTE(review): this passes the full list, so it supersedes the single-GPU
# selection made in the earlier configuration cell — confirm that is intended.
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus, 'GPU')

In [65]:

# Sanity check: report how many GPUs are detected, then run a tiny matmul.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


# Two constant 2x2 matrices and their product.
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[1.0, 1.0], [0.0, 1.0]])
c = tf.matmul(a, b)

print("Result of matrix multiplication: \n", c.numpy())

Num GPUs Available:  0
Result of matrix multiplication: 
 [[1. 3.]
 [3. 7.]]


In [66]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.layers import (
    Input, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout,
    Permute, Reshape, Bidirectional, GRU, TimeDistributed, Dense,
    LayerNormalization, Add, MultiHeadAttention, GlobalAveragePooling1D
)
from tensorflow.keras.callbacks import ModelCheckpoint

from tensorflow.keras.layers import Input, Dense, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout, Reshape, Permute, GlobalAveragePooling1D, Add, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Input, Dense, Conv2D, BatchNormalization, Activation, MaxPooling2D, Dropout, Permute, Reshape, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K



In [67]:
# from tensorflow.keras.layers import MultiHeadAttention, Dense, LayerNormalization, Conv1D, Add, Dropout

# def conformer_block(x, head_size, num_heads, ff_dim, dropout=0.1, kernel_size=31):
#     # Feed Forward Module (first half)
#     ff1 = Dense(ff_dim, activation='relu')(x)
#     ff1 = Dropout(dropout)(ff1)
#     ff1 = Dense(x.shape[-1])(ff1)
#     x = Add()([x, 0.5 * ff1])

#     # Multi-Head Self Attention Module
#     attn = LayerNormalization(epsilon=1e-6)(x)
#     attn = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(attn, attn)
#     attn = Dropout(dropout)(attn)
#     x = Add()([x, attn])

#     # Convolution Module
#     conv = LayerNormalization(epsilon=1e-6)(x)
#     conv = Conv1D(filters=x.shape[-1], kernel_size=1, padding='same', activation='relu')(conv)
#     conv = Conv1D(filters=x.shape[-1], kernel_size=kernel_size, padding='same', activation='relu', groups=x.shape[-1])(conv)
#     conv = Conv1D(filters=x.shape[-1], kernel_size=1, padding='same')(conv)
#     conv = Dropout(dropout)(conv)
#     x = Add()([x, conv])

#     # Feed Forward Module (second half)
#     ff2 = Dense(ff_dim, activation='relu')(x)
#     ff2 = Dropout(dropout)(ff2)
#     ff2 = Dense(x.shape[-1])(ff2)
#     x = Add()([x, 0.5 * ff2])

#     # Final Layer Norm
#     x = LayerNormalization(epsilon=1e-6)(x)
#     return x
import tensorflow as tf
from tensorflow.keras.layers import LayerNormalization, Dense, Dropout, MultiHeadAttention, Conv1D, Add, Multiply, Lambda, Activation, DepthwiseConv1D, BatchNormalization

def branchformer_block(x, head_size, num_heads, ff_dim, dropout=0.1, kernel_size=31, block_idx=0):
    """Apply one feed-forward / (attention + gated convolution) / feed-forward block.

    The block is built from Keras layers in this order:
      1. a half-weighted feed-forward residual (Dense -> Dropout -> Dense, scaled by 0.5),
      2. two parallel branches computed from the same tensor — multi-head
         self-attention and a gated convolution branch — summed and added back
         to the input as a residual,
      3. a second half-weighted feed-forward residual,
      4. a final LayerNormalization.

    Args:
        x: Input Keras tensor. Its last dimension is used as the channel width of
            every projection. (Assumed to be (batch, steps, channels) because
            Conv1D and MultiHeadAttention are applied to it — confirm with caller.)
        head_size: ``key_dim`` passed to MultiHeadAttention.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of both feed-forward modules.
        dropout: Dropout rate used in every Dropout layer and inside attention.
        kernel_size: Kernel size of the depthwise convolution.
        block_idx: Integer used to build the unique ``branchformer{block_idx}_``
            layer-name prefix.

    Returns:
        The output Keras tensor, with the same last dimension as ``x``.
    """
    tag = f"branchformer{block_idx}"
    # Every Add in this block preserves the last dimension, so it is read once.
    channels = x.shape[-1]

    # --- Feed-forward module, first half (residual weighted by 0.5) ---
    ffn_first = Dense(ff_dim, activation='relu', name=f"{tag}_ff1_dense1")(x)
    ffn_first = Dropout(dropout, name=f"{tag}_ff1_dropout")(ffn_first)
    ffn_first = Dense(channels, name=f"{tag}_ff1_dense2")(ffn_first)
    ffn_first_half = Lambda(lambda z: 0.5 * z)(ffn_first)
    x = Add(name=f"{tag}_ff1_add")([x, ffn_first_half])

    # --- Branch A: multi-head self-attention on the normalized input ---
    attn_in = LayerNormalization(epsilon=1e-6, name=f"{tag}_attn_ln")(x)
    self_attention = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=head_size,
        dropout=dropout,
        name=f"{tag}_attn"
    )
    attn_branch = self_attention(attn_in, attn_in)
    attn_branch = Dropout(dropout, name=f"{tag}_attn_dropout")(attn_branch)

    # --- Branch B: gated convolution on a separately normalized copy ---
    conv_in = LayerNormalization(epsilon=1e-6, name=f"{tag}_conv_ln")(x)

    # Gated linear unit: a linear pointwise projection multiplied by a sigmoid gate.
    glu_value = Conv1D(filters=channels, kernel_size=1, padding='same', name=f"{tag}_conv_u")(conv_in)
    glu_gate = Conv1D(filters=channels, kernel_size=1, padding='same', activation='sigmoid', name=f"{tag}_conv_v")(conv_in)
    gated = Multiply(name=f"{tag}_glu_out")([glu_value, glu_gate])

    # Depthwise convolution -> batch norm -> swish.
    depthwise = DepthwiseConv1D(kernel_size=kernel_size, padding='same', name=f"{tag}_depthwise")(gated)
    depthwise = BatchNormalization(name=f"{tag}_dw_bn")(depthwise)
    depthwise = Activation('swish', name=f"{tag}_swish")(depthwise)

    # Pointwise projection back to the channel width, then dropout.
    conv_branch = Conv1D(filters=channels, kernel_size=1, padding='same', name=f"{tag}_conv_pw2")(depthwise)
    conv_branch = Dropout(dropout, name=f"{tag}_conv_dropout")(conv_branch)

    # --- Sum the two branches and add them back as a residual ---
    branches = Add(name=f"{tag}_merge")([attn_branch, conv_branch])
    x = Add(name=f"{tag}_residual_merge")([x, branches])

    # --- Feed-forward module, second half (residual weighted by 0.5) ---
    ffn_second = Dense(ff_dim, activation='relu', name=f"{tag}_ff2_dense1")(x)
    ffn_second = Dropout(dropout, name=f"{tag}_ff2_dropout")(ffn_second)
    ffn_second = Dense(channels, name=f"{tag}_ff2_dense2")(ffn_second)
    ffn_second_half = Lambda(lambda z: 0.5 * z)(ffn_second)
    x = Add(name=f"{tag}_ff2_add")([x, ffn_second_half])

    # --- Output normalization ---
    return LayerNormalization(epsilon=1e-6, name=f"{tag}_ln_out")(x)

In [68]:
import tensorflow.keras.backend as K

def masked_categorical_crossentropy(y_true, y_pred):
    """Categorical cross-entropy averaged only over targets that carry a label.

    A target vector counts as valid when at least one of its entries along the
    last axis is greater than zero; all-zero targets contribute nothing to the
    loss. The summed loss of the valid entries is divided by the number of
    valid entries, with the divisor floored at 1.0 so an entirely masked batch
    yields 0 rather than a division by zero.

    Args:
        y_true: Target tensor whose last axis holds the class vector.
        y_pred: Predicted class probabilities, same shape as ``y_true``.

    Returns:
        A scalar tensor with the masked mean loss.
    """
    # 1.0 where the target has any positive entry, 0.0 otherwise
    # (the last axis is reduced away).
    valid = K.cast(K.any(y_true > 0, axis=-1), K.floatx())

    # Per-target cross-entropy; the class axis is reduced, so this matches `valid`.
    per_target_loss = K.categorical_crossentropy(y_true, y_pred)

    # Never divide by less than one valid target.
    n_valid = K.maximum(K.sum(valid), 1.0)

    return K.sum(per_target_loss * valid) / n_valid


In [None]:
'''PREPROCESSING BLOCK'''
# Pipeline: wav -> mel spectrogram -> dB scale -> normalized (dB / 80 + 1) feature array
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Root folder that contains one sub-folder per entry of SETS_TO_PROCESS.
# The location can be overridden with the DRONE_AUDIO_DIR environment variable so
# the notebook runs on other machines; the original path remains the default.
DATA = os.environ.get(
    "DRONE_AUDIO_DIR",
    "/Users/neilkalipersad/Desktop/Research Lab Files/ML Program Code/all_audio/",
)
SETS_TO_PROCESS = ['clean_audio', 'noisy_audio']

# --- Parameters ---
sample_rate = 16000    # sampling rate every clip is loaded at
duration = 1.0         # clip length passed to librosa.load (seconds)
n_mels = 256           # number of mel bands in each spectrogram
desired_frames = 61    # frame count every spectrogram is padded/cropped to

# --- Initialize empty lists ---
x_data = []      # one normalized mel spectrogram per clip
y_labels = []    # one class-name string per clip, aligned with x_data

# Keyword-folder names that are labelled 'emergency'.
emergency_phrases = [
    # stop
    'stop', 'stop flying', 'halt', 'freeze', 'cease', 'terminate', 'abort',
    'wait', 'stay', 'pause', 'hold', 'brake', 'end', 'cut',

    # no
    'no', 'nope',

    # screams
    'ahh', 'ah', 'woah', 'wow',

    # profanity
    # ('damn' and 'crap' used to be fused into the single entry 'damncrap'
    #  because the comma between the two literals was missing)
    'shit', 'fuck', 'god', 'damn', 'crap',
]
# Keyword-folder names that are labelled 'movement'.
movement_phrases = [
    # general
    'hover', 'fly', 'turn', 'move', 'go',

    # backward
    'backward', 'back', 'reverse', 'back out', 'backwards',

    # down
    'down', 'drop', 'fall',

    # forward ('forward' was previously listed twice)
    'forward', 'straight', 'ahead', 'in',

    # left
    'left',

    # right
    'right',

    # up
    'up', 'lift', 'start',

    # yes
    'yes', 'ok', 'sure', 'yea', 'yup',
]

# Keyword folders actually encountered on disk, grouped by the label they received.
used_emergency_phrases = []
used_movement_phrases = []
used_other_phrases = []

print("Starting data loading...")
#  Loop through each data set ('clean_audio', 'noisy_audio')
for data_set in SETS_TO_PROCESS:
    set_path = os.path.join(DATA, data_set)

    if not os.path.isdir(set_path):
        print(f"Warning: Directory not found, skipping: {set_path}")
        continue

    print(f"Processing set: {set_path}")
    #  Loop through each keyword folder ('backward', 'down', etc.)
    # os.listdir() returns entries in arbitrary order; sorting makes the sample
    # order (and the order of the used_* lists) identical on every run and machine.
    for keyword_folder in sorted(os.listdir(set_path)):
        keyword_path = os.path.join(set_path, keyword_folder)
        if not os.path.isdir(keyword_path):
            continue

        # Map the folder name to one of the three class labels and remember
        # which folders contributed to each class.
        if keyword_folder in emergency_phrases:
            label = 'emergency'
            if keyword_folder not in used_emergency_phrases:
                used_emergency_phrases.append(keyword_folder)
        elif keyword_folder in movement_phrases:
            label = 'movement'
            if keyword_folder not in used_movement_phrases:
                used_movement_phrases.append(keyword_folder)
        else:
            label = 'other'
            if keyword_folder not in used_other_phrases:
                used_other_phrases.append(keyword_folder)

        # --- Loop through each .wav file ---
        for file in sorted(os.listdir(keyword_path)):
            if not file.endswith(".wav"):
                continue

            # Load and process the audio file
            path = os.path.join(keyword_path, file)
            audio, _ = librosa.load(path, sr=sample_rate, duration=duration)

            # Force every clip to exactly `sample_rate` samples:
            # zero-pad short clips at the end, truncate long ones.
            if len(audio) < sample_rate:
                audio = np.pad(audio, (0, sample_rate - len(audio)))
            else:
                audio = audio[:sample_rate]

            # Feature extraction: mel power spectrogram -> dB relative to the
            # clip maximum -> divided by 80 and shifted by 1.
            mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            mel_spec_db = mel_spec_db / 80.0 + 1.0

            # Record feature and label together, only after the clip was processed
            # successfully, so x_data and y_labels can never get out of step.
            x_data.append(mel_spec_db)
            y_labels.append(label)

print("Data loading complete.")


# Fail early with a clear message when nothing was loaded; otherwise the
# frame-count lookup below raises an opaque "tuple index out of range".
if not x_data:
    raise RuntimeError(
        f"No .wav files were loaded from {DATA!r}; check the data path and folder layout."
    )

# Stack the per-clip spectrograms and append a trailing channel axis.
x_train = np.array(x_data)
x_train = x_train[..., np.newaxis]

# Zero-pad or crop axis 2 (the spectrogram frame axis) to `desired_frames`.
current_frames = x_train.shape[2]
if current_frames < desired_frames:
    pad_width = ((0, 0), (0, 0), (0, desired_frames - current_frames), (0, 0))
    x_train = np.pad(x_train, pad_width, mode='constant')
elif current_frames > desired_frames:
    x_train = x_train[:, :, :desired_frames, :]


# Encode the string labels as integers, then as one-hot vectors.
label_encoder = LabelEncoder()
y_doa_int = label_encoder.fit_transform(y_labels)

y_doa = to_categorical(y_doa_int)
# The event-detection target is 1 for every clip.
y_sed = np.ones((len(y_doa), 1))

num_classes = len(label_encoder.classes_)
print(f"\n✅ Found {num_classes} unique classes: {label_encoder.classes_}")
print('movement_phrases:', used_movement_phrases)
print('emergency_phrases:', used_emergency_phrases)
print('other_phrases:', used_other_phrases)
print(f"Number of samples: {len(x_train)}")

Starting data loading...
Processing set: /Users/neilkalipersad/Desktop/Research Lab Files/ML Program Code/all_audio/clean_audio
Processing set: /Users/neilkalipersad/Desktop/Research Lab Files/ML Program Code/all_audio/noisy_audio
Data loading complete.

✅ Found 3 unique classes: ['emergency' 'movement' 'other']
movement_phrases: ['right', 'backward', 'go', 'left', 'up', 'down', 'yes', 'forward']
emergency_phrases: ['no', 'wow', 'stop']
other_phrases: ['on', 'off']
Number of samples: 54459


In [None]:
from tensorflow.keras.layers import Input, Permute, Reshape, GlobalAveragePooling1D, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def build_resnet_conformer_seld(input_shape=(61,256,6), num_layers=1, head_size=32, num_heads=4, ff_dim=256, dropout_rate=0.15, fnn_units=(128,), n_classes=None):
    """Build the convolutional front-end + branchformer model with two output heads.

    Architecture, in order:
      * three Conv2D(64, 3x3) -> BatchNormalization -> ReLU -> MaxPooling2D stages
        that pool only along the second input axis (by 4, 4 and 2),
      * a reshape to ``(input_shape[0], -1)`` so the first input axis becomes the
        sequence axis,
      * ``num_layers`` ``branchformer_block`` layers,
      * global average pooling over the sequence (layer ``embedding_output``),
      * a sigmoid head ``sed_output`` (1 unit) and a softmax head ``doa_output``
        (``n_classes`` units), each behind Dense(fnn_units[0]) + Dropout.

    Args:
        input_shape: Shape of one input example, without the batch axis.
        num_layers: Number of branchformer blocks.
        head_size: ``key_dim`` of the attention layers.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward modules inside each block.
        dropout_rate: Dropout rate used in the blocks and in both heads.
        fnn_units: Sequence whose first element is the hidden width of both heads.
            (The default is a tuple rather than a list to avoid a shared mutable
            default; lists are still accepted.)
        n_classes: Number of units of the softmax head. When ``None`` the
            notebook-level ``num_classes`` variable is used, as before.

    Returns:
        ``(full_model, train_model)``: two models sharing the same layers.
        ``full_model`` outputs ``[sed, doa, embedding]``; ``train_model`` outputs
        ``[sed, doa]``.
    """
    if n_classes is None:
        # Backward-compatible fallback to the value computed by the preprocessing cell.
        n_classes = num_classes

    spec_input = Input(shape=input_shape)

    # Convolutional front-end: identical stages that differ only in pooling factor.
    x = spec_input
    for pool_size in ((1, 4), (1, 4), (1, 2)):
        x = Conv2D(filters=64, kernel_size=(3, 3), padding='same')(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=pool_size)(x)

    # Permute((1, 2, 3)) keeps the axis order unchanged; it is retained so the
    # layer list of the model stays exactly as it was.
    x = Permute((1, 2, 3))(x)
    # Flatten the pooled axis and the channels into one feature axis per step.
    x = Reshape((input_shape[0], -1))(x)

    for i in range(num_layers):
        x = branchformer_block(x, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout_rate, block_idx=i)

    embedding_output = GlobalAveragePooling1D(name='embedding_output')(x)

    # Event-detection head (single sigmoid unit).
    sed_output = Dense(fnn_units[0], activation='relu', name="sed_dense")(embedding_output)
    sed_output = Dropout(dropout_rate, name="sed_dropout")(sed_output)
    sed_output = Dense(1, activation='sigmoid', name='sed_output')(sed_output)

    # Classification head (softmax over the classes).
    doa_output = Dense(fnn_units[0], activation='relu', name="doa_dense")(embedding_output)
    doa_output = Dropout(dropout_rate, name="doa_dropout")(doa_output)
    doa_output = Dense(n_classes, activation='softmax', name='doa_output')(doa_output)

    full_model = Model(inputs=spec_input, outputs=[sed_output, doa_output, embedding_output])
    train_model = Model(inputs=spec_input, outputs=[sed_output, doa_output])

    return full_model, train_model

# Build both model views for inputs of shape (256, 61, 1).
full_model, train_model = build_resnet_conformer_seld(input_shape=(256, 61, 1))

# Compile with Adam at a learning rate of 8e-5 and per-output loss weights.
# No loss is passed here; the previously used setting is kept for reference:
#   loss=['binary_crossentropy', masked_categorical_crossentropy]
train_model.compile(optimizer=Adam(learning_rate=8e-5), loss_weights=[10.0, 10.0])

train_model.summary()


In [None]:
import joblib
import matplotlib.pyplot as plt

print("Compiling model...")
# NOTE(review): optimizer='adam' uses the optimizer's default settings and replaces
# the Adam(learning_rate=0.00008) configured when the model was built — confirm
# which learning rate is intended.
train_model.compile(
    optimizer='adam',
    loss={
        'sed_output': 'binary_crossentropy',
        'doa_output': 'categorical_crossentropy'
    },
    # Both heads must contribute to the optimized loss. A weight of 0.0 on
    # 'doa_output' removes the classification loss from training, so the
    # class-prediction head would never be trained. The recorded training log
    # (total loss = doa loss + sed loss) corresponds to a weight of 1.0.
    loss_weights={
        'sed_output': 1.0,
        'doa_output': 1.0
    }
)

print("Starting model training...")
# Targets are passed in the same order as the model outputs: [sed, doa].
history = train_model.fit(x_train, [y_sed, y_doa], epochs=170, batch_size=64)
print("✅ Training complete.")

print("\nGenerating training loss graph...")

# Per-epoch loss curves recorded by fit().
loss_history = history.history
total_loss = loss_history['loss']
sed_loss = loss_history['sed_output_loss']
doa_loss = loss_history['doa_output_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(total_loss) + 1)

# Draw the three curves on one figure, in a fixed order so the legend is stable.
plt.figure(figsize=(12, 6))
for _curve, _fmt, _legend in (
    (total_loss, 'r', 'Total Loss'),
    (doa_loss, 'b', 'Word Prediction Loss (DOA)'),
    (sed_loss, 'g', 'Event Detection Loss (SED)'),
):
    plt.plot(epochs, _curve, _fmt, label=_legend)
plt.title('Model Training Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()


# Report the epoch with the lowest total training loss (printed 1-based).
print("\n--- Finding Best Epoch ---")
best_epoch_index = np.argmin(total_loss)
min_loss = total_loss[best_epoch_index]
print(f"✅ Best Epoch: #{best_epoch_index + 1}")
print(f"   - Minimum Total Loss: {min_loss:.4f}")

# Save the model in its current state. No checkpoint callback is passed to fit(),
# so these are the weights after the last epoch, not those of the best epoch above.
print("\nSaving model and label encoder...")
model_path = 'model.keras'
train_model.save(model_path)
print(f"Model saved to: {model_path}")

# Persist the fitted LabelEncoder so predictions can be mapped back to class names.
encoder_path = 'label_encoder.joblib'
joblib.dump(label_encoder, encoder_path)
print(f"LabelEncoder saved to: {encoder_path}")

Compiling model...
Starting model training...
Epoch 1/170
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m693s[0m 810ms/step - doa_output_loss: 0.7480 - loss: 0.7586 - sed_output_loss: 0.0106
Epoch 2/170
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m758s[0m 890ms/step - doa_output_loss: 0.5240 - loss: 0.5241 - sed_output_loss: 4.6816e-05
Epoch 3/170
[1m649/851[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m2:19[0m 692ms/step - doa_output_loss: 0.4499 - loss: 0.4499 - sed_output_loss: 1.8640e-05

KeyboardInterrupt: 

In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf
import joblib

# Load a saved model for inference only (compile=False skips restoring the training setup).
# SECURITY: safe_mode=False permits deserializing Lambda layers, which can execute
# arbitrary code — only load model files from a trusted source.
# NOTE(review): this file name differs from the 'model.keras' written by the
# training cell — confirm which artifact is meant to be evaluated.
model_path = 'drone_up_model.keras'
train_model = tf.keras.models.load_model(model_path, compile=False, safe_mode=False)
print(f"Model loaded from {model_path}")

# SECURITY: joblib.load unpickles the file and can execute arbitrary code —
# only load encoder files from a trusted source.
encoder_path = 'drone_up_label_encoder.joblib'
label_encoder = joblib.load(encoder_path)
print(f"LabelEncoder loaded from {encoder_path}")

# Folder whose .wav files are used for the prediction run below.
TEST_DATA_DIR = "/Users/neilkalipersad/Desktop/Research Lab Files/ML Program Code/test_audio/"


def process_audio_for_prediction(file_paths, sr=16000, n_mels=256, duration=1.0, desired_frames=61):
    """Turn audio files into a batch of normalized mel spectrograms.

    Each file is loaded at ``sr``, zero-padded or truncated to ``int(sr * duration)``
    samples, converted to a mel power spectrogram, expressed in dB relative to
    its own maximum and rescaled with ``dB / 80 + 1``. The stacked batch gets a
    trailing channel axis and its axis 2 (the frame axis) is zero-padded or
    cropped to ``desired_frames``.

    Args:
        file_paths: Iterable of paths to audio files.
        sr: Sampling rate the audio is loaded at.
        n_mels: Number of mel bands.
        duration: Clip length in seconds.
        desired_frames: Number of frames in the returned batch.

    Returns:
        A NumPy array holding one spectrogram per input file.
    """
    target_len = int(sr * duration)

    spectrograms = []
    for wav_path in file_paths:
        samples, _ = librosa.load(wav_path, sr=sr, duration=duration)

        # Bring the waveform to exactly `target_len` samples.
        if len(samples) < sr * duration:
            samples = np.pad(samples, (0, target_len - len(samples)))
        else:
            samples = samples[:target_len]

        # Mel power spectrogram -> dB (relative to the clip maximum) -> rescaled.
        mel_power = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels)
        mel_db = librosa.power_to_db(mel_power, ref=np.max)
        spectrograms.append(mel_db / 80.0 + 1.0)

    # Stack into one array and add the trailing channel axis.
    batch = np.expand_dims(np.array(spectrograms), axis=-1)

    # Pad with zeros or crop along the frame axis.
    n_frames = batch.shape[2]
    if n_frames > desired_frames:
        batch = batch[:, :, :desired_frames, :]
    elif n_frames < desired_frames:
        frame_padding = ((0, 0), (0, 0), (0, desired_frames - n_frames), (0, 0))
        batch = np.pad(batch, frame_padding, mode='constant')
    return batch

# --- 3. Predict and Interpret Results ---
# Collect every .wav file in the test folder.
test_files = [os.path.join(TEST_DATA_DIR, f) for f in os.listdir(TEST_DATA_DIR) if f.endswith('.wav')]

if not test_files:
    print(f"⚠️ No .wav files found in '{TEST_DATA_DIR}'. Please check the path.")
else:
    x_test = process_audio_for_prediction(test_files)
    predictions = train_model.predict(x_test)
    # Output index 1 is the class-probability head.
    word_predictions = predictions[1]
    predicted_indices = np.argmax(word_predictions, axis=1)
    predicted_words = label_encoder.inverse_transform(predicted_indices)

    correct_predictions = 0
    total_predictions = len(test_files)
    # Files whose name carries no label cannot be scored and are left out of the tally.
    skipped_predictions = 0

    print("\n--- Prediction Results ---")
    for file_path, class_probs, predicted_label in zip(test_files, word_predictions, predicted_words):
        filename = os.path.basename(file_path)
        confidence = np.max(class_probs)

        # The true label is the second '_'-separated token of the file name
        # (e.g. 'left' in 'noisy_left_330_8.wav').
        # NOTE(review): this only matches predictions when the loaded label
        # encoder uses the same vocabulary as the file names — verify if the
        # encoder holds intent classes such as 'emergency' / 'movement'.
        name_parts = filename.split('_')
        if len(name_parts) < 2:
            # Previously this raised IndexError and aborted the whole report.
            skipped_predictions += 1
            print(f"File: {filename}")
            print("  - Skipped: file name does not contain a '_<label>_' token")
            print("-" * 25)
            continue
        true_label = name_parts[1]

        print(f"File: {filename}")
        print(f"  - True Label: '{true_label}'")
        print(f"  - Predicted Intent: '{predicted_label}' (Confidence: {confidence:.2f})")

        # Compare prediction to the true label
        if predicted_label == true_label:
            correct_predictions += 1
            print("  - Result: Correct ✅")
        else:
            print("  - Result: Incorrect ❌")
        print("-" * 25)

    scored_predictions = total_predictions - skipped_predictions
    if scored_predictions > 0:
        accuracy = (correct_predictions / scored_predictions) * 100
        print("\n--- Final Tally ---")
        print(f"Total Correct Predictions: {correct_predictions}")
        print(f"Total Incorrect Predictions: {scored_predictions - correct_predictions}")
        if skipped_predictions:
            print(f"Skipped Files (no label in name): {skipped_predictions}")
        print(f"Accuracy: {accuracy:.2f}%")



Model loaded from drone_up_model.keras
LabelEncoder loaded from drone_up_label_encoder.joblib
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step

--- Prediction Results ---
File: noisy_left_330_8.wav
  - True Label: 'left'
  - Predicted Word: 'left' (Confidence: 0.99)
  - Result: Correct ✅
-------------------------
File: noisy_stop_0_182.wav
  - True Label: 'stop'
  - Predicted Word: 'stop' (Confidence: 1.00)
  - Result: Correct ✅
-------------------------
File: noisy_left_0_189.wav
  - True Label: 'left'
  - Predicted Word: 'left' (Confidence: 1.00)
  - Result: Correct ✅
-------------------------
File: noisy_left_0_188.wav
  - True Label: 'left'
  - Predicted Word: 'left' (Confidence: 1.00)
  - Result: Correct ✅
-------------------------
File: noisy_stop_0_183.wav
  - True Label: 'stop'
  - Predicted Word: 'stop' (Confidence: 1.00)
  - Result: Correct ✅
-------------------------
File: noisy_stop_0_181.wav
  - True Label: 'stop'
  - Predicted Word: 'stop' (Confide