In [None]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import librosa.display
import glob


# Output folders for the rendered spectrogram images, one per keyword class.
os.makedirs('spectrograms/bed', exist_ok=True)
os.makedirs('spectrograms/happy', exist_ok=True)


# Root folder that holds one sub-folder of .wav recordings per keyword.
audio_dir = "/audio"


# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps the
# index that ends up in each spectrogram's filename stable from run to run.
bed_files = sorted(glob.glob(os.path.join(audio_dir, 'bed/*.wav')))
happy_files = sorted(glob.glob(os.path.join(audio_dir, 'happy/*.wav')))


# Report the count and a short sample rather than dumping every path.
print("Bed files:", len(bed_files), bed_files[:3])
print("Happy files:", len(happy_files), happy_files[:3])

def create_spectrogram(audio_file, label, index, title=None):
    """Render one audio file as a mel-spectrogram PNG under ``spectrograms/<label>/``.

    The clip is loaded at its native sampling rate, converted to a 128-band
    mel power spectrogram capped at 8 kHz, scaled to decibels relative to its
    own peak, and written to ``<index>_<label>_spectrogram.png``.

    Args:
        audio_file: Path of the audio file to read.
        label: Class name; selects the output sub-folder and is part of the
            output filename.
        index: Number used as the filename prefix.
        title: Optional text drawn above the plot. Nothing is drawn by
            default: a title that spells out ``label`` ends up in the pixels
            of the saved image, so a model trained on these PNGs could read
            the class from the text instead of from the spectrogram.

    Returns:
        The path of the PNG that was written.
    """
    # sr=None keeps the file's own sampling rate instead of resampling.
    y, sr = librosa.load(audio_file, sr=None)

    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    filename = os.path.join('spectrograms', label, f'{index}_{label}_spectrogram.png')
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Explicit figure/axes so nothing depends on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        mesh = librosa.display.specshow(mel_spectrogram_db, sr=sr, x_axis='time',
                                        y_axis='mel', fmax=8000, ax=ax)
        fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
        if title is not None:
            ax.set_title(title)
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Always release the figure; this function is called once per clip.
        plt.close(fig)

    return filename


# Render every recording, numbering the images from 1 within each keyword.
# 'bed' is processed before 'happy'.
for keyword, recordings in (('bed', bed_files), ('happy', happy_files)):
    for number, audio_file in enumerate(recordings, start=1):
        create_spectrogram(audio_file, keyword, number)

print("Spectrograms generated and saved.")


In [None]:
import os
import numpy as np
import cv2 
import glob


spectrogram_dir = 'spectrograms/'


def load_data(spectrogram_dir):
    """Load the spectrogram PNGs of both keywords as a labelled image set.

    Images are read from ``<spectrogram_dir>/bed/*.png`` (label 0) and
    ``<spectrogram_dir>/happy/*.png`` (label 1), each resized to 128x128.
    Files are visited in sorted path order so the result is reproducible.

    Args:
        spectrogram_dir: Folder containing the ``bed`` and ``happy`` sub-folders.

    Returns:
        ``(X, y)`` as NumPy arrays: the stacked images as returned by
        ``cv2.imread``/``cv2.resize`` (not normalised) and the integer label
        of each image. Both arrays are empty when no PNG is found.

    Raises:
        ValueError: If a matched file cannot be decoded as an image.
    """
    class_ids = (('bed', 0), ('happy', 1))

    X = []
    y = []
    for class_name, class_id in class_ids:
        pattern = os.path.join(spectrogram_dir, f'{class_name}/*.png')
        for img_path in sorted(glob.glob(pattern)):
            img = cv2.imread(img_path)
            # cv2.imread signals failure by returning None rather than raising;
            # fail here with the offending path instead of inside cv2.resize.
            if img is None:
                raise ValueError(f"Could not read image: {img_path}")
            X.append(cv2.resize(img, (128, 128)))
            y.append(class_id)

    return np.array(X), np.array(y)


# Read every spectrogram image together with its class id.
X, y = load_data(spectrogram_dir)

# Scale pixel intensities into [0, 1] as float32.
X = np.true_divide(X.astype('float32'), 255.0)

print("Loaded data shapes:", *(arr.shape for arr in (X, y)))


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the images for testing; stratify=y keeps the class
# proportions the same in both parts, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for caption, split in (
    ("Training data shape:", X_train),
    ("Testing data shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, split.shape)


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


# Re-create the 80/20 split from the raw integer labels so this cell can be
# re-run safely (to_categorical must not be applied twice). stratify=y keeps
# the class balance, matching the stratified split made earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert labels to categorical (one-hot) format
y_train = to_categorical(y_train, num_classes=2)  # 2 classes: bed and happy
y_test = to_categorical(y_test, num_classes=2)

print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# Encoder shared by both branches of the Siamese model
def create_base_network(input_shape):
    """Build the convolutional encoder applied to each image of a pair.

    The stack is three Conv2D(3x3, ReLU) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, followed by Flatten and a Dense(128, ReLU) layer.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    stages = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
    ]
    return tf.keras.Sequential(stages)

# Create the base network
input_shape = (128, 128, 3)
base_network = create_base_network(input_shape)

# Define the input layers for the two images
input_a = layers.Input(shape=input_shape)
input_b = layers.Input(shape=input_shape)

# Both inputs go through the SAME network instance, so the weights are shared.
encoded_a = base_network(input_a)
encoded_b = base_network(input_b)

# Compare the embeddings with an element-wise absolute difference,
# |a - b| = max(a - b, b - a). A signed difference fed into a single sigmoid
# unit makes the score depend on argument order: the pairs (a, b) and (b, a)
# get logits w.d + c and -w.d + c, which cannot both be pushed low while the
# zero-difference logit c stays high. Only built-in layers are used so the
# saved model reloads without custom objects.
diff_ab = layers.Subtract()([encoded_a, encoded_b])
diff_ba = layers.Subtract()([encoded_b, encoded_a])
merged = layers.Maximum()([diff_ab, diff_ba])

# Single sigmoid unit: values near 1 mean "same keyword", near 0 "different".
output = layers.Dense(1, activation='sigmoid')(merged)

siamese_model = Model(inputs=[input_a, input_b], outputs=output)

siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

siamese_model.summary()


In [None]:
def create_pairs(X, y, max_pairs=None, seed=None):
    """Build labelled sample pairs for training a Siamese network.

    Positive pairs (label 1) are every unordered combination of two distinct
    samples that share a class. Negative pairs (label 0) are every ordered
    combination of two samples from different classes, so each cross-class
    pair appears in both orders. Positives come first, grouped by class in
    ascending class order, followed by the negatives in row-major (i, j) order.

    The number of pairs grows quadratically with the number of samples and
    every pair holds two full copies of a sample, so use ``max_pairs`` to
    keep the result within memory on larger datasets.

    Args:
        X: Array-like of samples indexed along the first axis.
        y: 1-D array-like of class labels, one per sample.
        max_pairs: Optional upper bound on the number of pairs returned. When
            the full set is larger, a uniform random subset of this size is
            kept, in the original order. ``None`` (default) keeps every pair.
        seed: Seed for the subsampling random generator; only used when
            ``max_pairs`` actually truncates the result.

    Returns:
        ``(pairs, labels)`` where ``pairs`` has shape
        ``(n_pairs, 2, *X.shape[1:])`` and ``labels`` has shape ``(n_pairs,)``.

    Raises:
        ValueError: If ``y`` is not 1-D, if ``X`` and ``y`` differ in length,
            or if ``max_pairs`` is negative.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError("y must be a 1-D array of class labels")
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of samples")
    if max_pairs is not None and max_pairs < 0:
        raise ValueError("max_pairs must be non-negative")

    left_chunks = []
    right_chunks = []

    # Positive pairs: the strict upper triangle (i < j) within each class.
    for cls in np.unique(y):
        members = np.where(y == cls)[0]
        upper_i, upper_j = np.triu_indices(len(members), k=1)
        left_chunks.append(members[upper_i])
        right_chunks.append(members[upper_j])
    n_positive = sum(len(chunk) for chunk in left_chunks)

    # Negative pairs: every ordered (i, j) whose labels differ.
    neg_left, neg_right = np.nonzero(y[:, None] != y[None, :])
    left_chunks.append(neg_left)
    right_chunks.append(neg_right)

    index_pairs = np.stack(
        [np.concatenate(left_chunks), np.concatenate(right_chunks)], axis=1
    ).astype(np.intp)
    labels = np.concatenate(
        [np.ones(n_positive, dtype=int), np.zeros(len(neg_left), dtype=int)]
    )

    if max_pairs is not None and max_pairs < len(labels):
        rng = np.random.default_rng(seed)
        # Sorting the chosen positions preserves the original pair order.
        keep = np.sort(rng.choice(len(labels), size=max_pairs, replace=False))
        index_pairs = index_pairs[keep]
        labels = labels[keep]

    # Indexing with an (n_pairs, 2) index array gathers the samples straight
    # into the final (n_pairs, 2, ...) array, with no intermediate list.
    return X[index_pairs], labels


In [None]:
# Assuming X and y are your image data and integer class labels.
#
# Split the IMAGES first and build pairs inside each split. Pairing all images
# and then splitting the pairs at random puts the same image into both training
# and validation pairs, which inflates the validation score. The split is made
# from the raw integer labels in y, because create_pairs compares labels
# element by element.
from sklearn.model_selection import train_test_split

X_pair_train, X_pair_val, y_pair_train, y_pair_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pairs_train, labels_train = create_pairs(X_pair_train, y_pair_train)
pairs_val, labels_val = create_pairs(X_pair_val, y_pair_val)

# Separate the two members of every pair for the two model inputs.
# Axis 1 of a pairs array selects the first/second image of each pair.
input_a_train = pairs_train[:, 0]
input_b_train = pairs_train[:, 1]
input_a_val = pairs_val[:, 0]
input_b_val = pairs_val[:, 1]


In [None]:
# Train the model on the training pairs, monitoring the validation pairs
EPOCHS = 10
BATCH_SIZE = 32

train_inputs = [input_a_train, input_b_train]
val_inputs = [input_a_val, input_b_val]

siamese_model.fit(
    train_inputs,
    labels_train,
    validation_data=(val_inputs, labels_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)


In [None]:
# Score the trained model on the validation pairs
eval_inputs = [input_a_val, input_b_val]
eval_metrics = siamese_model.evaluate(eval_inputs, labels_val)

# evaluate returns the loss followed by the compiled metrics (accuracy here).
loss, accuracy = eval_metrics
summary_line = f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}'
print(summary_line)


In [None]:
# Write the trained model to an HDF5 file in the working directory.
model_path = 'siamese_model.h5'
siamese_model.save(model_path)


# PIPELINE

In [None]:
import os
import numpy as np
import cv2 
import librosa  
from tensorflow.keras.models import load_model


# Load the Siamese model from 'siamese_model.h5' in the working directory
# (the file name the training section saves to).
model = load_model('siamese_model.h5')


def audio_to_spectrogram(audio_file, output_dir='spectrograms/temp'):
    """Render an audio file as a mel-spectrogram PNG for inference.

    The rendering settings (native sampling rate, 128 mel bands, 8 kHz cap,
    dB scale relative to the peak, 10x4 inch figure with colorbar and tight
    layout) follow ``create_spectrogram`` so that the image fed to the model
    is laid out like the images it was trained on.

    Args:
        audio_file: Path of the audio file to read.
        output_dir: Folder for the rendered image; created if missing.

    Returns:
        Path of the written image, ``<output_dir>/temp_spectrogram.png``.
        The same file name is reused, so each call overwrites the last one.
    """
    # sr=None keeps the file's own sampling rate instead of resampling.
    y, sr = librosa.load(audio_file, sr=None)

    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    log_S = librosa.power_to_db(S, ref=np.max)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    spectrogram_path = os.path.join(output_dir, 'temp_spectrogram.png')

    # Draw on a dedicated figure of the training size instead of whatever
    # figure happens to be current in the pyplot state machine.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        mesh = librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel',
                                        fmax=8000, ax=ax)
        fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
        fig.tight_layout()
        fig.savefig(spectrogram_path)
    finally:
        plt.close(fig)

    return spectrogram_path


def prepare_spectrogram_for_prediction(spectrogram_path):
    """Turn a spectrogram image into a normalised 128x128 model input.

    Args:
        spectrogram_path: Path of the image file to read. An already decoded
            image array (raw 0-255 pixel values, as returned by
            ``cv2.imread``) is accepted as well and used as-is.

    Returns:
        The image resized to 128x128 as float32 with values divided by 255.

    Raises:
        FileNotFoundError: If a path is given and the image cannot be read.
        ValueError: If ``None`` is given (e.g. the result of a failed
            ``cv2.imread`` call made by the caller).
    """
    if spectrogram_path is None:
        raise ValueError("No image given; was the file read successfully?")

    if isinstance(spectrogram_path, (str, os.PathLike)):
        img = cv2.imread(os.fspath(spectrogram_path))
        # cv2.imread returns None instead of raising when the file is
        # missing or not decodable.
        if img is None:
            raise FileNotFoundError(
                f"Could not read spectrogram image: {spectrogram_path}"
            )
    else:
        img = np.asarray(spectrogram_path)

    img = cv2.resize(img, (128, 128))  # Resize to the model input size
    img = img.astype('float32') / 255.0  # Normalize to [0, 1]
    return img


def predict_keyword(audio_file,
                    bed_reference_path='spectrograms/bed/1_bed_spectrogram.png',
                    happy_reference_path='spectrograms/happy/1_happy_spectrogram.png'):
    """Compare an audio clip against one reference spectrogram per keyword.

    The clip is rendered to a spectrogram and paired with a reference
    spectrogram of 'bed' and of 'happy'; the Siamese ``model`` scores each
    pair, and a score above 0.5 is reported as the keyword being present.

    Args:
        audio_file: Path of the audio file to classify.
        bed_reference_path: Spectrogram image used as the 'bed' reference.
            Defaults to the first image written by the generation step.
        happy_reference_path: Spectrogram image used as the 'happy' reference.

    Returns:
        Dict mapping ``'bed'`` and ``'happy'`` to their similarity score.

    Raises:
        FileNotFoundError: If a reference image does not exist.
    """
    references = (('bed', bed_reference_path), ('happy', happy_reference_path))

    # Fail early, before any audio processing, if a reference is missing.
    for keyword, reference_path in references:
        if not os.path.isfile(reference_path):
            raise FileNotFoundError(
                f"Reference spectrogram for '{keyword}' not found: {reference_path}"
            )

    # Step 1: Render the audio clip as a spectrogram image
    spectrogram_path = audio_to_spectrogram(audio_file)

    # Step 2: Prepare the spectrogram for prediction
    spectrogram_img = prepare_spectrogram_for_prediction(spectrogram_path)
    input_a = np.expand_dims(spectrogram_img, axis=0)  # add the batch axis

    # Step 3: Score the clip against each reference. The helper is given the
    # reference PATH, since it reads the image file itself.
    raw_scores = {}
    for keyword, reference_path in references:
        reference_img = prepare_spectrogram_for_prediction(reference_path)
        input_b = np.expand_dims(reference_img, axis=0)
        # predict returns one row per batch item with one score column.
        raw_scores[keyword] = model.predict([input_a, input_b])[0][0]

    # Step 4: Report the result for each keyword
    for keyword, score in raw_scores.items():
        if score > 0.5:
            print(f"The audio contains '{keyword}'. Similarity Score:", score)
        else:
            print(f"The audio does not contain '{keyword}'. Similarity Score:", score)

    return {keyword: float(score) for keyword, score in raw_scores.items()}


# Path of the recording to classify. It is left empty here, so skip the
# prediction instead of failing on an empty path when the notebook runs top to bottom.
audio_file = ''
if audio_file:
    predict_keyword(audio_file)
else:
    print("Set audio_file to the path of a .wav recording to run the prediction.")
