In [9]:
import os
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

data_dir = '../dataset'
annotations_file = f'{data_dir}/development_scene_annotations.csv'

# Load annotations
annotations = pd.read_csv(annotations_file)

def extract_features(y, sr, n_mels=128):
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec.T

def pad_features(features, max_len):
    padded_features = []
    for feature in features:
        pad_width = max_len - feature.shape[0]
        if pad_width > 0:
            feature = np.pad(feature, ((0, pad_width), (0, 0)), mode='constant')
        padded_features.append(feature)
    return np.array(padded_features)

def prepare_command_data(annotations, data_dir, sr=16000, n_mels=128):
    command_features = []
    command_labels = []
    command_mapping = {}  # Mapping of command texts to numerical labels
    current_label = 0
    max_len = 0  # To determine the maximum length of features

    for index, row in annotations.iterrows():
        audio_path = os.path.join(data_dir, 'scenes', 'wav', row['filename'] + '.wav')
        y, _ = librosa.load(audio_path, sr=sr)
        start_sample = int(row['start'] * sr)
        end_sample = int(row['end'] * sr)
        
        command_text = row['command']  # Assuming the command text is in this column
        if command_text not in command_mapping:
            command_mapping[command_text] = current_label
            current_label += 1
        
        command_label = command_mapping[command_text]
        command_segment = y[start_sample:end_sample]
        features = extract_features(command_segment, sr, n_mels)
        max_len = max(max_len, features.shape[0])  # Update max_len
        
        command_features.append(features)
        command_labels.append(command_label)

    # Pad features to the same length
    command_features = pad_features(command_features, max_len)

    return np.array(command_features), np.array(command_labels), command_mapping

def create_boundary_detection_data(annotations, data_dir, window_size=0.5, step_size=0.1, sr=16000, n_mels=128):
    windows = []
    labels = []
    for index, row in annotations.iterrows():
        audio_path = os.path.join(data_dir, 'scenes', 'wav', row['filename'] + '.wav')
        y, _ = librosa.load(audio_path, sr=sr)
        start_sample = int(row['start'] * sr)
        end_sample = int(row['end'] * sr)

        for i in range(start_sample, end_sample - int(window_size * sr), int(step_size * sr)):
            window = y[i:i + int(window_size * sr)]
            features = extract_features(window, sr, n_mels)
            label = 1 if (i == start_sample or i + int(window_size * sr) >= end_sample) else 0
            windows.append(features)
            labels.append(label)

    return np.array(windows), np.array(labels)

def build_boundary_detection_model(input_shape):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_command_recognition_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def detect_command_pairs(audio_path, boundary_model, command_model, sr=16000, window_size=0.5, step_size=0.1, n_mels=128):
    y, _ = librosa.load(audio_path, sr=sr)
    window_samples = int(window_size * sr)
    step_samples = int(step_size * sr)

    windows = librosa.util.frame(y, frame_length=window_samples, hop_length=step_samples)
    windows = windows.T.reshape((-1, window_samples))

    boundaries = []
    for window in windows:
        features = extract_features(window, sr, n_mels).reshape((1, n_mels, -1, 1))
        boundary_prediction = boundary_model.predict(features)
        boundaries.append(boundary_prediction)

    boundaries = np.where(np.array(boundaries) > 0.5)[0] * step_samples

    command_segments = []
    for start in boundaries:
        end = start + window_samples
        segment = y[start:end]
        segment_features = extract_features(segment, sr, n_mels).reshape((1, n_mels, -1, 1))
        command_prediction = command_model.predict(segment_features)
        command_segments.append((segment, command_prediction))

    return command_segments

# Prepare command features and labels
command_features, command_labels, command_mapping = prepare_command_data(annotations, data_dir)

# Reshape features for the CNN
command_features = command_features.reshape((command_features.shape[0], command_features.shape[1], command_features.shape[2], 1))

# Normalize features
command_features = command_features / np.max(command_features)

# One-hot encode labels
num_classes = len(command_mapping)
command_labels = to_categorical(command_labels, num_classes=num_classes)

print(command_mapping)

# Prepare data for boundary detection
windows, labels = create_boundary_detection_data(annotations, data_dir)

# Reshape data for the CNN
windows = windows.reshape((windows.shape[0], windows.shape[1], windows.shape[2], 1))

# Build and train the boundary detection model
input_shape = (windows.shape[1], windows.shape[2], 1)
boundary_model = build_boundary_detection_model(input_shape)
boundary_model.fit(windows, labels, epochs=10, batch_size=32, validation_split=0.2)

# Build and train the command recognition model
input_shape = (command_features.shape[1], command_features.shape[2], 1)
command_model = build_command_recognition_model(input_shape, num_classes)
command_model.fit(command_features, command_labels, epochs=10, batch_size=32, validation_split=0.2)

# Detect commands in a new audio file
new_audio_path = f'{data_dir}/scenes/wav/2015_speech_true_Ofen_aus_Alarm_an.wav'
detected_commands = detect_command_pairs(new_audio_path, boundary_model, command_model)

# Print recognized commands
for segment, command in detected_commands:
    print(command)


2024-06-04 08:27:44.690828: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'Licht aus': 0, 'Ofen an': 1, 'Radio an': 2, 'Fernseher an': 3, 'Heizung aus': 4, 'Alarm an': 5, 'Lüftung aus': 6, 'Staubsauger aus': 7, 'Heizung an': 8, 'Staubsauger an': 9, 'Alarm aus': 10, 'Licht an': 11, 'Ofen aus': 12, 'Radio aus': 13, 'Lüftung an': 14, 'Fernseher aus': 15}


  super().__init__(


Epoch 1/10


ValueError: Exception encountered when calling Conv2D.call().

[1mNegative dimension size caused by subtracting 3 from 2 for '{{node sequential_1/conv2d_2_1/convolution}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_1/max_pooling2d_1_2/MaxPool2d, sequential_1/conv2d_2_1/convolution/ReadVariableOp)' with input shapes: [?,2,30,64], [3,3,64,128].[0m

Arguments received by Conv2D.call():
  • inputs=tf.Tensor(shape=(None, 2, 30, 64), dtype=float32)