In [11]:
import os
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

data_dir = '/mnt/data'
annotations_file = f'{data_dir}/development_scene_annotations.csv'

# Load annotations
logging.info('Loading annotations...')
annotations = pd.read_csv(annotations_file)
logging.info('Annotations loaded.')

def extract_features(y, sr, n_mels=128):
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec.T

def pad_features(features, max_len):
    padded_features = []
    for feature in features:
        pad_width = max_len - feature.shape[0]
        if pad_width > 0:
            feature = np.pad(feature, ((0, pad_width), (0, 0)), mode='constant')
        padded_features.append(feature)
    return np.array(padded_features)

def prepare_command_data(annotations, data_dir, sr=16000, n_mels=128):
    command_features = []
    command_labels = []
    command_mapping = {}  # Mapping of command texts to numerical labels
    current_label = 0
    max_len = 0  # To determine the maximum length of features

    logging.info('Preparing command data...')
    for index, row in tqdm(annotations.iterrows(), total=annotations.shape[0]):
        audio_path = os.path.join(data_dir, 'scenes', 'wav', row['filename'] + '.wav')
        y, _ = librosa.load(audio_path, sr=sr)
        start_sample = int(row['start'] * sr)
        end_sample = int(row['end'] * sr)
        
        command_text = row['command']  # Assuming the command text is in this column
        if command_text not in command_mapping:
            command_mapping[command_text] = current_label
            current_label += 1
        
        command_label = command_mapping[command_text]
        command_segment = y[start_sample:end_sample]
        features = extract_features(command_segment, sr, n_mels)
        max_len = max(max_len, features.shape[0])  # Update max_len
        
        command_features.append(features)
        command_labels.append(command_label)

    # Pad features to the same length
    command_features = pad_features(command_features, max_len)
    logging.info('Command data prepared.')

    return np.array(command_features), np.array(command_labels), command_mapping

def create_boundary_detection_data(annotations, data_dir, window_size=0.5, step_size=0.1, sr=16000, n_mels=128):
    windows = []
    labels = []
    logging.info('Creating boundary detection data...')
    for index, row in tqdm(annotations.iterrows(), total=annotations.shape[0]):
        audio_path = os.path.join(data_dir, 'scenes', 'wav', row['filename'] + '.wav')
        y, _ = librosa.load(audio_path, sr=sr)
        start_sample = int(row['start'] * sr)
        end_sample = int(row['end'] * sr)

        for i in range(start_sample, end_sample - int(window_size * sr), int(step_size * sr)):
            window = y[i:i + int(window_size * sr)]
            features = extract_features(window, sr, n_mels)
            label = 1 if (i == start_sample or i + int(window_size * sr) >= end_sample) else 0
            windows.append(features)
            labels.append(label)
    
    logging.info('Boundary detection data created.')
    return np.array(windows), np.array(labels)

def build_boundary_detection_model(input_shape):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_command_recognition_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def detect_command_pairs(audio_path, boundary_model, command_model, sr=16000, window_size=0.5, step_size=0.1, n_mels=128):
    y, _ = librosa.load(audio_path, sr=sr)
    window_samples = int(window_size * sr)
    step_samples = int(step_size * sr)

    windows = librosa.util.frame(y, frame_length=window_samples, hop_length=step_samples)
    windows = windows.T.reshape((-1, window_samples))

    boundaries = []
    for window in windows:
        features = extract_features(window, sr, n_mels).reshape((1, n_mels, -1, 1))
        boundary_prediction = boundary_model.predict(features)
        boundaries.append(boundary_prediction)

    boundaries = np.where(np.array(boundaries) > 0.5)[0] * step_samples

    command_segments = []
    for start in boundaries:
        end = start + window_samples
        segment = y[start:end]
        segment_features = extract_features(segment, sr, n_mels).reshape((1, n_mels, -1, 1))
        command_prediction = command_model.predict(segment_features)
        command_segments.append((segment, command_prediction))

    return command_segments

# Prepare command features and labels
command_features, command_labels, command_mapping = prepare_command_data(annotations, data_dir)

# Reshape features for the CNN
command_features = command_features.reshape((command_features.shape[0], command_features.shape[1], command_features.shape[2], 1))

# Normalize features
command_features = command_features / np.max(command_features)

# One-hot encode labels
num_classes = len(command_mapping)
command_labels = to_categorical(command_labels, num_classes=num_classes)

logging.info(f'Command mapping: {command_mapping}')

# Prepare data for boundary detection
windows, labels = create_boundary_detection_data(annotations, data_dir)

# Reshape data for the CNN
windows = windows.reshape((windows.shape[0], windows.shape[1], windows.shape[2], 1))

# Build and train the boundary detection model
input_shape = (windows.shape[1], windows.shape[2], 1)
boundary_model = build_boundary_detection_model(input_shape)
logging.info('Training boundary detection model...')
boundary_model.fit(windows, labels, epochs=10, batch_size=32, validation_split=0.2)
logging.info('Boundary detection model trained.')

# Build and train the command recognition model
input_shape = (command_features.shape[1], command_features.shape[2], 1)
command_model = build_command_recognition_model(input_shape, num_classes)
logging.info('Training command recognition model...')
command_model.fit(command_features, command_labels, epochs=10, batch_size=32, validation_split=0.2)
logging.info('Command recognition model trained.')

# Detect commands in a new audio file
new_audio_path = f'{data_dir}/scenes/wav/2015_speech_true_Ofen_aus_Alarm_an.wav'
logging.info(f'Detecting commands in {new_audio_path}...')
detected_commands = detect_command_pairs(new_audio_path, boundary_model, command_model)

# Print recognized commands
logging.info('Detected commands:')
for segment, command in detected_commands:
    print(command)


2024-06-04 08:41:45,749 - INFO - Loading annotations...
2024-06-04 08:41:45,756 - INFO - Annotations loaded.
2024-06-04 08:41:45,758 - INFO - Preparing command data...
100%|██████████| 1065/1065 [00:12<00:00, 88.11it/s]
2024-06-04 08:41:58,111 - INFO - Command data prepared.
2024-06-04 08:41:58,211 - INFO - Command mapping: {'Licht aus': 0, 'Ofen an': 1, 'Radio an': 2, 'Fernseher an': 3, 'Heizung aus': 4, 'Alarm an': 5, 'Lüftung aus': 6, 'Staubsauger aus': 7, 'Heizung an': 8, 'Staubsauger an': 9, 'Alarm aus': 10, 'Licht an': 11, 'Ofen aus': 12, 'Radio aus': 13, 'Lüftung an': 14, 'Fernseher aus': 15}
2024-06-04 08:41:58,212 - INFO - Creating boundary detection data...
100%|██████████| 1065/1065 [01:17<00:00, 13.73it/s]
2024-06-04 08:43:15,788 - INFO - Boundary detection data created.
  super().__init__(
2024-06-04 08:43:15,954 - INFO - Training boundary detection model...


Epoch 1/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 71ms/step - accuracy: 0.8844 - loss: 0.8289 - val_accuracy: 0.9003 - val_loss: 0.2506
Epoch 2/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 85ms/step - accuracy: 0.9040 - loss: 0.2319 - val_accuracy: 0.8993 - val_loss: 0.2146
Epoch 3/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 95ms/step - accuracy: 0.9015 - loss: 0.2229 - val_accuracy: 0.8797 - val_loss: 0.2376
Epoch 4/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 99ms/step - accuracy: 0.9058 - loss: 0.2053 - val_accuracy: 0.9012 - val_loss: 0.2247
Epoch 5/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 102ms/step - accuracy: 0.9019 - loss: 0.2106 - val_accuracy: 0.9036 - val_loss: 0.2114
Epoch 6/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 103ms/step - accuracy: 0.9162 - loss: 0.1795 - val_accuracy: 0.9017 - val_loss: 0.2116
Epoch 7/10
[1

2024-06-04 08:47:35,398 - INFO - Boundary detection model trained.
2024-06-04 08:47:35,478 - INFO - Training command recognition model...


Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 281ms/step - accuracy: 0.0638 - loss: 2367489.2500 - val_accuracy: 0.1033 - val_loss: 2002.7955
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 276ms/step - accuracy: 0.0663 - loss: 1076.9802 - val_accuracy: 0.0469 - val_loss: 12.5413
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 271ms/step - accuracy: 0.0476 - loss: 14.9981 - val_accuracy: 0.0423 - val_loss: 2.7734
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 294ms/step - accuracy: 0.0576 - loss: 2.9529 - val_accuracy: 0.0657 - val_loss: 2.7731
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 286ms/step - accuracy: 0.0808 - loss: 2.7737 - val_accuracy: 0.0657 - val_loss: 2.7730
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 283ms/step - accuracy: 0.0733 - loss: 2.7703 - val_accuracy: 0.0657 - val_loss: 2.7728
Epoch 7/10
[1

2024-06-04 08:48:56,397 - INFO - Command recognition model trained.
2024-06-04 08:48:56,398 - INFO - Detecting commands in ../dataset/scenes/wav/2015_speech_true_Ofen_aus_Alarm_an.wav...


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense_4" is incompatible with the layer: expected axis -1 of input shape to have value 22528, but received input with shape (1, 4096)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 128, 16, 1), dtype=float32)
  • training=False
  • mask=None