1. Parse the CSV File
First, let's inspect the structure of the CSV file to understand its contents.

In [2]:
import pandas as pd

# Load the CSV file to inspect its structure
file_path = '../dataset/development.csv'  # Update with your actual file path
metadata = pd.read_csv(file_path)

# Display the first few rows of the metadata
print(metadata.head())


   id                 filename  speaker_id         word
0   0     words/Brötchen/1.wav           1     Brötchen
1   1         words/kann/1.wav           1         kann
2   2  words/Staubsauger/1.wav           1  Staubsauger
3   3      words/Spiegel/1.wav           1      Spiegel
4   4        words/Alarm/1.wav           1        Alarm


2. Load the WAV Files
Assuming the CSV contains columns like file_path and label, we can use the librosa library to load the WAV files.

In [5]:
import os
import pandas as pd
import librosa

# Define the root directory where the dataset is located
root_dir = '../dataset'  # Replace <root> with the actual path to your root directory

# Load the CSV file
file_path = '../dataset/development.csv'  # Update with your actual file path
metadata = pd.read_csv(file_path)

# Function to load a WAV file using the full path
def load_wav(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    return audio, sr

# Get the relative file path from the CSV and construct the full path
relative_file_path = metadata.loc[0, 'filename']  # Assuming the column name is 'filename'
full_file_path = os.path.join(root_dir, relative_file_path)

# Load the first audio file
audio, sr = load_wav(full_file_path)

# Print the shape of the audio array and the sample rate
print(audio.shape, sr)


(17600,) 16000


In [12]:
import pandas as pd
import librosa
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FastICA
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the root directory where the dataset is located
root_dir = '../dataset'  # Update with your actual path to the dataset directory

# Load the CSV file
file_path = os.path.join(root_dir, 'development.csv')
metadata = pd.read_csv(file_path)

# Function to load a WAV file using the full path
def load_wav(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    return audio, sr

# Directory to save preprocessed data
preprocessed_dir = 'preprocessed_data'
os.makedirs(preprocessed_dir, exist_ok=True)

# Function to extract the label from the directory name
def extract_label(file_path):
    # Assuming the structure is "root_dir/label/filename.wav"
    return os.path.basename(os.path.dirname(file_path))

# Function to check and handle NaNs and Infs
def check_and_handle_nans_infs(array):
    if np.isnan(array).any() or np.isinf(array).any():
        logging.warning(f"NaNs or Infs found in array: replacing with 0s")
        # Replace NaNs with 0
        array = np.nan_to_num(array)
        # Replace Infs with 0
        array[np.isinf(array)] = 0
    return array

# Function to preprocess and save all audio files
def preprocess_and_save(metadata, root_dir):
    scaler = StandardScaler()
    ica = FastICA(n_components=1, whiten='unit-variance')
    
    for i, row in metadata.iterrows():
        relative_file_path = row['filename']
        full_file_path = os.path.join(root_dir, relative_file_path)
        label = extract_label(full_file_path)
        
        try:
            # Load and preprocess audio
            logging.info(f"Processing file {full_file_path}")
            audio, sr = load_wav(full_file_path)
            logging.info(f"Loaded audio shape: {audio.shape}, sample rate: {sr}")
            
            # Check for NaNs or Infs in the original audio
            audio = check_and_handle_nans_infs(audio)
            
            audio_scaled = scaler.fit_transform(audio.reshape(-1, 1)).flatten()
            logging.info(f"Scaled audio shape: {audio_scaled.shape}")
            
            # Check for NaNs or Infs in the scaled audio
            audio_scaled = check_and_handle_nans_infs(audio_scaled)
            
            audio_ica = ica.fit_transform(audio_scaled.reshape(-1, 1)).flatten()
            logging.info(f"ICA transformed audio shape: {audio_ica.shape}")
            
            # Save preprocessed audio and label
            np.save(os.path.join(preprocessed_dir, f'audio_{i}.npy'), audio_ica)
            np.save(os.path.join(preprocessed_dir, f'label_{i}.npy'), label)
        except Exception as e:
            logging.error(f"Error processing file {full_file_path}: {e}")
            continue

# Preprocess and save all audio files
preprocess_and_save(metadata, root_dir)

# Function to load preprocessed data
def load_preprocessed_data(preprocessed_dir):
    X = []
    y = []
    
    for file_name in os.listdir(preprocessed_dir):
        if file_name.startswith('audio'):
            audio = np.load(os.path.join(preprocessed_dir, file_name))
            label_file = file_name.replace('audio', 'label')
            label = np.load(os.path.join(preprocessed_dir, label_file))
            
            X.append(audio)
            y.append(label)
    
    return np.array(X), np.array(y)

# Load preprocessed data
X, y = load_preprocessed_data(preprocessed_dir)

# Split data into training, validation, and test sets
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f'Training set size: {len(X_train)}')
print(f'Validation set size: {len(X_val)}')
print(f'Test set size: {len(X_test)}')


2024-05-20 18:17:45,361 - INFO - Processing file ../dataset/words/Brötchen/1.wav
2024-05-20 18:17:45,365 - INFO - Loaded audio shape: (17600,), sample rate: 16000
2024-05-20 18:17:45,371 - INFO - Scaled audio shape: (17600,)
2024-05-20 18:17:45,374 - INFO - ICA transformed audio shape: (17600,)
2024-05-20 18:17:45,377 - INFO - Processing file ../dataset/words/kann/1.wav
2024-05-20 18:17:45,380 - INFO - Loaded audio shape: (17600,), sample rate: 16000
2024-05-20 18:17:45,383 - INFO - Scaled audio shape: (17600,)
2024-05-20 18:17:45,388 - INFO - ICA transformed audio shape: (17600,)
2024-05-20 18:17:45,391 - INFO - Processing file ../dataset/words/Staubsauger/1.wav
2024-05-20 18:17:45,394 - INFO - Loaded audio shape: (17600,), sample rate: 16000
2024-05-20 18:17:45,397 - INFO - Scaled audio shape: (17600,)
2024-05-20 18:17:45,401 - INFO - ICA transformed audio shape: (17600,)
2024-05-20 18:17:45,404 - INFO - Processing file ../dataset/words/Spiegel/1.wav
2024-05-20 18:17:45,409 - INFO - 

1. Verify Preprocessed Data
Ensure that the preprocessed data was saved correctly and can be loaded for model training.


In [1]:
import os
import numpy as np

# Function to load preprocessed data
def load_preprocessed_data(preprocessed_dir):
    X = []
    y = []
    
    for file_name in os.listdir(preprocessed_dir):
        if file_name.startswith('audio'):
            audio = np.load(os.path.join(preprocessed_dir, file_name))
            label_file = file_name.replace('audio', 'label')
            label = np.load(os.path.join(preprocessed_dir, label_file))
            
            X.append(audio)
            y.append(label)
    
    return np.array(X), np.array(y)

# Load preprocessed data
preprocessed_dir = 'preprocessed_data'
X, y = load_preprocessed_data(preprocessed_dir)

print(f'Loaded {len(X)} preprocessed audio files.')
print(f'Labels: {set(y)}')

# Split data into training, validation, and test sets
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f'Training set size: {len(X_train)}')
print(f'Validation set size: {len(X_val)}')
print(f'Test set size: {len(X_test)}')


Loaded 45278 preprocessed audio files.
Labels: {'Radio', 'Leitung', 'Licht', 'aus', 'Heizung', 'Lüftung', 'Fernseher', 'Alarm', 'an', 'Brötchen', 'nicht', 'Staubsauger', 'other', 'Spiegel', 'wunderbar', 'warm', 'kann', 'offen', 'Schraube', 'Haus', 'Ofen'}
Training set size: 31694
Validation set size: 6792
Test set size: 6792


2. Train and Evaluate Models
Use the preprocessed data to train and evaluate your classifiers.

Random Forest

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the classifier on the validation set
y_pred_val = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
precision_val = precision_score(y_val, y_pred_val, average='weighted')
recall_val = recall_score(y_val, y_pred_val, average='weighted')
f1_val = f1_score(y_val, y_pred_val, average='weighted')

print(f'Random Forest Validation Accuracy: {accuracy_val}')
print(f'Random Forest Validation Precision: {precision_val}')
print(f'Random Forest Validation Recall: {recall_val}')
print(f'Random Forest Validation F1-Score: {f1_val}')


Random Forest Validation Accuracy: 0.41843345111896346
Random Forest Validation Precision: 0.3982329813061643
Random Forest Validation Recall: 0.41843345111896346
Random Forest Validation F1-Score: 0.3938377038849544


Nearest Neighbour

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Train the Nearest Neighbour classifier
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)

# Evaluate the classifier on the validation set
y_pred_val = knn.predict(X_val)
accuracy_val = accuracy_score(y_val, y_pred_val)
precision_val = precision_score(y_val, y_pred_val, average='weighted')
recall_val = recall_score(y_val, y_pred_val, average='weighted')
f1_val = f1_score(y_val, y_pred_val, average='weighted')

print(f'Nearest Neighbour Validation Accuracy: {accuracy_val}')
print(f'Nearest Neighbour Validation Precision: {precision_val}')
print(f'Nearest Neighbour Validation Recall: {recall_val}')
print(f'Nearest Neighbour Validation F1-Score: {f1_val}')


Nearest Neighbour Validation Accuracy: 0.2046525323910483
Nearest Neighbour Validation Precision: 0.24980720257801528
Nearest Neighbour Validation Recall: 0.2046525323910483
Nearest Neighbour Validation F1-Score: 0.20450539164436662


CNN

In [11]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Print the original shape of the data
print(f'Original X_train shape: {X_train.shape}')
print(f'Original X_val shape: {X_val.shape}')
print(f'Original X_test shape: {X_test.shape}')

# Reshape data for 1D CNN
X_train_cnn = X_train.reshape(-1, X_train.shape[1], 1)
X_val_cnn = X_val.reshape(-1, X_val.shape[1], 1)
X_test_cnn = X_test.reshape(-1, X_test.shape[1], 1)

# Print the reshaped data shape
print(f'Reshaped X_train_cnn shape: {X_train_cnn.shape}')
print(f'Reshaped X_val_cnn shape: {X_val_cnn.shape}')
print(f'Reshaped X_test_cnn shape: {X_test_cnn.shape}')

# Encode labels as numeric
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Convert labels to categorical
num_classes = len(np.unique(y_train_encoded))
y_train_cnn = to_categorical(y_train_encoded, num_classes)
y_val_cnn = to_categorical(y_val_encoded, num_classes)
y_test_cnn = to_categorical(y_test_encoded, num_classes)

# Print unique encoded labels to check
print(f'Encoded labels: {np.unique(y_train_encoded)}')

# Build the 1D CNN model
model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the 1D CNN model
history = model.fit(X_train_cnn, y_train_cnn, validation_data=(X_val_cnn, y_val_cnn), epochs=20, batch_size=32)

# Evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(X_val_cnn, y_val_cnn)
print(f'1D CNN Validation Accuracy: {val_accuracy}')


Original X_train shape: (31694, 17600)
Original X_val shape: (6792, 17600)
Original X_test shape: (6792, 17600)
Reshaped X_train_cnn shape: (31694, 17600, 1)
Reshaped X_val_cnn shape: (6792, 17600, 1)
Reshaped X_test_cnn shape: (6792, 17600, 1)
Encoded labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


  super().__init__(


Epoch 1/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m560s[0m 561ms/step - accuracy: 0.2889 - loss: 2.4782 - val_accuracy: 0.6787 - val_loss: 1.0791
Epoch 2/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m660s[0m 666ms/step - accuracy: 0.5367 - loss: 1.3132 - val_accuracy: 0.7304 - val_loss: 0.8256
Epoch 3/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m704s[0m 711ms/step - accuracy: 0.6502 - loss: 0.9776 - val_accuracy: 0.7607 - val_loss: 0.7045
Epoch 4/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m550s[0m 555ms/step - accuracy: 0.7029 - loss: 0.8108 - val_accuracy: 0.7727 - val_loss: 0.6843
Epoch 5/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m809s[0m 816ms/step - accuracy: 0.7532 - loss: 0.6684 - val_accuracy: 0.7764 - val_loss: 0.6786
Epoch 6/20
[1m991/991[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m667s[0m 673ms/step - accuracy: 0.7836 - loss: 0.5770 - val_accuracy: 0.7761 - val_loss: 0.6903
Epoc

KeyboardInterrupt: 