In [1]:
from utils.attributes import load_data

In [2]:
# Load the samples and their targets in one call; later cells read
# `.audio`, `.sr` and `.language` from each item of `dataset`.
dataset, labels = load_data()

### Wavelet transform + RNN

In [3]:
import pywt
import numpy as np
def extract_wavelet_features(signal, wavelet='db4', level=5):
    """Summarise a signal by simple statistics of its wavelet decomposition.

    The signal is decomposed with ``pywt.wavedec``; every coefficient array
    it returns contributes its mean followed by its standard deviation.

    Args:
        signal: Samples passed unchanged to ``pywt.wavedec``.
        wavelet: Wavelet name passed to ``pywt.wavedec``.
        level: Decomposition level passed to ``pywt.wavedec``.

    Returns:
        ``np.ndarray`` laid out as ``[mean_0, std_0, mean_1, std_1, ...]``,
        two values per coefficient array.
    """
    bands = pywt.wavedec(signal, wavelet, level=level)
    # Inner loop order (mean, then std) fixes the layout of each pair.
    stats = [stat(band) for band in bands for stat in (np.mean, np.std)]
    return np.array(stats)

In [4]:
# One row per sample: wavelet statistics of the audio followed by the
# language value.
# Assuming data.language is already encoded as a numerical value or one-hot vector
features = [
    np.hstack((extract_wavelet_features(data.audio), data.language))
    for data in dataset
]

In [5]:
# Convert both collections to NumPy arrays; `features.shape[1]` is read
# further down to size the model input.
features, labels = np.array(features), np.array(labels)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Define a custom Dataset class for PyTorch
class AudioDataset(Dataset):
    """In-memory dataset pairing each feature row with one integer label.

    Args:
        features: Array-like converted to a ``float32`` tensor and indexed
            along its first dimension.
        labels: Array-like converted to a ``long`` tensor, one entry per
            feature row.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.long)

        # Fail fast: __len__ follows the features, so a length mismatch would
        # otherwise show up later as an IndexError during iteration or as
        # trailing labels that are never read.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples (rows of ``features``)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]

In [7]:
# Create Dataset and DataLoader
SPLIT_SEED = 42  # fixed so the train/test split and batch order are reproducible

audio_dataset = AudioDataset(features, labels)
train_size = int(0.8 * len(audio_dataset))
test_size = len(audio_dataset) - train_size

# Without an explicit generator, random_split draws from the global RNG, so
# every kernel restart evaluates on a different held-out set and the reported
# accuracy is not comparable between runs.
train_dataset, test_dataset = random_split(
    audio_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# The training loader gets its own seeded generator so the shuffle order is
# repeatable as well; the test loader does not shuffle.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the RNN model
class AudioRNN(nn.Module):
    """Multi-layer ``nn.RNN`` followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each recurrent layer.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Build the initial hidden state from this module's own configuration
        # and from the input's device/dtype. Reading notebook-level globals
        # here would tie every instance to whatever values those names hold
        # at call time.
        h0 = torch.zeros(
            self.rnn.num_layers,
            x.size(0),
            self.rnn.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        # Forward propagate RNN
        out, _ = self.rnn(x, h0)
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = features.shape[1]  # Number of input features
hidden_size = 64
num_layers = 5
num_classes = len(np.unique(labels))  # Number of output classes
num_epochs = 200
# NOTE(review): the recorded run shows the loss hovering around 0.7 and a test
# accuracy of 0.5143; this step size may be too large for Adam -- TODO confirm
# by trying a smaller value. Left unchanged here.
learning_rate = 0.05

# Device configuration
# torch.backends.mps.is_available() is the documented availability check and
# exists on PyTorch releases that do not provide torch.mps.is_available().
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
import torch
from tqdm import tqdm

# Initialize the model, loss function, and optimizer
model = AudioRNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop with progress bar
for epoch in range(num_epochs):
    model.train()
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

    # Running totals for a sample-weighted mean loss over the epoch; the loss
    # of the last batch alone is a noisy indicator of progress.
    loss_sum = 0.0
    samples_seen = 0

    for _features, _labels in train_loader_tqdm:
        _features = _features.to(device)
        _labels = _labels.to(device)

        # Forward pass
        outputs = model(_features.unsqueeze(1))  # Add sequence dimension
        loss = criterion(outputs, _labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_size = _labels.size(0)
        # CrossEntropyLoss averages over the batch by default, so weight each
        # batch by its size to get a per-sample mean.
        loss_sum += batch_loss * batch_size
        samples_seen += batch_size
        train_loader_tqdm.set_postfix(loss=batch_loss, avg_loss=loss_sum / samples_seen)

# Evaluation with progress bar
model.eval()
all_preds = []
all_labels = []
test_loader_tqdm = tqdm(test_loader, desc="Evaluating", unit="batch")

with torch.no_grad():
    for _features, _labels in test_loader_tqdm:
        _features = _features.to(device)
        _labels = _labels.to(device)
        outputs = model(_features.unsqueeze(1))  # Add sequence dimension
        # Autograd is already disabled by no_grad(), so the legacy `.data`
        # accessor is unnecessary.
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(_labels.cpu().numpy())

# Calculate accuracy
accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
print(f'Test Accuracy: {accuracy:.4f}')

# Save the model weights
model_save_path = "model_weights.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model weights saved to {model_save_path}")


Epoch 1/200: 100%|██████████| 35/35 [00:00<00:00, 64.71batch/s, loss=0.707]
Epoch 2/200: 100%|██████████| 35/35 [00:00<00:00, 153.94batch/s, loss=0.698]
Epoch 3/200: 100%|██████████| 35/35 [00:00<00:00, 163.88batch/s, loss=0.804]
Epoch 4/200: 100%|██████████| 35/35 [00:00<00:00, 163.72batch/s, loss=0.917]
Epoch 5/200: 100%|██████████| 35/35 [00:00<00:00, 165.40batch/s, loss=1.1]  
Epoch 6/200: 100%|██████████| 35/35 [00:00<00:00, 165.44batch/s, loss=0.772]
Epoch 7/200: 100%|██████████| 35/35 [00:00<00:00, 156.97batch/s, loss=0.744]
Epoch 8/200: 100%|██████████| 35/35 [00:00<00:00, 167.71batch/s, loss=0.669]
Epoch 9/200: 100%|██████████| 35/35 [00:00<00:00, 162.76batch/s, loss=0.705]
Epoch 10/200: 100%|██████████| 35/35 [00:00<00:00, 164.61batch/s, loss=0.703]
Epoch 11/200: 100%|██████████| 35/35 [00:00<00:00, 168.21batch/s, loss=0.698]
Epoch 12/200: 100%|██████████| 35/35 [00:00<00:00, 165.60batch/s, loss=0.696]
Epoch 13/200: 100%|██████████| 35/35 [00:00<00:00, 167.87batch/s, loss=0.8

Test Accuracy: 0.5143
Model weights saved to model_weights.pth



