In [None]:
import os
from pathlib import Path
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader



# Locations of the raw audio clips and of the metadata table
_dataset_root = Path("dataset")
audio_path = _dataset_root / "audio"
csv_path = _dataset_root / "esc50.csv"

# Metadata table; later cells read its "filename" and "category" columns
metadata = pd.read_csv(csv_path)

In [None]:

# Feature-extraction settings
sr = 22050      # sampling rate used when loading audio
duration = 5    # ESC50 has 5 sec duration
n_mfcc = 13     # MFCC coefficient count
n_mels = 128    # mel band count

# Output
_processed_root = Path("processed_data")
mfcc_dir = _processed_root / "mfcc"
mel_dir = _processed_root / "mel_spectrogram"
for _out_dir in (mfcc_dir, mel_dir):
    # Create the whole directory chain; a pre-existing directory is not an error
    _out_dir.mkdir(parents=True, exist_ok=True)

# Function to save MFCC and Mel Spectrogram
def save_features(audio_path, filename, sr=22050):
    """Extract MFCC and mel-spectrogram features from one clip and save them as .npy.

    Reads the module-level settings ``duration``, ``n_mfcc``, ``n_mels``,
    ``mfcc_dir`` and ``mel_dir``.

    Args:
        audio_path: Path of the audio file to load.
        filename: Stem for the output files; the arrays are written to
            ``mfcc_dir/<filename>_mfcc.npy`` and
            ``mel_dir/<filename>_mel_spectrogram.npy``.
        sr: Sampling rate the audio is loaded at.

    Returns:
        None. The two feature arrays are written to disk.
    """
    # Load at most `duration` seconds of audio
    y, _ = librosa.load(audio_path, sr=sr, duration=duration)

    # Pad (or trim) to exactly `duration` seconds so every clip yields the same
    # number of frames; downstream models assume a fixed-size feature map.
    y = librosa.util.fix_length(y, size=int(sr * duration))

    # Compute MFCC and Mel_spectrogram using the configured sizes
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # NOTE(review): stored exactly as returned by melspectrogram (no dB conversion)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)

    # Save as .npy inside the configured output directories
    np.save(mfcc_dir / f"{filename}_mfcc.npy", mfcc)
    np.save(mel_dir / f"{filename}_mel_spectrogram.npy", mel_spectrogram)

# Save features
# Only the "filename" column is needed: it gives both the input path and the output stem
for clip_name in metadata["filename"]:
    save_features(f"dataset/audio/{clip_name}", clip_name.split(".")[0])

In [2]:
# Load metadata
# Binds the module-level `metadata` table that load_features() iterates over.
metadata = pd.read_csv("dataset/esc50.csv")

# Function to load features and flatten them
def load_features(feature_type="mfcc"):
    """Read every saved feature array of one type and flatten each into a vector.

    Walks the module-level ``metadata`` table and, for each clip, loads
    ``processed_data/<feature_type>/<stem>_<feature_type>.npy`` where ``<stem>``
    is the part of the filename before its first dot.

    Args:
        feature_type: Name of the feature sub-directory, also used as the
            file-name suffix.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the flattened feature arrays and the
        matching ``category`` labels, both in metadata order.
    """
    source_dir = Path(f"processed_data/{feature_type}")
    samples = []
    labels = []
    for clip_name, category in zip(metadata["filename"], metadata["category"]):
        stem = clip_name.split(".")[0]
        array = np.load(source_dir / f"{stem}_{feature_type}.npy")
        # Collapse to 1-D so the classical models can consume it
        samples.append(array.flatten())
        labels.append(category)
    return np.array(samples), np.array(labels)

# Both feature types use the same hold-out fraction and seed
_split_options = dict(test_size=0.2, random_state=12)

# MFCC: load the flattened features, then split into train / test
X_mfcc, y_mfcc = load_features("mfcc")
(X_train_mfcc, X_test_mfcc,
 y_train_mfcc, y_test_mfcc) = train_test_split(X_mfcc, y_mfcc, **_split_options)

# Mel Spectrogram: same procedure
X_mel, y_mel = load_features("mel_spectrogram")
(X_train_mel, X_test_mel,
 y_train_mel, y_test_mel) = train_test_split(X_mel, y_mel, **_split_options)



In [3]:
# ML Models

# Classical baselines with fixed hyper-parameters
knn = KNeighborsClassifier(n_neighbors=4)
rf = RandomForestClassifier(n_estimators=100, random_state=12)
svm = SVC(kernel="linear")

# Display name -> estimator, in evaluation order
models = dict(zip(("KNN", "Random Forest", "SVM"), (knn, rf, svm)))

# Function to train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test, feature_type):
    """Fit every estimator in the module-level ``models`` dict and print its accuracy.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        feature_type: Label used only in the printed header.

    Returns:
        None. Results are printed; the estimators in ``models`` are refit in place.
    """
    print(f"\nEvaluating models for {feature_type} features:\n")
    for name in models:
        estimator = models[name]
        # Fit on the training split, then score on the held-out split
        predictions = estimator.fit(X_train, y_train).predict(X_test)
        score = accuracy_score(y_test, predictions)
        print(f"{name} Model Accuracy: {score:.2f}")

# Evaluate the baselines on each feature type in turn: MFCC first, then Mel Spectrogram
for _label, _split in (
    ("MFCC", (X_train_mfcc, X_test_mfcc, y_train_mfcc, y_test_mfcc)),
    ("Mel Spectrogram", (X_train_mel, X_test_mel, y_train_mel, y_test_mel)),
):
    train_and_evaluate(*_split, _label)


Evaluating models for MFCC features:

KNN Model Accuracy: 0.26
Random Forest Model Accuracy: 0.39
SVM Model Accuracy: 0.33

Evaluating models for Mel Spectrogram features:

KNN Model Accuracy: 0.09
Random Forest Model Accuracy: 0.34
SVM Model Accuracy: 0.14


In [40]:
class SimpleCNN(nn.Module):
    """Two-block convolutional classifier for single-channel 2-D feature maps.

    Architecture: (conv 3x3 -> ReLU -> 2x2 max-pool) twice, flatten,
    a 128-unit fully connected layer with dropout, then the output layer.

    Args:
        num_classes: Number of output logits.
        input_shape: ``(height, width)`` of the feature maps the model will
            receive. Used to size the first fully connected layer. Defaults to
            the previously hard-coded ``(128, 216)``.
        dropout: Drop probability applied after the first fully connected
            layer. Defaults to the previously hard-coded ``0.8``.
    """

    def __init__(self, num_classes, input_shape=(128, 216), dropout=0.8):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        # Push an all-zero probe through the conv stack to learn how many
        # values reach the first linear layer; no gradients are needed for this.
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_shape)  # (batch, channels, height, width)
            self.flattened_size = self._features(probe).numel()

        # Fully connected layers
        self.fc1 = nn.Linear(self.flattened_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def _features(self, x):
        """Run the two conv -> ReLU -> pool stages."""
        x = self.pool(self.relu(self.conv1(x)))
        return self.pool(self.relu(self.conv2(x)))

    def forward(self, x):
        """Map a ``(batch, 1, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        x = self._features(x)

        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)

        x = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(x)

In [None]:
class AudioDataset(Dataset):
    """Dataset that serves pre-computed ``.npy`` feature arrays with integer labels.

    Args:
        metadata: DataFrame with at least ``filename`` and ``category`` columns.
        feature_type: Suffix of the feature files (``<stem>_<feature_type>.npy``).
        feature_dir: Directory containing the feature files.
        num_classes: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, metadata, feature_type, feature_dir, num_classes):
        self.metadata = metadata
        self.feature_type = feature_type
        self.feature_dir = feature_dir
        self.num_classes = num_classes
        # Category name -> class index, numbered in order of first appearance
        self.label_map = {label: idx for idx, label in enumerate(metadata['category'].unique())}

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the clip at positional index ``idx``.

        ``features`` is a float32 tensor with a leading channel dimension,
        standardised to zero mean and unit variance over the whole array.
        ``label`` is a scalar int64 tensor.
        """
        row = self.metadata.iloc[idx]
        file_name = row["filename"].split(".")[0]
        feature_path = f"{self.feature_dir}/{file_name}_{self.feature_type}.npy"
        features = np.load(feature_path)

        # Standardise per sample. A constant array (e.g. a silent clip) has zero
        # standard deviation; dividing by it would fill the tensor with NaN and
        # poison training, so fall back to a scale of 1 in that case.
        std = np.std(features)
        scale = std if std > 0 else 1.0
        features = (features - np.mean(features)) / scale

        # Add the channel dimension expected by Conv2d
        features = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        label = self.label_map[row["category"]]
        label = torch.tensor(label, dtype=torch.long)
        return features, label


In [None]:
class CRNN(nn.Module):
    """Convolutional-recurrent classifier for 2-D feature maps.

    Two (conv 3x3 -> ReLU -> 2x2 max-pool) stages extract local features; the
    pooled width axis is then treated as a sequence and fed to a bidirectional
    LSTM, whose final states are classified by a linear layer.

    Args:
        input_channels: Number of channels in the input feature map.
        num_classes: Number of output logits.
        rnn_hidden_size: Hidden size of each LSTM direction.
        num_rnn_layers: Number of stacked LSTM layers.
        input_shape: ``(height, width)`` of the expected input; only used to
            size the LSTM input. Defaults to the previously hard-coded
            ``(128, 216)``.
    """

    def __init__(self, input_channels=1, num_classes=50, rnn_hidden_size=128, num_rnn_layers=2,
                 input_shape=(128, 216)):
        super().__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(0.8)

        # Compute input size for RNN with a gradient-free, all-zero probe
        # (no random numbers are drawn, so parameter init is unaffected).
        with torch.no_grad():
            dummy_input = torch.zeros(1, input_channels, *input_shape)  # (batch, channels, height, width)
            self.rnn_input_size = self._get_rnn_input_size(dummy_input)

        # Recurrent layers
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=rnn_hidden_size,
            num_layers=num_rnn_layers,
            batch_first=True,
            bidirectional=True
        )

        # Fully connected layer
        self.fc = nn.Linear(rnn_hidden_size * 2, num_classes)  # *2 for bidirectional

    def _get_rnn_input_size(self, x):
        """Pass dummy data through the CNN layers and return the per-step feature size."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Read the channel count from the tensor rather than hard-coding it
        _, channels, height, _ = x.shape
        return channels * height

    def forward(self, x):
        """Map ``(batch, input_channels, height, width)`` to ``(batch, num_classes)`` logits."""
        # Pass through CNN layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout(x)

        # Build the sequence: one step per width column, each step holding all
        # (channel, height) values of that column. The width axis must be moved
        # in front with permute first; a bare view() would only reinterpret the
        # memory and mix values from different columns into each step.
        batch_size, channels, height, width = x.shape
        x = x.permute(0, 3, 1, 2).reshape(batch_size, width, channels * height)

        # Pass through RNN. h_n is laid out as (num_layers * 2, batch, hidden);
        # its last two entries are the top layer's final forward and final
        # backward states. Indexing the output at the last time step instead
        # would give the backward direction after it has seen a single step.
        _, (h_n, _) = self.rnn(x)
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # Pass through fully connected layer
        x = self.fc(x)
        return x


In [14]:
def train_model(model, train_loader, test_loader, device, num_epochs=25):
    """Train ``model`` with cross-entropy loss and Adam (lr=0.001).

    Args:
        model: Module mapping a feature batch to class logits; moved to ``device``.
        train_loader: Iterable yielding ``(features, labels)`` batches.
        test_loader: Accepted for interface compatibility; not used here.
        device: Device on which training runs.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the mean training loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    loss_history = []
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0  # counted while iterating, so the loader needs no __len__
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Fail with a clear message instead of a ZeroDivisionError below
            raise ValueError("train_loader yielded no batches; nothing to train on")

        mean_loss = epoch_loss / num_batches
        loss_history.append(mean_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {mean_loss:.4f}")

    print("Training complete!")
    return loss_history

In [None]:
def evaluate_model(model, data_loader, device):
    """Compute the classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Module mapping a feature batch to class logits.
        data_loader: Iterable yielding ``(features, labels)`` batches.
        device: Device the batches are moved to before inference.

    Returns:
        Accuracy as a percentage in the range 0-100.
    """
    model.eval()  # switch off training-only behaviour such as dropout
    hits, seen = 0, 0

    with torch.no_grad():
        for batch, targets in data_loader:
            batch = batch.to(device)
            targets = targets.to(device)
            # The predicted class is the index of the largest logit in each row
            guesses = torch.max(model(batch), 1).indices
            seen += targets.size(0)
            hits += (guesses == targets).sum().item()

    return 100 * hits / seen

In [45]:

if __name__ == "__main__":
    # End-to-end deep-learning experiment: build the dataset, split it, then
    # train and evaluate SimpleCNN and CRNN on the same train/test split.

    feature_type = "mel_spectrogram"
    feature_dir = f"processed_data/{feature_type}"
    metadata = pd.read_csv("dataset/esc50.csv")
    num_classes = len(metadata["category"].unique())
    num_epochs = 100

    # Create dataset and dataloaders
    dataset = AudioDataset(metadata, feature_type, feature_dir, num_classes)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    # Seed the split so the same clips land in train/test on every run;
    # otherwise accuracies from different runs are not comparable.
    split_generator = torch.Generator().manual_seed(12)
    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size], generator=split_generator
    )

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize model, loss, and optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): `model`, `criterion` and `optimizer` are not used by anything
    # in this cell (train_model creates its own loss and optimizer, and the models
    # trained below are `cnn_model` and `crnn_model`). Kept in case another cell
    # reads them - confirm and remove.
    model = CRNN(input_channels=1, num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train SimpleCNN
    print("Training SimpleCNN...")
    cnn_model = SimpleCNN(num_classes=num_classes).to(device)
    train_model(cnn_model, train_loader, test_loader, device, num_epochs)

    # Evaluate SimpleCNN
    cnn_accuracy = evaluate_model(cnn_model, test_loader, device)
    print(f"SimpleCNN Test Accuracy: {cnn_accuracy:.2f}%")

    # Train CRNN
    print("\nTraining CRNN...")
    crnn_model = CRNN(input_channels=1, num_classes=num_classes).to(device)
    train_model(crnn_model, train_loader, test_loader, device, num_epochs)

    # Evaluate CRNN
    crnn_accuracy = evaluate_model(crnn_model, test_loader, device)
    print(f"CRNN Test Accuracy: {crnn_accuracy:.2f}%")

Training SimpleCNN...
Epoch [1/100], Loss: 4.1750
Epoch [2/100], Loss: 3.7834
Epoch [3/100], Loss: 3.6422
Epoch [4/100], Loss: 3.4423
Epoch [5/100], Loss: 3.3508
Epoch [6/100], Loss: 3.1063
Epoch [7/100], Loss: 3.0427
Epoch [8/100], Loss: 2.9605
Epoch [9/100], Loss: 2.8171
Epoch [10/100], Loss: 2.6855
Epoch [11/100], Loss: 2.6092
Epoch [12/100], Loss: 2.5533
Epoch [13/100], Loss: 2.5229
Epoch [14/100], Loss: 2.4346
Epoch [15/100], Loss: 2.2952
Epoch [16/100], Loss: 2.2480
Epoch [17/100], Loss: 2.1982
Epoch [18/100], Loss: 2.1823
Epoch [19/100], Loss: 2.1255
Epoch [20/100], Loss: 2.0263
Epoch [21/100], Loss: 2.0018
Epoch [22/100], Loss: 1.9456
Epoch [23/100], Loss: 1.9355
Epoch [24/100], Loss: 1.9034
Epoch [25/100], Loss: 1.8603
Epoch [26/100], Loss: 1.8246
Epoch [27/100], Loss: 1.8208
Epoch [28/100], Loss: 1.7674
Epoch [29/100], Loss: 1.7669
Epoch [30/100], Loss: 1.6801
Epoch [31/100], Loss: 1.6958
Epoch [32/100], Loss: 1.6524
Epoch [33/100], Loss: 1.6088
Epoch [34/100], Loss: 1.5915
E