# LTU Model
https://github.com/YuanGongND/ltu?tab=readme-ov-file#for-ltu-as-openasqa


For this project, I will make a simpler model.

1. Audio Input:
    Load an audio file and preprocess it to extract Mel spectrograms.
2. CRNN Processing:
    Predict the class probabilities using the CRNN model.
3. Natural Language Conversion:
    Translate probabilities into human-readable text.
4. LLM Interaction:
    Use the text as input for the LLM.
    Query the LLM for reasoning, explanation, or decision-making tasks.
5. Output:
    Provide explanations, classifications, or environmental scenarios.

B. Using Class Probabilities
Instead of a single class label, the LLM can use the full probability distribution to reason about uncertainty. For instance:

CRNN Output: [0.1, 0.3, ..., 0.05]
Text Input to LLM: "The audio likely contains the sound of a dog barking (30%) but may also include a bird chirping (10%)."


--> Simpler and can simply used the outcome from the outcome 
--> Maybe it can induce the situation?


In [1]:
import os
from pathlib import Path
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader



# file_path
audio_path = Path("dataset") / "audio"
csv_path = Path("dataset") / "esc50.csv"

metadata = pd.read_csv(csv_path)

In [2]:
# Load metadata
metadata = pd.read_csv("dataset/esc50.csv")

# Function to load features and flatten them
def load_features(feature_type="mfcc"):
    feature_dir = Path(f"processed_data/{feature_type}")
    X, y = [], []
    for _, row in metadata.iterrows():
        class_label = row["category"]
        file_name = row["filename"].split(".")[0]
        
        # Load .npy file
        feature_path = feature_dir / f"{file_name}_{feature_type}.npy"
        features = np.load(feature_path)
        
        # Flatten the features to 1D for simple models
        X.append(features.flatten())
        y.append(class_label)
        
    return np.array(X), np.array(y)

# Load and split data for MFCC
X_mfcc, y_mfcc = load_features("mfcc")
X_train_mfcc, X_test_mfcc, y_train_mfcc, y_test_mfcc = train_test_split(X_mfcc, y_mfcc, test_size=0.2, random_state=12)

# Load and split data for Mel Spectrogram
X_mel, y_mel = load_features("mel_spectrogram")
X_train_mel, X_test_mel, y_train_mel, y_test_mel = train_test_split(X_mel, y_mel, test_size=0.2, random_state=12)

In [30]:
class AudioDataset(Dataset):
    def __init__(self, metadata, feature_type, feature_dir, num_classes):
        self.metadata = metadata
        self.feature_type = feature_type
        self.feature_dir = feature_dir
        self.num_classes = num_classes
        self.label_map = {label: idx for idx, label in enumerate(metadata['category'].unique())}

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        file_name = row["filename"].split(".")[0]
        feature_path = f"{self.feature_dir}/{file_name}_{self.feature_type}.npy"
        features = np.load(feature_path)

        # Normalize features and add channel dimension
        features = (features - np.mean(features)) / np.std(features)
        features = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

        label = self.label_map[row["category"]]
        label = torch.tensor(label, dtype=torch.long)
        return features, label


In [None]:
class CRNN(nn.Module):
    def __init__(self, input_channels, num_classes):
        super(CRNN, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(0.5)

        # RNN
        self.rnn_hidden_size = 128
        self.rnn = nn.LSTM(128, self.rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(self.rnn_hidden_size * 2, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(self.bn1(torch.relu(self.conv1(x))))
        x = self.pool(self.bn2(torch.relu(self.conv2(x))))
        x = self.dropout(x)

        # Prepare for RNN
        x = x.permute(0, 2, 1, 3).contiguous()  # Rearrange to (batch, seq_len, features)
        b, seq_len, _, features = x.shape
        x = x.view(b, seq_len, -1)

        # RNN
        x, _ = self.rnn(x)
        x = torch.relu(self.fc1(x[:, -1, :]))
        x = self.fc2(x)
        return x


In [21]:
def train_model(model, train_loader, test_loader, device, num_epochs, criterion, optimizer, scheduler=None):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if scheduler:
                scheduler.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_accuracy = (correct / total) * 100
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%")


In [6]:
def evaluate_model(model, data_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for features, labels in data_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [33]:

feature_type = "mel_spectrogram"
feature_dir = f"processed_data/{feature_type}"
metadata = pd.read_csv("dataset/esc50.csv")
num_classes = len(metadata["category"].unique())
num_epochs = 300
# Create dataset and dataloaders
dataset = AudioDataset(metadata, feature_type, feature_dir, num_classes)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CRNN(input_channels=1, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)  # Increase weight decay


scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=1e-3, steps_per_epoch=len(train_loader), epochs=num_epochs
)


for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        
        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, 1)  # Get class predictions
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_accuracy = (correct / total) * 100
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%")



# Evaluate CRNN
print("\nEvaluating CRNN...")
crnn_accuracy = evaluate_model(model, test_loader, device)
print(f"CRNN Test Accuracy: {crnn_accuracy:.2f}%")


Epoch [1/300], Loss: 3.9118, Accuracy: 1.81%
Epoch [2/300], Loss: 3.9064, Accuracy: 3.25%
Epoch [3/300], Loss: 3.9014, Accuracy: 2.75%
Epoch [4/300], Loss: 3.8955, Accuracy: 4.31%
Epoch [5/300], Loss: 3.8874, Accuracy: 4.62%
Epoch [6/300], Loss: 3.8793, Accuracy: 4.62%
Epoch [7/300], Loss: 3.8665, Accuracy: 5.19%
Epoch [8/300], Loss: 3.8484, Accuracy: 6.56%
Epoch [9/300], Loss: 3.8295, Accuracy: 6.50%
Epoch [10/300], Loss: 3.8085, Accuracy: 6.25%
Epoch [11/300], Loss: 3.7905, Accuracy: 7.00%
Epoch [12/300], Loss: 3.7735, Accuracy: 7.62%
Epoch [13/300], Loss: 3.7599, Accuracy: 8.25%
Epoch [14/300], Loss: 3.7442, Accuracy: 8.25%
Epoch [15/300], Loss: 3.7333, Accuracy: 8.88%
Epoch [16/300], Loss: 3.7177, Accuracy: 10.12%
Epoch [17/300], Loss: 3.7049, Accuracy: 10.12%
Epoch [18/300], Loss: 3.6903, Accuracy: 10.44%
Epoch [19/300], Loss: 3.6721, Accuracy: 10.38%
Epoch [20/300], Loss: 3.6567, Accuracy: 11.12%
Epoch [21/300], Loss: 3.6379, Accuracy: 10.50%
Epoch [22/300], Loss: 3.6180, Accurac