In [8]:
import torchaudio
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

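# Data preparation
# Assume you already have a speech dataset in which every file is tagged with a class label. We can use torchaudio to load and process the audio data.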
def load_audio_files(data_path):
    """Recursively load every WAV file found under ``data_path``.

    The name of the directory that directly contains a file is used as that
    file's class label.

    Args:
        data_path: Root directory to search.

    Returns:
        A tuple ``(audio_files, labels)`` of two parallel lists: the waveforms
        returned by ``torchaudio.load`` and the matching label strings. Both
        lists are empty when no WAV file is found.

    Note:
        The sample rate reported by ``torchaudio.load`` is discarded, so the
        caller has to know the sample rate of the dataset by other means.
    """
    audio_files = []
    labels = []

    for root, dirs, files in os.walk(data_path):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # Sorting ``dirs`` in place fixes the traversal order, so repeated
        # runs produce an identically ordered dataset.
        dirs.sort()

        # normpath strips a trailing separator; without it, files sitting
        # directly in "data/" would get the empty string as their label.
        label = os.path.basename(os.path.normpath(root))

        for file in sorted(files):
            # Case-insensitive match so "clip.WAV" is not silently skipped.
            if not file.lower().endswith(".wav"):
                continue
            full_path = os.path.join(root, file)
            waveform, _sample_rate = torchaudio.load(full_path)
            audio_files.append(waveform)
            labels.append(label)

    return audio_files, labels

In [11]:
# Feature extraction
# Common feature-extraction methods include Mel-frequency cepstral coefficients (MFCC) and the Mel spectrogram, among others.
def extract_features(audio_files, sample_rate, n_mfcc=13, n_fft=400,
                     hop_seconds=0.010, n_mels=23):
    """Compute MFCC features for each waveform.

    A single ``torchaudio.transforms.MFCC`` transform is built once and then
    applied to every waveform in turn.

    Args:
        audio_files: Iterable of waveforms accepted by the MFCC transform.
        sample_rate: Sample rate passed to the transform; it is also used to
            convert ``hop_seconds`` into a hop length in samples.
        n_mfcc: Number of cepstral coefficients to keep. Defaults to 13.
        n_fft: FFT size forwarded to the underlying Mel spectrogram.
            Defaults to 400.
        hop_seconds: Hop between frames in seconds; converted to samples with
            ``int(hop_seconds * sample_rate)``. Defaults to 0.010.
        n_mels: Number of Mel filter banks. Defaults to 23.

    Returns:
        A list with one transform output per input waveform, in input order.
    """
    # The Mel spectrogram works in samples, so convert the hop from seconds.
    hop_length = int(hop_seconds * sample_rate)

    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=n_mfcc,
        melkwargs={"n_fft": n_fft, "hop_length": hop_length, "n_mels": n_mels},
    )

    return [mfcc_transform(waveform) for waveform in audio_files]

In [12]:
class SpeechClassifier(nn.Module):
    """Three-layer fully connected classifier.

    The network maps ``input_size`` features through hidden layers of 256 and
    128 units (each followed by ReLU) to ``num_classes`` outputs.
    """

    def __init__(self, input_size, num_classes):
        """Create the three linear layers.

        Args:
            input_size: Number of features in the last dimension of the input.
            num_classes: Number of output scores produced per example.
        """
        super().__init__()
        wide_units, narrow_units = 256, 128
        self.fc1 = nn.Linear(input_size, wide_units)
        self.fc2 = nn.Linear(wide_units, narrow_units)
        self.fc3 = nn.Linear(narrow_units, num_classes)

    def forward(self, x):
        """Return the output of the final linear layer for ``x``.

        No activation is applied after the last layer, so the result is the
        raw score per class.
        """
        hidden = x
        # Both hidden layers share the same linear -> ReLU pattern.
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

NameError: name 'num_epochs' is not defined

In [4]:
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After every epoch the mean of the per-batch losses is printed.

    Args:
        model: Module to train; it is switched to training mode each epoch.
        train_loader: Re-iterable yielding ``(inputs, labels)`` batches. It
            does not need to implement ``__len__``.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of full passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch, since
            no average loss can be computed.
    """
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader):
        # that also works for loaders without __len__ and makes the
        # empty-loader case explicit rather than a ZeroDivisionError.
        num_batches = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError(
                "train_loader yielded no batches; cannot compute an average loss"
            )

        print(f"Epoch {epoch+1}, Loss: {running_loss/num_batches}")


In [7]:
# Dummy data for demonstration purposes.
# Seed the global RNG first so that the random inputs, labels and weight
# initialisation are the same after every kernel restart.
SEED = 42
NUM_SAMPLES = 10
INPUT_SIZE = 13  # presumably matches the 13 MFCC coefficients - TODO confirm
NUM_CLASSES = 2

torch.manual_seed(SEED)

inputs = torch.randn(NUM_SAMPLES, INPUT_SIZE)  # Example input features
labels = torch.randint(0, NUM_CLASSES, (NUM_SAMPLES,))  # Example target labels
# TensorDataset pairs each feature row with its label.
train_data = torch.utils.data.TensorDataset(inputs, labels)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=2)

model = SpeechClassifier(input_size=INPUT_SIZE, num_classes=NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer, num_epochs=20)

Epoch 1, Loss: 0.6476296305656433
Epoch 2, Loss: 0.49950666427612306
Epoch 3, Loss: 0.4115217447280884
Epoch 4, Loss: 0.341979393362999
Epoch 5, Loss: 0.27994734048843384
Epoch 6, Loss: 0.2284907042980194
Epoch 7, Loss: 0.18540208116173745
Epoch 8, Loss: 0.1497023344039917
Epoch 9, Loss: 0.11916546523571014
Epoch 10, Loss: 0.09449867848306895
Epoch 11, Loss: 0.07461356818675995
Epoch 12, Loss: 0.058421484939754006
Epoch 13, Loss: 0.045755724795162676
Epoch 14, Loss: 0.03590860068798065
Epoch 15, Loss: 0.028399443626403807
Epoch 16, Loss: 0.022725792694836856
Epoch 17, Loss: 0.018385041039437056
Epoch 18, Loss: 0.015086334571242333
Epoch 19, Loss: 0.012587815104052425
Epoch 20, Loss: 0.010627786535769701
