In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
import os
import sys
import numpy
import librosa
import pandas as pd
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
import torch

In [14]:
audio_path = 'train/'
students_num = ["029", "033", "039", "045", "049", "068", "914", "918", "919", "934", "970"]
orders_num = ["Takeoff", "Landing", "Advance", "Retreat", "Rise"]
repeat_num = ['1', '2', '3', '4']
number_of_mfcc_features = 13
orders_dic = {
    'Takeoff': 0,
    'Landing': 1,
    'Advance': 2,
    'Retreat': 3,
    'Rise': 4
}

In [15]:
def mfcc_extraction(path, students_num, orders_num, repeat_num):
    labels = []
    mfcc_features = []

    for stu_num in students_num:
        for index, order in enumerate(orders_num):
            for repeat in repeat_num:
                file_path = path + stu_num + '_' + str(index+1) + '_' + repeat + ".wav"
                if os.path.exists(file_path):
                    x, sr = librosa.load(file_path)
                    x = librosa.effects.preemphasis(x)  # 预加重处理
                    mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=number_of_mfcc_features).T


                    mfcc_features.append(mfccs)
                    labels.append(order)
                else:
                    pass
    return mfcc_features, labels

In [16]:
mfcc_features_and_labels = mfcc_extraction(audio_path, students_num, orders_num, repeat_num)
mfcc_features, mfcc_labels = mfcc_features_and_labels

In [17]:
mfcc_labels = [orders_dic[label] for label in mfcc_labels]

In [18]:
class MFCC_DataSet(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        return feature, label

In [19]:
dataset = MFCC_DataSet(mfcc_features, mfcc_labels)
dataloader = DataLoader(dataset=dataset,shuffle=False)

In [20]:
for f , l in dataloader:
    print(f.shape)
    print(l.shape)

torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])
torch.Size([1, 87, 13])
torch.Size([1])


In [21]:


class MFCC_CNN(nn.Module):
    def __init__(self, num_classes, num_coefficients, num_frames):
        super(MFCC_CNN, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3))
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))
        
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3))
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
        
        # 计算 fc1 的输入维度
        conv_out_channels = 64
        conv_out_num_frames = (num_frames - 4) // 4 - 2
        conv_out_num_coefficients = (num_coefficients - 4) // 4 - 2
        
        # 确保 conv_out_num_frames 和 conv_out_num_coefficients 不小于 0
        conv_out_num_frames = max(0, conv_out_num_frames)
        conv_out_num_coefficients = max(0, conv_out_num_coefficients)
        
        self.fc1 = nn.Linear(conv_out_channels * conv_out_num_frames * conv_out_num_coefficients, 128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = x.view(x.size(0), -1)  # 展平
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        x = self.softmax(x)
        return x

    

In [22]:
model = MFCC_CNN(num_classes=5, num_coefficients=13, num_frames=87)

# 定义优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.CrossEntropyLoss()



In [23]:
epochs = 100
for epoch in range(epochs):
    for (labels, features) in enumerate(dataloader):
        optimizer.zero_grad()
        #print(features[0].shape)
        # 前向传播
        features = features[0].unsqueeze(1)
        outputs = model(features)
        
        # 计算损失
        labels = labels.squeeze()  # 去除不必要的维度
        loss = criterion(outputs, labels)
        
        # 反向传播和优化
        loss.backward()
        optimizer.step()
        
        print('Epoch: {} \tLoss: {:.6f}'.format(epoch, loss.item()))
        print(outputs)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1280 and 0x128)