In [1]:
import librosa
import os
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 使用 StandardScaler 进行标准化
scaler = StandardScaler()
def preprocess_data(data_path):
    genres = os.listdir(data_path)
    features = []
    labels = []
    for genre_id, genre in enumerate(genres):
        genre_path = os.path.join(data_path, genre)
        for file_name in os.listdir(genre_path):
            file_path = os.path.join(genre_path, file_name)
            if file_path!='Data/genres_original\jazz\jazz.00054.wav':
            # 加载音频文件并提取 MEL 频谱特征
                y, sr = librosa.load(file_path, sr=None)
                mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024)
                mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
                scaled_data = scaler.fit_transform(mel_spectrogram)
                # 将特征添加到特征列表中
                features.append(scaled_data)
                labels.append(genre_id)
    return features, labels

data_path = 'Data/genres_original'
preprocessed_data, labels = preprocess_data(data_path)

In [2]:
len(labels)
preprocessed_data[0].shape

(128, 647)

In [3]:
data=[]
label=[]
for i in range(len(preprocessed_data)):
    if preprocessed_data[i].shape==(128, 647):
        data.append(preprocessed_data[i])
        label.append(labels[i])



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import os
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
class GTZANDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]


dataset = GTZANDataset(data, labels)


train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
batch_size=128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [5]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class GenreLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes,dropout_rate):
        super(GenreLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

       
        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out


num_classes=10
num_layers=1
hidden_size=128
input_size=647
dropout_rate = 0.5
model =GenreLSTM(input_size, hidden_size, num_layers, num_classes,dropout_rate=dropout_rate)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Using device: cuda


In [6]:
import torch


model.to(device)
num_epochs = 100
for epoch in range(num_epochs):
    
    train_loss = 0
    val_loss = 0
   
    model.train()
    for i, (data, labels) in enumerate(train_loader):
        data = data.to(device)

        labels = labels.to(device)

        outputs = model(data)
        labels = labels.long()
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

   
    model.eval()
    with torch.no_grad():
        for data, labels in val_loader:
            data = data.to(device)
            labels = labels.to(device)

            outputs = model(data)
            labels = labels.long()
            loss = criterion(outputs, labels)

            val_loss += loss.item()

        val_loss /= len(val_loader)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

with torch.no_grad():
    correct = 0
    total = 0
    for data, labels in test_loader:
        
        data = data.to(device)
        labels = labels.to(device)
        outputs = model(data)
        labels = labels.long()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print('Test Accuracy: {:.2f}%'.format(accuracy))

Epoch [1/100], Train Loss: 2.3113, Validation Loss: 2.2803
Epoch [2/100], Train Loss: 2.2746, Validation Loss: 2.2573
Epoch [3/100], Train Loss: 2.2744, Validation Loss: 2.2567
Epoch [4/100], Train Loss: 2.2463, Validation Loss: 2.2661
Epoch [5/100], Train Loss: 2.2393, Validation Loss: 2.2442
Epoch [6/100], Train Loss: 2.2617, Validation Loss: 2.2549
Epoch [7/100], Train Loss: 2.2350, Validation Loss: 2.2287
Epoch [8/100], Train Loss: 2.2200, Validation Loss: 2.2306
Epoch [9/100], Train Loss: 2.2071, Validation Loss: 2.2354
Epoch [10/100], Train Loss: 2.1952, Validation Loss: 2.2241
Epoch [11/100], Train Loss: 2.1811, Validation Loss: 2.2384
Epoch [12/100], Train Loss: 2.1771, Validation Loss: 2.2091
Epoch [13/100], Train Loss: 2.1728, Validation Loss: 2.2068
Epoch [14/100], Train Loss: 2.1317, Validation Loss: 2.1880
Epoch [15/100], Train Loss: 2.1493, Validation Loss: 2.1793
Epoch [16/100], Train Loss: 2.1520, Validation Loss: 2.1918
Epoch [17/100], Train Loss: 2.0989, Validation Lo

In [7]:
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(Generator, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out)
        
        return out

In [8]:
class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(Discriminator, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = torch.sigmoid(out)
        return out

In [9]:

generator = Generator(input_size, hidden_size, num_layers, input_size).to(device)
discriminator = Discriminator(input_size, hidden_size, num_layers, 1).to(device)


criterion_gan = nn.BCELoss()
optimizer_G = torch.optim.Adam(generator.parameters(), lr=1e-3)
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=1e-3)

num_epochs = 50
for epoch in range(num_epochs):
    for i, (data, _) in enumerate(train_loader):
        data = data.to(device)
        
       
        optimizer_D.zero_grad()

        real_labels = torch.ones(data.size(0), 1).to(device)
        fake_labels = torch.zeros(data.size(0), 1).to(device)

        real_outputs = discriminator(data)
        real_loss = criterion_gan(real_outputs, real_labels)
        noise = torch.randn(data.size(0),data.size(1), input_size).to(device)
        fake_data = generator(noise)
        fake_outputs = discriminator(fake_data)
        fake_loss = criterion_gan(fake_outputs, fake_labels)
        d_loss = real_loss + fake_loss
        d_loss.backward()
        optimizer_D.step()

      
        optimizer_G.zero_grad()

        noise = torch.randn(data.size(0), data.size(1), input_size).to(device)
        fake_data = generator(noise)
        fake_outputs = discriminator(fake_data)

        g_loss = criterion_gan(fake_outputs, real_labels)
        g_loss.backward()
        optimizer_G.step()

print(f'Epoch [{epoch+1}/{num_epochs}], Discriminator Loss: {d_loss.item()}, Generator Loss: {g_loss.item()}')
noise = torch.randn(len(train_dataset), data.size(1), input_size).to(device)
generated_data = generator(noise)


Epoch [50/50], Discriminator Loss: 0.010071410797536373, Generator Loss: 4.750080108642578


In [16]:
import os
import librosa
import soundfile as sf
num_classes = 10
num_audios_per_class = 66
fake_data_np = generated_data.detach().cpu().numpy()
fake_data_np = fake_data_np.reshape(660,-1)
output_dir = "generated_audios"
os.makedirs(output_dir, exist_ok=True)

sample_rate = 22050

for class_idx in range(num_classes):
    class_dir = os.path.join(output_dir, f"class_{class_idx}")
    os.makedirs(class_dir, exist_ok=True)

    for audio_idx in range(num_audios_per_class):
        fake_data_np1 = fake_data_np[class_idx*audio_idx,:]
        fake_data_np1 = fake_data_np1 * 32768
        fake_data_np1 = fake_data_np1.astype(np.int16)

        output_file = os.path.join(class_dir, f"audio_{audio_idx}.wav")
        sf.write(output_file, fake_data_np1, sample_rate)

array([[ 0.32278854,  0.3272677 , -0.09516631, ...,  1.4742877 ,
        -0.5396615 , -0.39885685],
       [ 0.6430355 ,  0.19385163, -0.19499926, ...,  1.5642091 ,
        -0.26930186, -0.3093978 ],
       [ 0.60352606,  0.31450683, -0.0623032 , ...,  1.8034525 ,
        -0.70429707, -0.32490927],
       ...,
       [ 0.33393404,  0.00804125, -0.22638065, ...,  1.8626825 ,
        -0.44945222, -0.35803303],
       [ 0.95635974,  0.47882026, -0.09175487, ...,  1.6892006 ,
        -0.3957002 , -0.46639317],
       [ 0.4624194 ,  0.55562466,  0.22877781, ...,  1.6736945 ,
        -0.47752613, -0.33423373]], dtype=float32)