# Convert the original audio to one containing only drums

## Method 1: training a model by ourselves

In [23]:
# Import necessary libraries
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch.nn as nn
import scipy.io.wavfile as wavwrite
import numpy as np

In [24]:
# Define a neural network model for drum separation
class DrumSeparationModel(nn.Module):
    def __init__(self):
        super(DrumSeparationModel, self).__init__()
        # First convolutional layer
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        # Second convolutional layer
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=2, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # Forward pass through the network
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        return x

In [25]:
# Define a custom dataset for drum separation
class DrumDataset(Dataset):
    def __init__(self, original_path, separated_path, file_list, max_length):
        self.original_path = original_path
        self.separated_path = separated_path
        self.file_list = file_list
        self.max_length = max_length

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Load original and separated waveforms from audio files
        original_file = os.path.join(self.original_path, self.file_list[idx])
        separated_file = os.path.join(self.separated_path, self.file_list[idx])
        original_waveform, _ = torchaudio.load(original_file)
        separated_waveform, _ = torchaudio.load(separated_file)
        
        # Trim or pad waveforms to the specified maximum length
        if original_waveform.size(1) > self.max_length:
            original_waveform = original_waveform[:, :self.max_length]
            separated_waveform = separated_waveform[:, :self.max_length]
        else:
            pad_length = self.max_length - original_waveform.size(1)
            original_waveform = torch.nn.functional.pad(original_waveform, (0, pad_length))
            separated_waveform = torch.nn.functional.pad(separated_waveform, (0, pad_length))

        # Ensure there is only one channel
        original_waveform = original_waveform[0:1, :]
        return original_waveform, separated_waveform

In [26]:
# Calculate Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, targets):
    rmse = torch.sqrt(torch.mean((predictions - targets)**2))
    return rmse.item()

In [27]:
# Set paths for training and validation datasets
train_original_path = "C:\\Users\\chaoy\\Downloads\\DSD100\\DSD100\\Mixtures\\Dev"
train_separated_path = "C:\\Users\\chaoy\\Downloads\\DSD100\\DSD100\\Sources\\Dev"
val_original_path = "C:\\Users\\chaoy\\Downloads\\DSD100\\DSD100\\Mixtures\\Test"
val_separated_path = "C:\\Users\\chaoy\\Downloads\\DSD100\\DSD100\\Sources\\Test"

# List all audio file names
train_files = list(Path(train_original_path).rglob('**/mixture.wav'))
val_files = list(Path(val_original_path).rglob('**/mixture.wav'))

# Assume file names have a one-to-one correspondence
assert len(train_files) == len(val_files)

# Split file names into training and validation sets
train_files, val_files = train_files, val_files

In [28]:
# Create instances of training and validation datasets
max_length = 1133393   # Set a maximum length
train_dataset = DrumDataset(train_original_path, train_separated_path, train_files, max_length)
val_dataset = DrumDataset(val_original_path, val_separated_path, val_files, max_length)

# Use DataLoader to load datasets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Initialize the drum separation model
model = DrumSeparationModel()

# Initialize loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
# Train the model
num_epochs = 3
for epoch in range(num_epochs):
    for original_waveform, separated_waveform in train_loader:
        # Forward pass
        predictions = model(original_waveform)
        # Compute loss
        loss = criterion(predictions, separated_waveform)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [33]:
# Evaluate the model on the validation set
model.eval()
with torch.no_grad():
    total_rmse = 0.0
    total_samples = 0
    for original_waveform, separated_waveform in val_loader:
        # Forward pass
        predictions = model(original_waveform)
         # Calculate RMSE
        rmse = calculate_rmse(predictions, separated_waveform)
        total_rmse += rmse
        total_samples += original_waveform.size(0)
    accuracy = total_rmse / total_samples
    print(f'Validation RMSE: {accuracy}')

Validation RMSE: 0.005262430310249328


In [40]:
# Load a new audio file for drum separation
new_audio_file = "drums3.wav"
new_waveform, _ = torchaudio.load(new_audio_file)

# Trim or pad the waveform to the specified maximum length
if new_waveform.size(1) > max_length:
    new_waveform = new_waveform[:, :max_length]
else:
    pad_length = max_length - new_waveform.size(1)
    new_waveform = torch.nn.functional.pad(new_waveform, (0, pad_length))

# Ensure there is only one channel
new_waveform = new_waveform[0:1, :]
new_waveform = new_waveform.unsqueeze(0) 

# Make predictions using the trained model
model.eval()
with torch.no_grad():
    predicted_waveform = model(new_waveform)
    predicted_waveform = predicted_waveform.squeeze(0)

# Scale the floating-point tensor to the range of 16-bit integers
predicted_waveform_int = (predicted_waveform * 32767).to(torch.int16)

# Save the predicted audio as a WAV file
predicted_audio_file_wav = "drums4.wav"
wavwrite.write(predicted_audio_file_wav, 44100, predicted_waveform_int.numpy().T)

## Method 2: separate audio tracks with Demucs

In [38]:
# Import necessary modules
from demucs import pretrained
from demucs.apply import apply_model
import torchaudio
import os

def separate_sources(input_file):
    # Load the pretrained Demucs model
    model = pretrained.get_model('mdx')

    # Load the audio file
    waveform, sample_rate = torchaudio.load(input_file)

    # Separate sources in the audio
    sources = apply_model(model, waveform.unsqueeze(0))  # Add a dimension before mix

    # Save only the first channel of the first source
    output_file = "drums2.wav"
    torchaudio.save(output_file, sources[0][0], sample_rate)

# Set input file and output file prefix
input_file = "input_music.wav"  

# Execute source separation
separate_sources(input_file)

# Converting the .wav file to .txt file

## Part 1: an original audio file to audio information

In [35]:
import librosa
import librosa.display
import numpy as np

# Load the wav file
wav_file = 'input_music.wav'
y, sr = librosa.load(wav_file)

# Set the desired hop length and bins per octave
hop_length = 512
bins_per_octave = 12

# Compute the constant-Q chromagram
CQT = librosa.amplitude_to_db(np.abs(librosa.cqt(y, sr=sr, hop_length=hop_length, bins_per_octave=bins_per_octave)), ref=np.max)

# Get the time and frequency bins
times = librosa.times_like(CQT)
frequencies = librosa.cqt_frequencies(n_bins=CQT.shape[0], fmin=librosa.note_to_hz('C1'), bins_per_octave=bins_per_octave)

# Convert the amplitude spectrogram to strength
strength = librosa.power_to_db(np.abs(librosa.stft(y, hop_length=hop_length)), ref=np.max)

# Calculate the average strength and frequency for each time frame
average_strength = np.mean(strength, axis=0)
average_frequency = np.sum(np.exp(CQT) * frequencies[:, None], axis=0) / np.sum(np.exp(CQT), axis=0)

# Calculate max and min values for average strength and frequency
max_avg_strength = np.max(average_strength)
min_avg_strength = np.min(average_strength)
max_avg_frequency = np.max(average_frequency)
min_avg_frequency = np.min(average_frequency)

# Write max and min values to a text file
output_file = 'audio.txt'
with open(output_file, 'w') as file:
    file.write(f"{max_avg_frequency:.2f}\n")
    file.write(f"{min_avg_frequency:.2f}\n")
    file.write(f"{max_avg_strength:.2f}\n")
    file.write(f"{min_avg_strength:.2f}\n")
    for i, time in enumerate(times):
        file.write(f"{time:.1f} {average_frequency[i]:.2f} {average_strength[i]:.2f}\n")

print(f"Average audio information with max and min values written to {output_file}")

Average audio information with max and min values written to audio.txt


## Part 2: a drums predicted file to drums information

In [36]:
import librosa

# Read the output drum source file
drum_file = "drums.wav" 

# Load the audio file
drum_data, drum_sr = librosa.load(drum_file)

# Extract drum onsets
onset_frames = librosa.onset.onset_detect(y=drum_data, sr=drum_sr)
onset_times_ms = librosa.frames_to_time(onset_frames, sr=drum_sr, hop_length=512) * 1000  # Convert time to milliseconds

# Write drum onset times to a text file (rounded to the nearest 100 milliseconds)
output_txt_file = "drums.txt"  

with open(output_txt_file, 'w') as f:
    for onset_time_ms in onset_times_ms:
        rounded_onset_time_ms = round(onset_time_ms / 100) * 100  # Round to the nearest 100 milliseconds
        f.write(f"{rounded_onset_time_ms}\n")

print(f"Rounded drum onset times (in milliseconds, rounded to the nearest 100ms) written to {output_txt_file}")

Rounded drum onset times (in milliseconds, rounded to the nearest 100ms) written to drums.txt
