# MusicNet Dataset

MusicNet is a collection of 330 freely-licensed classical music recordings, together with over 1 million annotated labels indicating the precise time of each note in every recording, the instrument that plays each note, and the note's position in the metrical structure of the composition.

Kaggle: https://www.kaggle.com/datasets/imsparsh/musicnet-dataset

In this project, we will only be using the solo recordings present in this dataset.

### Import Libraries

In [None]:
import shutil
import os
import pandas as pd

## Extracting Solo Recordings

Solo recordings have only one instrument playing. Only these recordings are used, so that each recording can be labelled unambiguously with a single instrument.

In [None]:
# Read the MusicNet metadata table into a DataFrame
df = pd.read_csv('musicnet_metadata.csv')

# Boolean mask: True where the 'ensemble' text mentions "solo" in any letter
# case; rows with a missing 'ensemble' value are treated as False
is_solo = df['ensemble'].str.contains('solo', case=False, na=False)
solo_performances = df[is_solo]

# Ids of the solo performances, as a plain Python list
solo_ids = solo_performances['id'].tolist()

# Print the first ten ids as a quick sanity check
print(solo_ids[:10])

In [None]:
# Folder holding the MusicNet test recordings, and the folder that receives
# the solo subset
source_dir = 'MusicNet/musicnet/musicnet/test_data'
destination_dir = 'solo_test_data'

# NOTE(review): only the test split is copied here, yet later cells read from
# 'solo_train_data' - presumably this cell was also run with the train paths;
# confirm.

# The target folder must exist before anything is copied into it
os.makedirs(destination_dir, exist_ok=True)

# Copy every solo recording that is present in the source folder
for file_id in solo_ids:
    file_name = f"{file_id}.wav"
    source_path = os.path.join(source_dir, file_name)
    destination_path = os.path.join(destination_dir, file_name)

    # Some ids have no file in this folder: report them and move on
    if not os.path.exists(source_path):
        print(f"File {file_name} not found in source directory.")
        continue

    shutil.copy(source_path, destination_path)
    print(f"Copied {file_name}")

## Sort Extracted Files

Solo piano recordings will be transferred to 'solo_train_data/Piano', <br>
solo violin recordings will be transferred to 'solo_train_data/Violin', etc.

### Function to sort the extracted files by instrument

In [None]:
def organize_files_by_ensemble(df, source_dir, base_destination_dir):
    """Move solo recordings into one sub-folder per instrument.

    For every row of ``df`` whose 'ensemble' value contains "solo"
    (case-insensitive), the file ``<id>.wav`` is moved from ``source_dir`` to
    ``base_destination_dir/<last word of the ensemble value>``. The instrument
    folder is created even when the file itself is not present in
    ``source_dir``. A message is printed for every row (moved, not found, or
    skipped).

    Rows whose 'ensemble' value is missing or not a string are skipped like
    any other non-solo row instead of raising.

    Args:
        df: DataFrame with an 'ensemble' column and an 'id' column.
        source_dir: Folder that currently holds the ``<id>.wav`` files.
        base_destination_dir: Folder under which the instrument folders are
            created; it is created if it does not exist.
    """
    os.makedirs(base_destination_dir, exist_ok=True)

    # Iterate the two columns directly: unlike iterrows(), this never
    # upcasts integer ids to float when the other columns are numeric.
    for ensemble_type, file_id in zip(df['ensemble'], df['id']):
        file_name = f"{file_id}.wav"

        # A missing value (NaN) is a float, so check the type before calling
        # string methods on it.
        if not isinstance(ensemble_type, str) or 'solo' not in ensemble_type.lower():
            print(f"Skipped {file_name} as it does not contain 'solo' in the ensemble type.")
            continue

        # split() without an argument ignores trailing/repeated whitespace,
        # so the last word (the folder name) can never be an empty string.
        folder_name = ensemble_type.split()[-1]

        source_path = os.path.join(source_dir, file_name)
        destination_dir = os.path.join(base_destination_dir, folder_name)
        destination_path = os.path.join(destination_dir, file_name)

        os.makedirs(destination_dir, exist_ok=True)

        if os.path.exists(source_path):
            shutil.move(source_path, destination_path)
            print(f"Moved {file_name} to {destination_dir}/")
        else:
            print(f"File {file_name} not found in source directory.")

In [None]:
# Sort the training recordings in place: files sitting directly in
# 'solo_train_data' are moved into instrument sub-folders of that same folder
source_dir = base_destination_dir = 'solo_train_data'
organize_files_by_ensemble(df, source_dir, base_destination_dir)

In [None]:
# Sort the test recordings in place: files sitting directly in
# 'solo_test_data' are moved into instrument sub-folders of that same folder
source_dir = base_destination_dir = 'solo_test_data'
organize_files_by_ensemble(df, source_dir, base_destination_dir)

## Split Recordings into 1-Second Segments

To allow for easier analysis and real-time predictions, the recordings are split into 1-second segments.

### Import necessary libraries

In [None]:
import torchaudio
from pathlib import Path

### Function to split a .wav audio file

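The function trims the first and last 5 seconds of the recording, splits the remainder into 1-second segments, and saves each segment into the given output folder.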

In [None]:
def split_wav_file(file_path, output_dir, trim_seconds=5, segment_seconds=1):
    """Cut a .wav recording into equal-length segments and save each one.

    The first and last ``trim_seconds`` seconds are discarded, the remainder
    is cut into consecutive ``segment_seconds``-long pieces, and each piece is
    written to ``output_dir`` as ``<original stem>_<index>.wav``. A trailing
    remainder shorter than one segment is dropped. A recording too short to
    survive the trim produces zero segments.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the .wav file.
        output_dir: Folder that receives the segment files; it is not created
            here.
        trim_seconds: Seconds removed from both the start and the end
            (default 5).
        segment_seconds: Length of each saved segment in seconds (default 1).

    Raises:
        ValueError: If ``trim_seconds`` is negative or ``segment_seconds``
            does not amount to at least one sample.
    """
    # Accept plain strings as well as Path objects; .stem/.name are used below
    file_path = Path(file_path)

    waveform, sample_rate = torchaudio.load(str(file_path))

    trim_samples = int(trim_seconds * sample_rate)
    segment_length = int(segment_seconds * sample_rate)
    if trim_samples < 0:
        raise ValueError("trim_seconds must not be negative")
    if segment_length <= 0:
        raise ValueError("segment_seconds must cover at least one sample")

    # Second axis of the loaded waveform is indexed as the sample axis.
    # Clamp the end index so a too-short recording yields an empty slice
    # rather than relying on negative-index slicing.
    total_samples = waveform.shape[1]
    trim_end = max(trim_samples, total_samples - trim_samples)
    trimmed_waveform = waveform[:, trim_samples:trim_end]

    # Only whole segments are kept
    num_segments = trimmed_waveform.shape[1] // segment_length

    for i in range(num_segments):
        start = i * segment_length
        segment = trimmed_waveform[:, start:start + segment_length]
        segment_file_path = f"{output_dir}/{file_path.stem}_{i}.wav"
        torchaudio.save(segment_file_path, segment, sample_rate)

    print(f"Processed and saved {num_segments} segments for {file_path.name}")

### Function to visit a folder and iterate through its .wav files

In [None]:
def process_directory(input_directory, output_directory):
    """Split every .wav file found directly inside a folder.

    Args:
        input_directory: Folder to scan (``str`` or ``Path``); only files
            matching ``*.wav`` at its top level are processed.
        output_directory: Folder handed to ``split_wav_file`` as the target
            (``str`` or ``Path``); created, including parents, if missing.
    """
    source_root, target_root = Path(input_directory), Path(output_directory)

    # The target must exist before any segment is written into it
    target_root.mkdir(parents=True, exist_ok=True)

    wav_paths = source_root.glob('*.wav')
    for wav_path in wav_paths:
        split_wav_file(wav_path, target_root)
        print(f"Processed {wav_path} to {target_root}")

### Split the audio!

Flute will not be used from this point onwards, as the dataset provides no test audio data for flute.

In [None]:
# One (input folder, output folder) pair per instrument, relative to a
# dataset root folder. Flute is deliberately left out.
directory_pairs = [
    (Path(instrument), Path('split') / instrument)
    for instrument in ('Cello', 'Piano', 'Violin')
]

In [None]:
# For each instrument, split the training recordings first and then the
# test recordings
for input_dir, output_dir in directory_pairs:
    for dataset_root in ("solo_train_data", "solo_test_data"):
        process_directory(
            os.path.join(dataset_root, input_dir),
            os.path.join(dataset_root, output_dir),
        )

## Deleting Excess Training Data

To prevent overfitting of the models, excess files are deleted from folders that have too many files. Following the number of files violin has, the excess files are deleted from piano and cello at random.

In [None]:
import random

In [None]:
def delete_random_wav_files(folder_path, num_files_to_delete, seed=None):
    """Delete a random selection of .wav files from a folder.

    If more files are requested than are available, a message is printed and
    nothing is deleted.

    Args:
        folder_path: Folder whose .wav files are candidates for deletion.
        num_files_to_delete: How many files to remove.
        seed: Optional seed. When given, the same seed applied to the same
            folder contents selects the same files, so the run can be
            reproduced. When omitted, the module-level random generator is
            used.

    Raises:
        ValueError: If ``num_files_to_delete`` is negative.
    """
    if num_files_to_delete < 0:
        raise ValueError(
            f"num_files_to_delete must not be negative (got {num_files_to_delete})."
        )

    # Regular files only (a directory named '*.wav' cannot be os.remove()d),
    # sorted so the candidate order does not depend on os.listdir's ordering.
    wav_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.wav') and os.path.isfile(os.path.join(folder_path, name))
    )

    if num_files_to_delete > len(wav_files):
        print(f"Requested number of files to delete ({num_files_to_delete}) exceeds the number of available .wav files ({len(wav_files)}).")
        return

    # A private generator keeps a seeded run from disturbing global random state
    rng = random if seed is None else random.Random(seed)
    files_to_delete = rng.sample(wav_files, num_files_to_delete)

    for name in files_to_delete:
        file_path = os.path.join(folder_path, name)
        os.remove(file_path)
        print(f"Deleted file: {file_path}")

##### check the number of files in each split folder

In [None]:
def count_files_in_directory(directory):
    """Return how many entries directly inside ``directory`` are files.

    Sub-directories are not counted and are not descended into.
    """
    total = 0
    for name in os.listdir(directory):
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate):
            total += 1
    return total

In [None]:
# Folders holding the split training segments of each instrument
cello_split_directory = 'solo_train_data/split/Cello'
piano_split_directory = 'solo_train_data/split/Piano'
violin_split_directory = 'solo_train_data/split/Violin'

# Report the number of files per instrument (labels padded so counts line up)
for instrument_label, split_directory in (
    ("Cello: ", cello_split_directory),
    ("Piano: ", piano_split_directory),
    ("Violin:", violin_split_directory),
):
    print(f"{instrument_label} {count_files_in_directory(split_directory)} files in the directory.")

##### Delete Files

In [None]:
# Piano: trim down to the same number of files as Violin.
# The amount is computed from the current folder contents instead of being
# hard-coded, so re-running this cell deletes nothing once the folders are
# balanced, and the cell still works if the dataset or split settings change.
num_files_to_delete = max(
    0,
    count_files_in_directory(piano_split_directory)
    - count_files_in_directory(violin_split_directory),
)
delete_random_wav_files(piano_split_directory, num_files_to_delete)

In [None]:
# Cello: trim down to the same number of files as Violin.
# The amount is computed from the current folder contents instead of being
# hard-coded, so re-running this cell deletes nothing once the folders are
# balanced, and the cell still works if the dataset or split settings change.
num_files_to_delete = max(
    0,
    count_files_in_directory(cello_split_directory)
    - count_files_in_directory(violin_split_directory),
)
delete_random_wav_files(cello_split_directory, num_files_to_delete)

##### Check the number of files

In [None]:
# Report the number of files per instrument again, after the deletions
for instrument_label, split_directory in (
    ("Cello: ", cello_split_directory),
    ("Piano: ", piano_split_directory),
    ("Violin:", violin_split_directory),
):
    print(f"{instrument_label} {count_files_in_directory(split_directory)} files in the directory.")