In [4]:
# Importing all necessary packages
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
from glob import glob
import cv2
import shutil

In [3]:
def add_files(source_data_file, destination_data_file, source_dir, destination_dir, required_file_number, choice_rate = 1):
    """Move audio files listed in a metadata CSV and record them in a new CSV.

    Each row of ``source_data_file`` is selected with probability
    ``choice_rate``. For a selected row, the file named by ``row['FileName']``
    is moved from ``source_dir`` to ``destination_dir`` when it exists, and the
    row is appended to ``destination_data_file``.

    Parameters
    ----------
    source_data_file : str
        Path of the metadata CSV to read. It needs a ``FileName`` column, and a
        ``Class`` column for the missing-file report.
    destination_data_file : str
        Path of the CSV that receives the rows of the moved files. It is
        created with the columns ``FileName`` and ``Class`` if it is absent.
    source_dir : str
        Directory the files are moved out of.
    destination_dir : str
        Directory the files are moved into; created when it does not exist.
    required_file_number : int
        Guard only: when ``destination_dir`` already exists and holds at least
        this many entries, the function returns without touching anything.
        It does NOT cap the number of files moved by a single call.
    choice_rate : float, default 1
        Probability of selecting each row (1 selects every row).

    Returns
    -------
    None

    Notes
    -----
    Files are moved, not copied. The destination CSV is therefore written in a
    ``finally`` block: if a move fails half-way, the rows of the files that
    were already moved are still recorded before the error propagates.
    """
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)
    elif len(os.listdir(destination_dir)) >= required_file_number:
        return

    reject_rate = 1 - choice_rate

    superset = pd.read_csv(source_data_file)
    rows_to_append = []

    try:
        for _, row in superset.iterrows():
            # Scalar draw in [0, 1); the row is skipped with probability reject_rate.
            if np.random.rand() < reject_rate:
                continue

            source_file = os.path.join(source_dir, row['FileName'])
            destination_file = os.path.join(destination_dir, row['FileName'])

            if os.path.exists(source_file):
                # Move first so that only files that really moved get a row.
                shutil.move(source_file, destination_file)
                rows_to_append.append(row)
            elif row['Class'] == 'Sound_Violin':
                # Only missing files labelled 'Sound_Violin' are reported;
                # other missing files are skipped silently.
                print('Could not find:', source_file)
    finally:
        # Persist whatever was moved, even when the loop above raised.
        if os.path.exists(destination_data_file):
            subset = pd.read_csv(destination_data_file)
        else:
            subset = pd.DataFrame(columns = ['FileName', 'Class'])
        if rows_to_append:
            subset = pd.concat([subset, pd.DataFrame(rows_to_append)], ignore_index = True)
        subset.to_csv(destination_data_file, index = False)

In [9]:
# Build the training set: move the audio listed in the training metadata into
# its working directory and write the matching label file.
tr_source_data_file = 'Metadata_Train.csv'
tr_source_dir = 'Train_audio_orig'
tr_destination_data_file = 'Train_Data.csv'
tr_destination_dir = 'Train_audios'

# add_files is a no-op once the destination directory holds this many entries.
required_file_number = 3000

add_files(
    source_data_file=tr_source_data_file,
    destination_data_file=tr_destination_data_file,
    source_dir=tr_source_dir,
    destination_dir=tr_destination_dir,
    required_file_number=required_file_number,
    choice_rate=1,
)

In [11]:
# Build the test set: move the audio listed in the test metadata into its
# working directory and write the matching label file.
te_source_data_file = 'Metadata_Test.csv'
te_source_dir = 'Test_audio_orig'
te_destination_data_file = 'Test_Data.csv'
te_destination_dir = 'Test_audios'

# add_files is a no-op once the destination directory holds this many entries.
required_file_number = 1000

add_files(
    source_data_file=te_source_data_file,
    destination_data_file=te_destination_data_file,
    source_dir=te_source_dir,
    destination_dir=te_destination_dir,
    required_file_number=required_file_number,
    choice_rate=1,
)

In [12]:
# Report how many entries ended up in each destination directory.
train_file_count = len(os.listdir(tr_destination_dir))
print(f"Number of train files: {train_file_count}")
test_file_count = len(os.listdir(te_destination_dir))
print(f"Number of test files: {test_file_count}")

Number of train files: 2628
Number of test files: 80


In [13]:
def class_count(data_path, dir_path):
    """Print per-class row counts of a label CSV and flag rows whose file is missing.

    Parameters
    ----------
    data_path : str
        Path of a CSV with ``FileName`` and ``Class`` columns.
    dir_path : str
        Directory in which every ``FileName`` is expected to exist.

    Returns
    -------
    None
        Everything is reported on standard output: first the name of each
        listed file that is absent from ``dir_path`` (or ``No missing files``
        when there is none), then one ``<class> : <count>`` line per class.

    Notes
    -----
    The four expected labels are always printed, in a fixed order, even when
    their count is zero. Any other label found in the CSV is counted as well
    and printed after them instead of raising ``KeyError``.
    """
    df = pd.read_csv(data_path)

    # Pre-seeded so the expected classes are always shown, in this order.
    counts = {'Sound_Guitar': 0, 'Sound_Piano': 0, 'Sound_Drum': 0, 'Sound_Violin': 0}
    any_missing = False

    for _, row in df.iterrows():
        label = row['Class']
        counts[label] = counts.get(label, 0) + 1

        if not os.path.exists(os.path.join(dir_path, row['FileName'])):
            print(row['FileName'])
            any_missing = True

    if not any_missing:
        print("No missing files")
    for label, total in counts.items():
        print(label, ':', total)

In [14]:
# Class balance and missing-file check for the training split.
print('Train:')
class_count(data_path=tr_destination_data_file, dir_path=tr_destination_dir)

Train:
No missing files
Sound_Guitar : 700
Sound_Piano : 528
Sound_Drum : 700
Sound_Violin : 700


In [15]:
# Class balance and missing-file check for the test split.
print('Test:')
class_count(data_path=te_destination_data_file, dir_path=te_destination_dir)

Test:
No missing files
Sound_Guitar : 20
Sound_Piano : 20
Sound_Drum : 20
Sound_Violin : 20


Upon inspection, the original CSV file 'Metadata_Train.csv', which contains the file labels for the training data, turns out to be faulty. It does not contain the names of the violin audio files; instead, the rows labelled 'Sound_Violin' repeat the names of the drum audio files. To fix this, we can move the audio files that remain in the original audio directory into the new directory while simultaneously adding rows of the form ['Audio_File_Name', 'Sound_Violin'] to the new CSV file. This should fix the issue.

In [3]:
# Every file still left in the original training directory is treated as a
# violin recording: move it next to the other training audio and add a
# matching row to the training label file.
rows_to_append = []

# os.listdir returns entries in arbitrary order; sorting keeps the appended
# rows in a reproducible order.
for file in sorted(os.listdir('Train_audio_orig')):
    file_path = os.path.join('Train_audio_orig', file)
    dest_path = os.path.join('Train_audios', file)

    # Move first so a row is only recorded for a file that actually moved.
    shutil.move(file_path, dest_path)
    rows_to_append.append([file, 'Sound_Violin'])

df = pd.read_csv('Train_Data.csv')
if rows_to_append:
    # The columns must be named explicitly. A frame built from bare lists gets
    # the columns 0 and 1, and concatenating it would add two new columns while
    # leaving 'FileName' and 'Class' empty (NaN) for every appended row.
    violin_rows = pd.DataFrame(rows_to_append, columns=['FileName', 'Class'])
    df = pd.concat([df, violin_rows], ignore_index = True)
df.to_csv('Train_Data.csv', index=False)