In [60]:
import os 
import pandas as pd
from pydub import AudioSegment
import re

In [61]:
guitar_path = './IDMT-SMT-CHORDS/guitar'
non_guitar_path = './IDMT-SMT-CHORDS/non_guitar'
raw_data_path = './IDMT-SMT-CHORDS/raw_data'    

In [62]:
guitar_annotations = pd.read_csv(os.path.join(guitar_path,'guitar_annotation.lab'), sep = '\t', header = 0, names =['start_time', 'end_time', 'chord'])
non_guitar_annotations = pd.read_csv(os.path.join(non_guitar_path,'non_guitar_annotation.lab'), sep = '\t', header = 0, names =['start_time', 'end_time', 'chord'])

In [64]:
def obtain_type_of_sound(file_name):
    pattern = r'_([A-Z]\w*)\.wav$' # r'_(\w+)\.wav$'
    match = re.search(pattern, file_name)   
    if match:
        return match.group(1)
    
def trim_audio(file_name, index, label_dict, label_dataframe, guitar_flag):

    audio = AudioSegment.from_wav(file_name)
    duration = len(audio) 

    start_time = 0 
    end_time = 2000
     
    dataframe_index = 0 
    while start_time < duration:    
        # Save the audio fragment
        index_str = f"{str(index).zfill(4)}.wav"
        trimmed_audio = audio[start_time:end_time]
        trimmed_audio.export('trimmed_audio/' + index_str, format="wav")
        start_time = end_time
        end_time = end_time + 2000
        
        # Get label to the dictionary and update it 
        dataframe = guitar_annotations if guitar_flag else non_guitar_annotations   
        label_dict[index_str] = dataframe.iloc[dataframe_index]['chord']  

        type_of_sound = obtain_type_of_sound(file_name) 
        instrument = 'guitar' if guitar_flag else 'non_guitar'  
        label_dataframe.loc[index] = [index_str, label_dict[index_str], instrument, type_of_sound]  
        # Update indexes    
        dataframe_index += 1     
        index += 1

    return index, label_dict, label_dataframe

In [66]:
label_dict = {} 
label_dataframe = pd.DataFrame(columns=['file_name', 'label', 'instrument', 'type_of_sound'])        
directory_path = os.makedirs('trimmed_audio', exist_ok=True)
for path, _, filenames in os.walk(raw_data_path):
        index = 0
        for filename in filenames:
                if filename.endswith('.wav'):
                        guitar_flag = 'guitar' in filename 
                        print(os.path.join(path, filename))
                        index, label_dict, label_dataframe = trim_audio(os.path.join(path, filename), index, label_dict, label_dataframe, guitar_flag)
                        print(path, filename)

dictionary = pd.DataFrame.from_dict(label_dict, orient='index')  
dictionary.to_csv('label_dictionary.csv', header=False)              
label_dataframe.to_csv('label_dataframe.csv', header=True)

./IDMT-SMT-CHORDS/raw_data/garageband_piano_Jazz_Organ.wav
./IDMT-SMT-CHORDS/raw_data garageband_piano_Jazz_Organ.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_piano_Sweetness_Pad.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_piano_Sweetness_Pad.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_guitar_Nylon_Concerto_Guitar.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_guitar_Nylon_Concerto_Guitar.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_guitar_Guitar_Open.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_guitar_Guitar_Open.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_guitar_Campfire.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_guitar_Campfire.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_piano_Celestial_Pad.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_piano_Celestial_Pad.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_piano_Sadness_Pad.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_piano_Sadness_Pad.wav
./IDMT-SMT-CHORDS/raw_data/ableton_live_piano_Grand_Piano.wav
./IDMT-SMT-CHORDS/raw_data ableton_live_piano_Gr

In [None]:
guitar_annotations.head()

In [None]:
non_guitar_annotations.head()   

In [None]:
print(f'El numero de acordes que vamos a tener en total es {(273*6)+(576*10)} por lo que vamos a necesitar 4 digitos')