In [1]:
from pydub import AudioSegment # need pip install pydub
import librosa # need pip install librosa

import os
import glob

import numpy as np
import pandas as pd
import re

### Preprocessing mp3 file(name)s

In [2]:
#move all files from ../Muziek and its subdirectories to one directory with all songs, ../Music
#My file structure for this to work:
'''
Project--|scripts -- this notebook
         |Muziek -- with subfolders as downloaded from drive (can be removed after executing commented code in this block)
         |Music -- empty (at this point)
         |annotations -- empty (at this point)
         |Annotations.csv
''';


#for filename in glob.glob('../Muziek/**/*.mp3', recursive=True):
#    os.rename(filename, '../Music/'+filename.split('/')[-1])


In [3]:
path = '../Music/'
#remove (things in brackets) and non-alphanumerics (except .mp3 for the extension)
#E.g. '01-1 Home_s Nice's U.S.S.R. (Remastered).mp3' -> '011HomesNicesUSSR.mp3'
[os.rename(path+f, path+re.sub('\(.*\)|(?:\.(?!mp3))|[^\w.]|_','',f)) for f in os.listdir(path)]
# '011HomesNicesUSSR.mp3' -> 'HomesNicesUSSR.mp3'
[os.rename(path+f, path+re.sub('^\d+\s*','',f)) for f in os.listdir(path)];

#### Trim silence

In [4]:
path = '../Music/'
for i,file in enumerate(os.listdir(path)): #for each song in the specified path:
    new_path = '../Trimmed/'
    if file[:-4]+'.wav' not in os.listdir(new_path): #if it not yet in the trimmed directory:
        filename = new_path+file[:-4]+'.wav'
        song,sr = librosa.load(path+file) # load song with librosa
        trimmed,_ = librosa.effects.trim(song) # trim silence from song
        librosa.output.write_wav(filename, trimmed, sr) # save trimmed song to trimmed directory

In [5]:
print("Number of songs in dataset: {}".format(len(os.listdir('../Trimmed/'))))

Number of songs in dataset: 260


### Preprocessing annotation data

In [6]:
#load segment csv
annotations = pd.read_excel('../final_annotations.xlsx', sep=';', names=['Artist', 'Album', 'Song', 'Oldstart', 'Oldstop', 'Label', 'Start', 'Stop'])

In [7]:
annotations = annotations.drop(['Oldstart', 'Oldstop'], axis=1).dropna(how='any')

In [8]:
annotations.head(10)

Unnamed: 0,Artist,Album,Song,Label,Start,Stop
0,Carole King,Tapestry,01 I Feel The Earth Move,refrain,8.224,23.996
1,Carole King,Tapestry,01 I Feel The Earth Move,refrain,40.512,60.482
2,Carole King,Tapestry,01 I Feel The Earth Move,refrain,116.568,148.312
3,Carole King,Tapestry,02 So Far Away,refrain,13.045,33.318
4,Carole King,Tapestry,02 So Far Away,refrain,33.318,60.226
5,Carole King,Tapestry,02 So Far Away,refrain,87.025,120.244
6,Carole King,Tapestry,02 So Far Away,refrain,150.159,183.969
7,Carole King,Tapestry,03 It's Too Late,verse,0.0,28.736
8,Carole King,Tapestry,03 It's Too Late,refrain,28.736,47.388
9,Carole King,Tapestry,03 It's Too Late,verse,47.388,75.473


In [9]:
#convert timestamps to milisec
annotations[['Start', 'Stop']] = annotations[['Start', 'Stop']].applymap(lambda x: int(x*1000))

#remove (thing in brackets) and _ and ' from song names in annotation data
annotations.Song = annotations.Song.str.replace('\(.*\)|\W|_', '', regex=True)
annotations.Song = annotations.Song.str.replace('^\d+\s*', '', regex=True)

In [10]:
print("Number of annotations before preprocessing: {}".format(len(annotations)))

#Make labels with verse and chorus uniform
annotations.loc[annotations['Label'].str.contains('verse'), 'Label'] = 'verse'
annotations.loc[annotations['Label'].str.contains('chorus'), 'Label'] = 'chorus'
annotations.loc[annotations['Label'].str.contains('refrain'), 'Label'] = 'chorus'

#Remove non-relevant labels (should not be needed in the final annotation file)
annotations = annotations[annotations.Label.str.contains('chorus|verse', regex=True)]

print("Number of annotations containing verse/chorus: {}".format(len(annotations)))

Number of annotations before preprocessing: 1480
Number of annotations containing verse/chorus: 1480


In [11]:
#remove songs that do not have both a chorus and a verse:
for song in annotations.Song.unique():
    labels = annotations[annotations.Song==song].Label.unique() #all labels in song
    if 'chorus' not in labels or 'verse' not in labels: #if chorus or verse missing:
        annotations = annotations.loc[annotations.Song!=song] #remove song
        
print("Number of songs containing both verse/chorus: {}".format(len(annotations.Song.unique())))

Number of songs containing both verse/chorus: 156


In [12]:
#preview of dataset
annotations.head()

Unnamed: 0,Artist,Album,Song,Label,Start,Stop
7,Carole King,Tapestry,ItsTooLate,verse,0,28735
8,Carole King,Tapestry,ItsTooLate,chorus,28735,47388
9,Carole King,Tapestry,ItsTooLate,verse,47388,75473
10,Carole King,Tapestry,ItsTooLate,chorus,75473,94025
11,Carole King,Tapestry,ItsTooLate,verse,112300,148847


### Cutting up songs into annotations

In [13]:
#1 verse and chorus per song:

for song in annotations.Song.unique(): # For each song in the annotation file
    try:
        sound = AudioSegment.from_wav('../Trimmed/{}.wav'.format(song)) #Load the song with pydub
        to_add = {'verse': True, 'chorus': True} # LUT for which segments do we still need to extract
        # For each row in the annotations corresponding to the selected song:
        for _,row in annotations[annotations.Song==song].reset_index().iterrows(): 
            segment = sound[row.Start:row.Stop] # cut out the segment from the song
            kind = row.Label #'verse' or 'chorus'
            if(to_add[kind]): # if we still need this segment (chorus / verse)
                segment.export("../segments/{}_{}.mp3".format(song, kind), format="mp3") #export segment
                to_add[kind] = False #No longer need to extract this label
    except:
        print('failed on song: '+song) #Song is in annotations, but not in audio directory.

In [14]:
#All choruses and verses from each song:
#(NOT USED)


'''
for song in annotations.Song.unique():
    try:
        sound = AudioSegment.from_mp3('../Music/{}.mp3'.format(song)) #get the corresponding song
        verse_i = 1
        chorus_i = 1
        for _,row in annotations[annotations.Song==song].reset_index().iterrows():
            segment = sound[row.Start:row.Stop]
            segment.export("../annotations/{}_{}_{}.mp3".format\
                           (song, row.Label, verse_i if row.Label=='verse' else chorus_i), format="mp3")
            if row.Label=='verse':
                verse_i+=1
            else:
                chorus_i+=1
    except:
        print('failed on song: '+song)
''';