# Dataset split

This file divides our dataset into train, validation and test set. We can't use preprogrammed functions to do that, because we divided each of our files into other smallers (i.e. one sound to six images). Putting images made out of same mp3 file might lead to the *data leakage* and make our results *not trustworthy* and *biased*.

In [1]:
import os

# define number of files for all sets
train = 0.8 # 80% of all sound should be in the train set
val = 0.1 # 10% validation set
test = 0.1 # 10% test set
kfolds = 1
basePath="../data/xeno-canto-dataset-full-all-Countries/" # path with sound files. Can be downloaded with "AM_downloadData"
imPath = "../data/mels-27class/" # path with images (melspectrograms)
                                # Can be generated with "AM_prepareData" after downloading sound files
destPath = "data/27_class/" # destination path - where the split dataset should be copied
                            # This folder will be used to train CNNs

# first find all of the mp3 files in the directory
birds=[] # list of all bird spiecies (Ember, Phyll...)
singleBirdList=[] # list of files for one bird
allFilesList=[] # list of all files for all birds. A list of singleBirdLists. 
for root, dirs, files in os.walk(basePath):
    if root == basePath:
        birds=dirs
print(birds)

trainSet=[]
testSet=[]
valSet=[]

birdsShort=[] # list of short file names
birdNumber=0
for nr,bird in enumerate(birds):
    for root, dirs, files in os.walk(basePath+bird):
        for file in files:
            if file.endswith(".mp3"):
                singleBirdList.append(os.path.join(root, file))
    if len(singleBirdList) > 50:
        birdsShort.append(str(birdNumber)+bird[:5])
        birdNumber = birdNumber+1;
        print("Found ", len(singleBirdList), ' mp3 files for ', bird)
        trainSet.append(int(train*len(singleBirdList)))
        valSet.append(int(val*len(singleBirdList)))
        roundDiff = len(singleBirdList)-(int(train*len(singleBirdList))+int(test*len(singleBirdList))+int(val*len(singleBirdList))) 
        testSet.append(int(test*len(singleBirdList)) + roundDiff)
        print("Size of train: ", int(train*len(singleBirdList)),", val: ",int(val*len(singleBirdList)) ,", test: ",int(test*len(singleBirdList)))
        allFilesList.append(singleBirdList)
    singleBirdList=[]


print(trainSet)
print(valSet)
print(testSet)


['Parusmajor', 'Turduspilaris', 'Passerdomesticus', 'Luscinialuscinia', 'Dendrocoposmajor', 'Corvusmonedula', 'Phoenicurusphoenicurus', 'Erithacusrubecula', 'Picapica', 'Phoenicurusochruros', 'Garrulusglandarius', 'Passermontanus', 'Corvuscorone', 'Corvusfrugilegus', 'Coccothraustescoccothraustes', 'Sittaeuropaea', 'Alaudaarvensis', 'Streptopeliadecaocto', 'Phylloscopustrochilus', 'Delichonurbicum', 'Turdusphilomelos', 'Phylloscopuscollybita', 'Fringillacoelebs', 'Sturnusvulgaris', 'Emberizacitrinella', 'Columbapalumbus', 'Troglodytestroglodytes', 'Paruscaeruleus', 'Apusapus', 'Cardueliscarduelis', 'Chlorischloris', 'Motacillaalba', 'Turdusmerula']
Found  1931  mp3 files for  Parusmajor
Size of train:  1544 , val:  193 , test:  193
Found  123  mp3 files for  Turduspilaris
Size of train:  98 , val:  12 , test:  12
Found  384  mp3 files for  Passerdomesticus
Size of train:  307 , val:  38 , test:  38
Found  396  mp3 files for  Luscinialuscinia
Size of train:  316 , val:  39 , test:  39
F

In [2]:
# randomly choose mp3 files for each set  

from random import sample

trainFiles=[]
valFiles=[]
testFiles=[]

for index, singleBirdList in enumerate(allFilesList):
    randFiles = sample(range(len(singleBirdList)), len(singleBirdList))
    start = 0
    end = trainSet[index]
    trainFiles.append(randFiles[start:end])
    start = end 
    end = start + valSet[index]
    valFiles.append(randFiles[start:end])
    start = end 
    end = start + testSet[index] 
    testFiles.append(randFiles[start:end])
    print("Selected random files number:\n train: ", len(trainFiles[index]),"/",trainSet[index],
          ", val: ",len(valFiles[index]),"/",valSet[index],
          ", test: ",len(testFiles[index]),"/",testSet[index])



Selected random files number:
 train:  1544 / 1544 , val:  193 / 193 , test:  194 / 194
Selected random files number:
 train:  98 / 98 , val:  12 / 12 , test:  13 / 13
Selected random files number:
 train:  307 / 307 , val:  38 / 38 , test:  39 / 39
Selected random files number:
 train:  316 / 316 , val:  39 / 39 , test:  41 / 41
Selected random files number:
 train:  414 / 414 , val:  51 / 51 , test:  53 / 53
Selected random files number:
 train:  1269 / 1269 , val:  158 / 158 , test:  160 / 160
Selected random files number:
 train:  44 / 44 , val:  5 / 5 , test:  7 / 7
Selected random files number:
 train:  286 / 286 , val:  35 / 35 , test:  37 / 37
Selected random files number:
 train:  108 / 108 , val:  13 / 13 , test:  15 / 15
Selected random files number:
 train:  146 / 146 , val:  18 / 18 , test:  19 / 19
Selected random files number:
 train:  71 / 71 , val:  8 / 8 , test:  10 / 10
Selected random files number:
 train:  311 / 311 , val:  38 / 38 , test:  40 / 40
Selected random 

# Copying files to a new directory
In previous blocks we've randomly selected which files will belong to which sets. Now we have to find corresponding images to those files and copy them to new directory.

* new directory - i.e. ..data/mels/final
    - train 
        - one folder for each class
        - Ember
        - Phyll
        - ...
    - val
        - folders for each class
    - test
        - folders for each class

In [3]:
def extractName(string):
    return string.rsplit('/',1)[1].replace(' ', '')[:-4]

In [4]:
# sort all the lists to make copying files easier
sets=[trainFiles,valFiles,testFiles]
for fileSet in sets:
    for index,files in enumerate(fileSet):
        fileSet[index].sort()

In [5]:
# change full names to short

for root, dirs, files in os.walk(basePath):
    if root == basePath:
        birds=dirs
#birdsShort=[]
#for bird in birds:
#    birdsShort.append(bird[:5])

setNames = ["train/","val/","test/"]

print("Long: ", birds,"\nShort: ",birdsShort)


Long:  ['Parusmajor', 'Turduspilaris', 'Passerdomesticus', 'Luscinialuscinia', 'Dendrocoposmajor', 'Corvusmonedula', 'Phoenicurusphoenicurus', 'Erithacusrubecula', 'Picapica', 'Phoenicurusochruros', 'Garrulusglandarius', 'Passermontanus', 'Corvuscorone', 'Corvusfrugilegus', 'Coccothraustescoccothraustes', 'Sittaeuropaea', 'Alaudaarvensis', 'Streptopeliadecaocto', 'Phylloscopustrochilus', 'Delichonurbicum', 'Turdusphilomelos', 'Phylloscopuscollybita', 'Fringillacoelebs', 'Sturnusvulgaris', 'Emberizacitrinella', 'Columbapalumbus', 'Troglodytestroglodytes', 'Paruscaeruleus', 'Apusapus', 'Cardueliscarduelis', 'Chlorischloris', 'Motacillaalba', 'Turdusmerula'] 
Short:  ['0Parus', '1Turdu', '2Passe', '3Lusci', '4Phoen', '5Erith', '6Picap', '7Phoen', '8Garru', '9Passe', '10Cocco', '11Sitta', '12Alaud', '13Strep', '14Phyll', '15Delic', '16Turdu', '17Phyll', '18Fring', '19Sturn', '20Ember', '21Colum', '22Trogl', '23Cardu', '24Chlor', '25Motac', '26Turdu']


In [6]:

import shutil

counter=0
for birdNumber, bird in enumerate(birdsShort): # for each class (bird) check where the file should be copied
    print(counter)
    counter=0
    for setName, fileSet in zip(setNames, sets): # check for all datasets: train, val and test sests 
        for setNumber in fileSet[birdNumber]:
            
            for fileNumber, file in enumerate(allFilesList[birdNumber]):
                if setNumber == fileNumber: # if file number to copy is same as number of file, then copy it
                    
                    for root, dirs, files in os.walk(imPath):
                        for file2 in files:
                            if extractName(file) in file2:
                                counter=counter+1
                                source=root+"/"+file2
                                
                                destination = destPath+setName+bird+"/"
                                if not os.path.exists(destination):
                                    os.makedirs(destination)
                                shutil.copy2(source, destination)
                                #print(source, "   ->   ", destination)

            

0
29767
1770
4897
9698
10656
44221
516
5540
2225
3463
988
4220
13610
3462
53258
1205
82631
23903
38499
10644
36184
3033
21386
10390
8775
1554
