# DataSet Preparation Utils
by Wael Radwan

#### Library used to prepare datasets from data
continans various tools to prepare the training dataset and testing dataset

mainly used to randomize files of the dataset
divide the dataset into traininig and testing dataset
randomize the test files

<b> the main idea here is to load the dataset into classified directories then use this utility to create train/test dataset with the randomization </b>

In [1]:
import os
import re
import random
import ntpath
import shutil
import pandas as pd
import numpy as np

#### Input description
input to this tool is a dataset of files divided into classes in the following order
- class_1:
    - file_1
    - file_2
    - .
    - .
- class_2
    - file_1
    - file_2
    - .
    - .
- .
- .


- class_n 
    - file_1
    - file_2
    - .
    - .


#### Output description
output shoud be the data divided into train and test sets  in the following order

- train
    - class_1_001
    - class_1_002
    - .
    - .
    - class_2_001
    - class_2_002
    - .
    - .
    - .
    - .
    - class_n_001
    - class_n_002
    - .
    - .
 
- test
    - class_1_001
    - class_1_002
    - .
    - .
    - class_2_001
    - class_2_002
    - .
    - .
    - .
    - .
    - class_n_001
    - class_n_002
    - .
    - .
 
    - class_label.csv (a csv file contains the filename and the label so it can be used for testing comparison)

#### create panda from the files dataset
mainly used for the test dataset to generate the file name and the label based on the label criteria
randomize is set to true if the files are ranomized in the dataset instead of ordered

In [2]:
def list_files_with_labels(path, randomize=False):
    dataset = []
    files = sorted(os.listdir(path))
    if not files:
        return dataset
    file = [os.path.splitext(x)[0] for x in files]
    ext = [os.path.splitext(x)[-1] for x in files]
    class_name = ntpath.basename(path)
    

    # create panda dataframe for file sorted by their name

    dataset = pd.DataFrame(data={"file":file,"ext":ext, "label":class_name})
    dataset = dataset[['file', 'ext', 'label']]

    # if randomize is set to True then shuffle the dataset dataframe
    if (randomize) :
        dataset= dataset.sample(frac=1)
        dataset = dataset.sample(frac=1).reset_index(drop=True)

    return dataset


#### create list of the dataset with new name for each file based on a sequence
randomize or order the files in the dataset then create a new name based on sequence number

- path      : path to the class
- seqlen    : is the lenght of the sequence so if seqlen = 3 the new file names follow the sequence 001, 002, 003, ...
- randomize : either to randomize the files in the class set or not (default is not to randomize) 
- retrun panda structure with new file name 

In [3]:
def list_with_new_file_name_seq(path,  seqlen, randomize=False,):
    df = []  
    dataset = list_files_with_labels( path , randomize)
    #if not dataset:
     #   return
    
    # recreate index in case the dataset has been randomize
    # so the seq_number will be reflected in out file_name
    df = dataset.sample(frac=1).reset_index(drop=True)
    
    # new file name will be classid_seq(of seq_len).ext
    # for example 
    # 1_0001.mp3
    # 1_0002.mp3
    df['newfilename'] = df['label']+'_'+(df.index + 1).astype(str).str.zfill(seqlen) + df.ext
    
    # df.index = df.index.map(str) 
    return df

#### Rename files 
After creating the dataframe with proposed renamed files then do the rename files in the dataset
then rename all the files to _tmp files in order not to overwrite files that already exist 


In [4]:
def rename_files_dataset(path, seqlen):
    # first list the files in the input dir and put the porposed name in the dataframe
    data = list_with_new_file_name_seq( path,seqlen)
    #if not data:
     #   return
    
    # track the seq
    i = 1
    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        tmp_file_name = path+'/'+row['newfilename'] + '_tmp'
        orig_file_name =  path+'/'+row['file'] + row['ext']
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
        os.rename(orig_file_name, tmp_file_name)
        tmp_file_names.append(tmp_file_name)
        
    # now remove _tmp files    
    for temp_file in tmp_file_names:
        print('rename %s to %s'  % (temp_file, temp_file.rsplit('_tmp', -1)[0]) )
        os.rename(temp_file, temp_file.rsplit('_tmp', -1)[0]) 
    return

##### splitt the dataset into training and test
here split the dataset into training and testing 
- path : path to the class
- train_path : path to the train dir
- test_path : path to the test dir
- train_files_count : is the number of files in each class to be moved to the training dir
- randomize : either to randomize the files in the class set or not (default is not to randomize)

the output will be copying the data into train and test directories

In [5]:
def split_into_train_test( path, train_path, test_path,train_files_count, randomize=True):
    # read the files in the origianl path ordered by there name
    data = list_files_with_labels( path,randomize)
    # copy the dir name of the class 
    # this only copy the current directory name (without the path)
    dir_name = ntpath.basename(path)
    
    # track the seq
    i = 1
    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        orig_file_name =  path+'/'+row['file'] + row['ext']
        if (index<train_files_count):
            tmp_file_name = train_path+'/'+row['file']  + row['ext']
        else :
            tmp_file_name = test_path+'/'+row['file']  + row['ext']
       
        print('copy %s to %s'  % (orig_file_name, tmp_file_name) )
        shutil.copy(orig_file_name, tmp_file_name)
    return

In [50]:
train_path='/ds/dataset/quran/train'
test_path='/ds/dataset/quran/test'
split_into_train_test( path, train_path, test_path,10)

copy /ds/dataset/quran/999/999_0032.mp3 to /ds/dataset/quran/train/999_0032.mp3
copy /ds/dataset/quran/999/999_0024.mp3 to /ds/dataset/quran/train/999_0024.mp3
copy /ds/dataset/quran/999/999_0025.mp3 to /ds/dataset/quran/train/999_0025.mp3
copy /ds/dataset/quran/999/999_0004.mp3 to /ds/dataset/quran/train/999_0004.mp3
copy /ds/dataset/quran/999/999_0003.mp3 to /ds/dataset/quran/train/999_0003.mp3
copy /ds/dataset/quran/999/999_0030.mp3 to /ds/dataset/quran/train/999_0030.mp3
copy /ds/dataset/quran/999/999_0005.mp3 to /ds/dataset/quran/train/999_0005.mp3
copy /ds/dataset/quran/999/999_0021.mp3 to /ds/dataset/quran/train/999_0021.mp3
copy /ds/dataset/quran/999/999_0020.mp3 to /ds/dataset/quran/train/999_0020.mp3
copy /ds/dataset/quran/999/999_0017.mp3 to /ds/dataset/quran/train/999_0017.mp3
copy /ds/dataset/quran/999/999_0002.mp3 to /ds/dataset/quran/test/999_0002.mp3
copy /ds/dataset/quran/999/999_0022.mp3 to /ds/dataset/quran/test/999_0022.mp3
copy /ds/dataset/quran/999/999_0011.mp3 to

##### shuffle  dataset  files

shuffle files in any dataset then rename all the files to _tmp files in order not to overwrite files that already exist

** this can be used to shuffle any directory but the main reason here is to make sure that the  dataset is shuffled in order to avoid overfitting 

In [6]:
def shuffle_dataset(path):
    # first list the files in the input dir and put the porposed name in the dataframe
    data = list_files_with_labels( path,randomize=True)
    # create a new column with the random file selected from the same dataset
    # so the files will be shuffled
    data['newfile'] = data["file"].transform(np.random.permutation)


     
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        tmp_file_name = path+'/'+row['newfile'] + row['ext'] + '_tmp'
        orig_file_name =  path+'/'+row['file'] + row['ext']
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
        os.rename(orig_file_name, tmp_file_name)
        tmp_file_names.append(tmp_file_name)
        
    # now remove _tmp f    
    for temp_file in tmp_file_names:
        print('rename %s to %s'  % (temp_file, temp_file.rsplit('_tmp', -1)[0]) )
        os.rename(temp_file, temp_file.rsplit('_tmp', -1)[0])
    return

###### to be continued
from here we will continue by adding more features
-  generate a csv description file for test to be used kaggle 

##### Generate the files / lables csv file
finally generate the csv file that holds the correct mapping of the files and thier correct labels this file should be used as a benchmark against files submitted

In [7]:
def generate_test_csv(test_path, csv_file_name):
    test_dataset = list_files_with_labels( test_path,randomize=True)
    test_dataset['fullfilename'] = test_dataset['file'] + test_dataset['ext']
    header = ["fullfilename", "label"]
    test_dataset.to_csv( csv_file_name, index=False, quoting=3 ,columns = header)
    return

#### create panda from test files dataset
mainly used for the test dataset to generate the file name and the label based on the label criteria randomize is set to true if the files are ranomized in the dataset instead of ordered
will return the panda with the original file name and the new file name 

In [8]:
def list_files_with_xlabels(path, randomize=False):
    dataset = []
    files = sorted(os.listdir(path))
    file = [re.split('[_ \.]+',x) for x in files]
    np.array(file).reshape(len(file),3)
    dataset = pd.DataFrame(np.array(file).reshape(len(file),3), columns =  ['label', 'file', 'ext'] )

    # if randomize is set to True then shuffle the dataset dataframe
    if (randomize) :
        dataset= dataset.sample(frac=1)
        dataset = dataset.sample(frac=1).reset_index(drop=True)

    return dataset

#### shuffle  test dataset  files

shuffle files in the <b>testing dataset </b> then rename all the files to _tmp files in order not to overwrite files that already exist

** this can be used to shuffle any directory but the main reason here is to make sure that the testing dataset is shuffled in order to avoid overfitting

In [9]:
def shuffle_test_dataset(path):
    # first list the files in the input dir and put the porposed name in the dataframe
    data = list_files_with_xlabels( path,randomize=True)

    # recreate index in case the dataset has been randomize
    # so the seq_number will be reflected in out file_name
    df = data.sample(frac=1).reset_index(drop=True)
    
    # new file name will be classid_seq(of seq_len).ext
    # for example 
    # 1_0001.mp3
    # 1_0002.mp3
    
    # find the maximum number of files then use this as a sequence length
    seqlen= len(str(len(df)))
    df['newfilename'] = (df.index + 1).astype(str).str.zfill(seqlen) + '.' +df.ext

    #if not data:
     #   return
    
    # track the seq
    i = 1
    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in df.iterrows():
        tmp_file_name = path+'/'+row['newfilename'] + '_tmp'
        orig_file_name =  path+'/'+row['label']+'_'+row['file'] +'.'+ row['ext']
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
        tmp_file_names.append(tmp_file_name)

        
    # now remove _tmp files    
    for temp_file in tmp_file_names:
        print('rename %s to %s'  % (temp_file, temp_file.rsplit('_tmp', -1)[0]) )
        #os.rename(temp_file, temp_file.rsplit('_tmp', -1)[0]) 
    return df

#### Generate the files / lables csv file
finally generate the csv file that holds the correct mapping of the files and thier correct labels this file should be used as a benchmark against files submitted additionally a new csv file is created with the new and old names in order to keep track of the files.


In [28]:
def generate_test_csv_with_original(path,csv_file_name):
    # gat the panda with orignal and new file names
    data = list_files_with_xlabels( path,randomize=True)
    header = ["file", "label"]
    data.to_csv( csv_file_name, index=False, quoting=3 ,columns = header)
    return

#### Example code
the following sample code could be used to generate train / test dataset from the dataset

-- you can copy it and paste it in the code 

In [47]:
# first of all define the location of the files
print('----Start----')
path='/ds/dataset/quran/112'
train_path='/ds/dataset/quran/train'
test_path='/ds/dataset/quran/test'
csv_file_name = '/ds/dataset/quran/test_dataset.csv'

#first rename files in the path to sequenced files prefixed with label 
print('rename file in dataset %s' % (path))

rename_files_dataset(path, 4)

# suffle the files in the path
print('shuffle files in dataset %s' % (path))
shuffle_dataset(path)

# then split into train and test
print('then split into train and test')
split_into_train_test( path, train_path, test_path,20)

# suffle the files in the test
print('shuffle test dataset')
shuffle_test_dataset(test_path)

# write csv file
csv_file_name ='test_dataset.csv'
csv_file= path+'/'+csv_file_name
generate_test_csv(test_path, csv_file)

----Start----
rename file in dataset /ds/dataset/quran/112
rename /ds/dataset/quran/112/112_0079.mp3 to /ds/dataset/quran/112/112_0001.mp3_tmp
rename /ds/dataset/quran/112/112_0363.mp3 to /ds/dataset/quran/112/112_0002.mp3_tmp
rename /ds/dataset/quran/112/112_0300.mp3 to /ds/dataset/quran/112/112_0003.mp3_tmp
rename /ds/dataset/quran/112/112_0049.mp3 to /ds/dataset/quran/112/112_0004.mp3_tmp
rename /ds/dataset/quran/112/112_0223.mp3 to /ds/dataset/quran/112/112_0005.mp3_tmp
rename /ds/dataset/quran/112/112_0431.mp3 to /ds/dataset/quran/112/112_0006.mp3_tmp
rename /ds/dataset/quran/112/112_0168.mp3 to /ds/dataset/quran/112/112_0007.mp3_tmp
rename /ds/dataset/quran/112/112_0151.mp3 to /ds/dataset/quran/112/112_0008.mp3_tmp
rename /ds/dataset/quran/112/112_0399.mp3 to /ds/dataset/quran/112/112_0009.mp3_tmp
rename /ds/dataset/quran/112/112_0051.mp3 to /ds/dataset/quran/112/112_0010.mp3_tmp
rename /ds/dataset/quran/112/112_0084.mp3 to /ds/dataset/quran/112/112_0011.mp3_tmp
rename /ds/datase

rename /ds/dataset/quran/112/112_0090.mp3 to /ds/dataset/quran/112/112_0133.mp3_tmp
rename /ds/dataset/quran/112/112_0025.mp3 to /ds/dataset/quran/112/112_0153.mp3_tmp
rename /ds/dataset/quran/112/112_0318.mp3 to /ds/dataset/quran/112/112_0114.mp3_tmp
rename /ds/dataset/quran/112/112_0281.mp3 to /ds/dataset/quran/112/112_0150.mp3_tmp
rename /ds/dataset/quran/112/112_0433.mp3 to /ds/dataset/quran/112/112_0117.mp3_tmp
rename /ds/dataset/quran/112/112_0154.mp3 to /ds/dataset/quran/112/112_0189.mp3_tmp
rename /ds/dataset/quran/112/112_0456.mp3 to /ds/dataset/quran/112/112_0255.mp3_tmp
rename /ds/dataset/quran/112/112_0053.mp3 to /ds/dataset/quran/112/112_0361.mp3_tmp
rename /ds/dataset/quran/112/112_0459.mp3 to /ds/dataset/quran/112/112_0062.mp3_tmp
rename /ds/dataset/quran/112/112_0193.mp3 to /ds/dataset/quran/112/112_0326.mp3_tmp
rename /ds/dataset/quran/112/112_0284.mp3 to /ds/dataset/quran/112/112_0126.mp3_tmp
rename /ds/dataset/quran/112/112_0006.mp3 to /ds/dataset/quran/112/112_0069.

copy /ds/dataset/quran/112/112_0151.mp3 to /ds/dataset/quran/train/112_0151.mp3
copy /ds/dataset/quran/112/112_0163.mp3 to /ds/dataset/quran/train/112_0163.mp3
copy /ds/dataset/quran/112/112_0071.mp3 to /ds/dataset/quran/train/112_0071.mp3
copy /ds/dataset/quran/112/112_0433.mp3 to /ds/dataset/quran/train/112_0433.mp3
copy /ds/dataset/quran/112/112_0310.mp3 to /ds/dataset/quran/train/112_0310.mp3
copy /ds/dataset/quran/112/112_0140.mp3 to /ds/dataset/quran/train/112_0140.mp3
copy /ds/dataset/quran/112/112_0211.mp3 to /ds/dataset/quran/train/112_0211.mp3
copy /ds/dataset/quran/112/112_0339.mp3 to /ds/dataset/quran/train/112_0339.mp3
copy /ds/dataset/quran/112/112_0100.mp3 to /ds/dataset/quran/train/112_0100.mp3
copy /ds/dataset/quran/112/112_0039.mp3 to /ds/dataset/quran/test/112_0039.mp3
copy /ds/dataset/quran/112/112_0403.mp3 to /ds/dataset/quran/test/112_0403.mp3
copy /ds/dataset/quran/112/112_0017.mp3 to /ds/dataset/quran/test/112_0017.mp3
copy /ds/dataset/quran/112/112_0032.mp3 to 

copy /ds/dataset/quran/112/112_0315.mp3 to /ds/dataset/quran/test/112_0315.mp3
copy /ds/dataset/quran/112/112_0327.mp3 to /ds/dataset/quran/test/112_0327.mp3
copy /ds/dataset/quran/112/112_0298.mp3 to /ds/dataset/quran/test/112_0298.mp3
copy /ds/dataset/quran/112/112_0446.mp3 to /ds/dataset/quran/test/112_0446.mp3
copy /ds/dataset/quran/112/112_0340.mp3 to /ds/dataset/quran/test/112_0340.mp3
copy /ds/dataset/quran/112/112_0421.mp3 to /ds/dataset/quran/test/112_0421.mp3
copy /ds/dataset/quran/112/112_0110.mp3 to /ds/dataset/quran/test/112_0110.mp3
copy /ds/dataset/quran/112/112_0396.mp3 to /ds/dataset/quran/test/112_0396.mp3
copy /ds/dataset/quran/112/112_0181.mp3 to /ds/dataset/quran/test/112_0181.mp3
copy /ds/dataset/quran/112/112_0430.mp3 to /ds/dataset/quran/test/112_0430.mp3
copy /ds/dataset/quran/112/112_0133.mp3 to /ds/dataset/quran/test/112_0133.mp3
copy /ds/dataset/quran/112/112_0183.mp3 to /ds/dataset/quran/test/112_0183.mp3
copy /ds/dataset/quran/112/112_0171.mp3 to /ds/datas

copy /ds/dataset/quran/112/112_0111.mp3 to /ds/dataset/quran/test/112_0111.mp3
copy /ds/dataset/quran/112/112_0259.mp3 to /ds/dataset/quran/test/112_0259.mp3
copy /ds/dataset/quran/112/112_0067.mp3 to /ds/dataset/quran/test/112_0067.mp3
copy /ds/dataset/quran/112/112_0420.mp3 to /ds/dataset/quran/test/112_0420.mp3
copy /ds/dataset/quran/112/112_0395.mp3 to /ds/dataset/quran/test/112_0395.mp3
copy /ds/dataset/quran/112/112_0092.mp3 to /ds/dataset/quran/test/112_0092.mp3
copy /ds/dataset/quran/112/112_0070.mp3 to /ds/dataset/quran/test/112_0070.mp3
copy /ds/dataset/quran/112/112_0348.mp3 to /ds/dataset/quran/test/112_0348.mp3
copy /ds/dataset/quran/112/112_0370.mp3 to /ds/dataset/quran/test/112_0370.mp3
copy /ds/dataset/quran/112/112_0103.mp3 to /ds/dataset/quran/test/112_0103.mp3
copy /ds/dataset/quran/112/112_0434.mp3 to /ds/dataset/quran/test/112_0434.mp3
copy /ds/dataset/quran/112/112_0364.mp3 to /ds/dataset/quran/test/112_0364.mp3
copy /ds/dataset/quran/112/112_0260.mp3 to /ds/datas

copy /ds/dataset/quran/112/112_0226.mp3 to /ds/dataset/quran/test/112_0226.mp3
copy /ds/dataset/quran/112/112_0425.mp3 to /ds/dataset/quran/test/112_0425.mp3
copy /ds/dataset/quran/112/112_0019.mp3 to /ds/dataset/quran/test/112_0019.mp3
copy /ds/dataset/quran/112/112_0034.mp3 to /ds/dataset/quran/test/112_0034.mp3
copy /ds/dataset/quran/112/112_0262.mp3 to /ds/dataset/quran/test/112_0262.mp3
copy /ds/dataset/quran/112/112_0227.mp3 to /ds/dataset/quran/test/112_0227.mp3
copy /ds/dataset/quran/112/112_0392.mp3 to /ds/dataset/quran/test/112_0392.mp3
copy /ds/dataset/quran/112/112_0023.mp3 to /ds/dataset/quran/test/112_0023.mp3
copy /ds/dataset/quran/112/112_0345.mp3 to /ds/dataset/quran/test/112_0345.mp3
copy /ds/dataset/quran/112/112_0432.mp3 to /ds/dataset/quran/test/112_0432.mp3
copy /ds/dataset/quran/112/112_0173.mp3 to /ds/dataset/quran/test/112_0173.mp3
copy /ds/dataset/quran/112/112_0440.mp3 to /ds/dataset/quran/test/112_0440.mp3
copy /ds/dataset/quran/112/112_0457.mp3 to /ds/datas

In [11]:
# first of all define the location of the files
print('----Start----')
path='/ds/dataset/quran/112'
train_path='/ds/dataset/quran/train'
test_path='/ds/dataset/quran/test'
csv_file_name = '/ds/dataset/quran/test_dataset.csv'


----Start----


In [12]:

#first rename files in the path to sequenced files prefixed with label 
print('rename file in dataset %s' % (path))

rename_files_dataset(path, 4)


rename file in dataset /ds/dataset/quran/112
rename /ds/dataset/quran/112/112_171.mp3 to /ds/dataset/quran/112/112_0001.mp3_tmp
rename /ds/dataset/quran/112/112_231.mp3 to /ds/dataset/quran/112/112_0002.mp3_tmp
rename /ds/dataset/quran/112/112_uP_bY_mUSLEm.Ettounssi_18.mp3 to /ds/dataset/quran/112/112_0003.mp3_tmp
rename /ds/dataset/quran/112/112_uP_bY_mUSLEm_39.mp3 to /ds/dataset/quran/112/112_0004.mp3_tmp
rename /ds/dataset/quran/112/112_muslem.ettounsi-_1.mp3 to /ds/dataset/quran/112/112_0005.mp3_tmp
rename /ds/dataset/quran/112/112_166.mp3 to /ds/dataset/quran/112/112_0006.mp3_tmp
rename /ds/dataset/quran/112/112_211.mp3 to /ds/dataset/quran/112/112_0007.mp3_tmp
rename /ds/dataset/quran/112/112_250.mp3 to /ds/dataset/quran/112/112_0008.mp3_tmp
rename /ds/dataset/quran/112/112_uP_bY_mUSLEm_1.mp3 to /ds/dataset/quran/112/112_0009.mp3_tmp
rename /ds/dataset/quran/112/112_uP_bY_mUSLEm_20.mp3 to /ds/dataset/quran/112/112_0010.mp3_tmp
rename /ds/dataset/quran/112/Hisham-Ibn-Aamir_112.mp3

In [14]:

# suffle the files in the path
print('shuffle files in dataset %s' % (path))
shuffle_dataset(path)



shuffle files in dataset /ds/dataset/quran/112
rename /ds/dataset/quran/112/112_0163.mp3 to /ds/dataset/quran/112/112_0020.mp3_tmp
rename /ds/dataset/quran/112/112_0433.mp3 to /ds/dataset/quran/112/112_0333.mp3_tmp
rename /ds/dataset/quran/112/112_0366.mp3 to /ds/dataset/quran/112/112_0378.mp3_tmp
rename /ds/dataset/quran/112/112_0062.mp3 to /ds/dataset/quran/112/112_0116.mp3_tmp
rename /ds/dataset/quran/112/112_0037.mp3 to /ds/dataset/quran/112/112_0343.mp3_tmp
rename /ds/dataset/quran/112/112_0010.mp3 to /ds/dataset/quran/112/112_0196.mp3_tmp
rename /ds/dataset/quran/112/112_0403.mp3 to /ds/dataset/quran/112/112_0215.mp3_tmp
rename /ds/dataset/quran/112/112_0427.mp3 to /ds/dataset/quran/112/112_0081.mp3_tmp
rename /ds/dataset/quran/112/112_0286.mp3 to /ds/dataset/quran/112/112_0144.mp3_tmp
rename /ds/dataset/quran/112/112_0170.mp3 to /ds/dataset/quran/112/112_0083.mp3_tmp
rename /ds/dataset/quran/112/112_0445.mp3 to /ds/dataset/quran/112/112_0406.mp3_tmp
rename /ds/dataset/quran/112/

rename /ds/dataset/quran/112/112_0124.mp3_tmp to /ds/dataset/quran/112/112_0124.mp3
rename /ds/dataset/quran/112/112_0082.mp3_tmp to /ds/dataset/quran/112/112_0082.mp3
rename /ds/dataset/quran/112/112_0195.mp3_tmp to /ds/dataset/quran/112/112_0195.mp3
rename /ds/dataset/quran/112/112_0426.mp3_tmp to /ds/dataset/quran/112/112_0426.mp3
rename /ds/dataset/quran/112/112_0303.mp3_tmp to /ds/dataset/quran/112/112_0303.mp3
rename /ds/dataset/quran/112/112_0352.mp3_tmp to /ds/dataset/quran/112/112_0352.mp3
rename /ds/dataset/quran/112/112_0331.mp3_tmp to /ds/dataset/quran/112/112_0331.mp3
rename /ds/dataset/quran/112/112_0073.mp3_tmp to /ds/dataset/quran/112/112_0073.mp3
rename /ds/dataset/quran/112/112_0283.mp3_tmp to /ds/dataset/quran/112/112_0283.mp3
rename /ds/dataset/quran/112/112_0025.mp3_tmp to /ds/dataset/quran/112/112_0025.mp3
rename /ds/dataset/quran/112/112_0065.mp3_tmp to /ds/dataset/quran/112/112_0065.mp3
rename /ds/dataset/quran/112/112_0117.mp3_tmp to /ds/dataset/quran/112/112_0

In [16]:
# then split into train and test
print('then split into train and test')
split_into_train_test( path, train_path, test_path,400)



then split into train and test
copy /ds/dataset/quran/112/112_0117.mp3 to /ds/dataset/quran/train/112_0117.mp3
copy /ds/dataset/quran/112/112_0387.mp3 to /ds/dataset/quran/train/112_0387.mp3
copy /ds/dataset/quran/112/112_0202.mp3 to /ds/dataset/quran/train/112_0202.mp3
copy /ds/dataset/quran/112/112_0280.mp3 to /ds/dataset/quran/train/112_0280.mp3
copy /ds/dataset/quran/112/112_0079.mp3 to /ds/dataset/quran/train/112_0079.mp3
copy /ds/dataset/quran/112/112_0326.mp3 to /ds/dataset/quran/train/112_0326.mp3
copy /ds/dataset/quran/112/112_0093.mp3 to /ds/dataset/quran/train/112_0093.mp3
copy /ds/dataset/quran/112/112_0001.mp3 to /ds/dataset/quran/train/112_0001.mp3
copy /ds/dataset/quran/112/112_0187.mp3 to /ds/dataset/quran/train/112_0187.mp3
copy /ds/dataset/quran/112/112_0300.mp3 to /ds/dataset/quran/train/112_0300.mp3
copy /ds/dataset/quran/112/112_0302.mp3 to /ds/dataset/quran/train/112_0302.mp3
copy /ds/dataset/quran/112/112_0186.mp3 to /ds/dataset/quran/train/112_0186.mp3
copy /ds/

copy /ds/dataset/quran/112/112_0211.mp3 to /ds/dataset/quran/train/112_0211.mp3
copy /ds/dataset/quran/112/112_0150.mp3 to /ds/dataset/quran/train/112_0150.mp3
copy /ds/dataset/quran/112/112_0040.mp3 to /ds/dataset/quran/train/112_0040.mp3
copy /ds/dataset/quran/112/112_0057.mp3 to /ds/dataset/quran/train/112_0057.mp3
copy /ds/dataset/quran/112/112_0359.mp3 to /ds/dataset/quran/train/112_0359.mp3
copy /ds/dataset/quran/112/112_0196.mp3 to /ds/dataset/quran/train/112_0196.mp3
copy /ds/dataset/quran/112/112_0443.mp3 to /ds/dataset/quran/train/112_0443.mp3
copy /ds/dataset/quran/112/112_0323.mp3 to /ds/dataset/quran/train/112_0323.mp3
copy /ds/dataset/quran/112/112_0391.mp3 to /ds/dataset/quran/train/112_0391.mp3
copy /ds/dataset/quran/112/112_0328.mp3 to /ds/dataset/quran/train/112_0328.mp3
copy /ds/dataset/quran/112/112_0238.mp3 to /ds/dataset/quran/train/112_0238.mp3
copy /ds/dataset/quran/112/112_0053.mp3 to /ds/dataset/quran/train/112_0053.mp3
copy /ds/dataset/quran/112/112_0349.mp3 

In [17]:
# suffle the files in the test
print('shuffle test dataset')
shuffle_test_dataset(test_path)



shuffle test dataset
rename /ds/dataset/quran/test/112_0206.mp3 to /ds/dataset/quran/test/01.mp3_tmp
rename /ds/dataset/quran/test/112_0455.mp3 to /ds/dataset/quran/test/02.mp3_tmp
rename /ds/dataset/quran/test/112_0023.mp3 to /ds/dataset/quran/test/03.mp3_tmp
rename /ds/dataset/quran/test/112_0055.mp3 to /ds/dataset/quran/test/04.mp3_tmp
rename /ds/dataset/quran/test/112_0164.mp3 to /ds/dataset/quran/test/05.mp3_tmp
rename /ds/dataset/quran/test/112_0111.mp3 to /ds/dataset/quran/test/06.mp3_tmp
rename /ds/dataset/quran/test/112_0084.mp3 to /ds/dataset/quran/test/07.mp3_tmp
rename /ds/dataset/quran/test/112_0165.mp3 to /ds/dataset/quran/test/08.mp3_tmp
rename /ds/dataset/quran/test/112_0297.mp3 to /ds/dataset/quran/test/09.mp3_tmp
rename /ds/dataset/quran/test/112_0285.mp3 to /ds/dataset/quran/test/10.mp3_tmp
rename /ds/dataset/quran/test/112_0410.mp3 to /ds/dataset/quran/test/11.mp3_tmp
rename /ds/dataset/quran/test/112_0271.mp3 to /ds/dataset/quran/test/12.mp3_tmp
rename /ds/dataset/

Unnamed: 0,label,file,ext,newfilename
0,112,206,mp3,01.mp3
1,112,455,mp3,02.mp3
2,112,23,mp3,03.mp3
3,112,55,mp3,04.mp3
4,112,164,mp3,05.mp3
5,112,111,mp3,06.mp3
6,112,84,mp3,07.mp3
7,112,165,mp3,08.mp3
8,112,297,mp3,09.mp3
9,112,285,mp3,10.mp3


In [29]:
# write csv file
csv_file_name ='test_dataset.csv'
csv_file= path+'/'+csv_file_name
generate_test_csv_with_original(test_path, csv_file)

###### list all dataset and convert the file names (

In [12]:
input_dir= '/home/ubuntu/dataset'

# list the audio directory
lst = os.listdir(input_dir)
lst.sort()
for filename in lst:
    full_dir_name = os.path.join(input_dir, filename)
    if (os.path.isdir(full_dir_name)) :
        if (filename.startswith("1")):
            print("Dir: %s " % (full_dir_name))
            #first rename files in the path to sequenced files prefixed with label 
            print('rename file in dataset %s' % (full_dir_name))
            
            rename_files_dataset(full_dir_name, 4)
            
            # suffle the files in the path
            print('shuffle files in dataset %s' % (full_dir_name))
            shuffle_dataset(full_dir_name)

FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/dataset'