# DataSet Preparation Utils
by Wael Radwan

#### Library used to prepare datasets from data
continans various tools to prepare the training dataset and testing dataset

mainly used to randomize files of the dataset
divide the dataset into traininig and testing dataset
randomize the test files

<b> the main idea here is to load the dataset into classified directories then use this utility to create train/test dataset with the randomization </b>

In [45]:
import os
import re
import random
import ntpath
import shutil
import pandas as pd
import numpy as np

#### Input description
input to this tool is a dataset of files divided into classes in the following order
- class_1:
    - file_1
    - file_2
    - .
    - .
- class_2
    - file_1
    - file_2
    - .
    - .
- .
- .


- class_n 
    - file_1
    - file_2
    - .
    - .


#### Output description
output shoud be the data divided into train and test sets  in the following order

- train
    - class_1_001
    - class_1_002
    - .
    - .
    - class_2_001
    - class_2_002
    - .
    - .
    - .
    - .
    - class_n_001
    - class_n_002
    - .
    - .
 
- test
    - class_1_001
    - class_1_002
    - .
    - .
    - class_2_001
    - class_2_002
    - .
    - .
    - .
    - .
    - class_n_001
    - class_n_002
    - .
    - .
 
    - class_label.csv (a csv file contains the filename and the label so it can be used for testing comparison)

#### create panda from the files dataset
mainly used for the test dataset to generate the file name and the label based on the label criteria
randomize is set to true if the files are ranomized in the dataset instead of ordered

In [78]:
def list_files_with_labels(path, randomize=False):
    dataset = []
    files = sorted(os.listdir(path))
    if not files:
        return dataset
    file = [os.path.splitext(x)[0] for x in files]
    ext = [os.path.splitext(x)[-1] for x in files]
    class_name = ntpath.basename(path)
    

    # create panda dataframe for file sorted by their name

    dataset = pd.DataFrame(data={"file":file,"ext":ext, "label":class_name})
    dataset = dataset[['file', 'ext', 'label']]

    # if randomize is set to True then shuffle the dataset dataframe
    if (randomize) :
        dataset= dataset.sample(frac=1)
        dataset = dataset.sample(frac=1).reset_index(drop=True)

    return dataset


#### create list of the dataset with new name for each file based on a sequence
randomize or order the files in the dataset then create a new name based on sequence number

- path      : path to the class
- seqlen    : is the lenght of the sequence so if seqlen = 3 the new file names follow the sequence 001, 002, 003, ...
- randomize : either to randomize the files in the class set or not (default is not to randomize) 
- retrun panda structure with new file name 

In [80]:
def list_with_new_file_name_seq(path,  seqlen, randomize=False,):
    df = []  
    dataset = list_files_with_labels( path , randomize)
    if not dataset:
        return
    
    # recreate index in case the dataset has been randomize
    # so the seq_number will be reflected in out file_name
    df = dataset.sample(frac=1).reset_index(drop=True)
    
    # new file name will be classid_seq(of seq_len).ext
    # for example 
    # 1_0001.mp3
    # 1_0002.mp3
    df['newfilename'] = df['label']+'_'+(df.index + 1).astype(str).str.zfill(seqlen) + df.ext
    
    # df.index = df.index.map(str) 
    return df

In [76]:
path='/ds/dataset/quran/999'
dataset = list_files_with_labels( path,randomize=True)
data = list_with_new_file_name_seq( path,3)

In [67]:
dataset

Unnamed: 0,file,ext,label
0,112_88,.mp3,999
1,112_198,.mp3,999
2,112_186,.mp3,999
3,112_87,.mp3,999
4,112_180,.mp3,999
5,112_uP_bY_mUSLEm_28,.mp3,999
6,112-Al-ikhlas_88,.mp3,999
7,112_uP_bY_mUSLEm.Ettounssi_38,.mp3,999
8,112_uP_bY_mUSLEm_18,.mp3,999
9,112_68,.mp3,999


#### Rename files 
After creating the dataframe with proposed renamed files then do the rename files in the dataset
then rename all the files to _tmp files in order not to overwrite files that already exist 


In [82]:
def rename_files_dataset(path, seqlen):
    # first list the files in the input dir and put the porposed name in the dataframe
    data = list_with_new_file_name_seq( path,seqlen)
    if not data:
        return
    
    # track the seq
    i = 1
    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        tmp_file_name = path+'/'+row['newfilename'] + '_tmp'
        orig_file_name =  path+'/'+row['file'] + row['ext']
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
        tmp_file_names.append(tmp_file_name)
        
    # now remove _tmp f    
    for temp_file in tmp_file_names:
        print('rename %s to %s'  % (temp_file, temp_file.rsplit('_tmp', -1)[0]) )
    return

##### splitt the dataset into training and test
here split the dataset into training and testing 
- path : path to the class
- train_path : path to the train dir
- test_path : path to the test dir
- train_files_count : is the number of files in each class to be moved to the training dir
- randomize : either to randomize the files in the class set or not (default is not to randomize)

the output will be copying the data into train and test directories

In [60]:
def split_into_train_test( path, train_path, test_path,train_files_count, randomize=True):
    # read the files in the origianl path ordered by there name
    data = list_files_with_labels( path,randomize)
    # copy the dir name of the class 
    # this only copy the current directory name (without the path)
    dir_name = ntpath.basename(path)
    
    # track the seq
    i = 1
    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        orig_file_name =  path+'/'+row['file'] + row['ext']
        if (index<train_files_count):
            tmp_file_name = train_path+'/'+row['file']  + row['ext']
        else :
            tmp_file_name = test_path+'/'+row['file']  + row['ext']
       
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
    return

In [61]:
train_path='/ds/dataset/quran/train'
test_path='/ds/dataset/quran/test'
split_into_train_test( path, train_path, test_path,10)

rename /ds/dataset/quran/999/112_81.mp3 to /ds/dataset/quran/train/112_81.mp3
rename /ds/dataset/quran/999/112_248.mp3 to /ds/dataset/quran/train/112_248.mp3
rename /ds/dataset/quran/999/112_83.mp3 to /ds/dataset/quran/train/112_83.mp3
rename /ds/dataset/quran/999/112_183.mp3 to /ds/dataset/quran/train/112_183.mp3
rename /ds/dataset/quran/999/112_185.mp3 to /ds/dataset/quran/train/112_185.mp3
rename /ds/dataset/quran/999/112-Al-ikhlas_88.mp3 to /ds/dataset/quran/train/112-Al-ikhlas_88.mp3
rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_48.mp3 to /ds/dataset/quran/train/112_uP_bY_mUSLEm_48.mp3
rename /ds/dataset/quran/999/112_186.mp3 to /ds/dataset/quran/train/112_186.mp3
rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_38.mp3 to /ds/dataset/quran/train/112_uP_bY_mUSLEm_38.mp3
rename /ds/dataset/quran/999/112_138.mp3 to /ds/dataset/quran/train/112_138.mp3
rename /ds/dataset/quran/999/112-Al-ikhlas_28.mp3 to /ds/dataset/quran/test/112-Al-ikhlas_28.mp3
rename /ds/dataset/quran/999/112_89.mp3 t

##### shuffle  test files

shuffle files in the testing dataset then rename all the files to _tmp files in order not to overwrite files that already exist

** this can be used to shuffle any directory but the main reason here is to make sure that the testing dataset is shuffled in order to avoid overfitting

In [62]:
def shuffle_test_dataset(path):
    # first list the files in the input dir and put the porposed name in the dataframe
    data = list_files_with_labels( path,randomize=True)
    # create a new column with the random file selected from the same dataset
    # so the files will be shuffled
    data['newfile'] = data["file"].transform(np.random.permutation)


    
    # keep the track of the new file names in a list in order to be used later
    tmp_file_names = []
    
    # first rename the files with _tmp in order not to overwrite files 
    # and keep the renamed files in the list 
    # then use the list to remove _tmp files
    for index, row in data.iterrows():
        tmp_file_name = path+'/'+row['newfile'] + row['ext'] + '_tmp'
        orig_file_name =  path+'/'+row['file'] + row['ext']
        print('rename %s to %s'  % (orig_file_name, tmp_file_name) )
        tmp_file_names.append(tmp_file_name)
        
    # now remove _tmp f    
    for temp_file in tmp_file_names:
        print('rename %s to %s'  % (temp_file, temp_file.rsplit('_tmp', -1)[0]) )
    return

In [63]:
shuffle_test_dataset(path)

rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_48.mp3 to /ds/dataset/quran/999/112_uP_bY_mUSLEm.Ettounssi_28.mp3_tmp
rename /ds/dataset/quran/999/112_81.mp3 to /ds/dataset/quran/999/112_148.mp3_tmp
rename /ds/dataset/quran/999/112_138.mp3 to /ds/dataset/quran/999/112_87.mp3_tmp
rename /ds/dataset/quran/999/112-Al-ikhlas_58.mp3 to /ds/dataset/quran/999/112-Al-ikhlas_58.mp3_tmp
rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_18.mp3 to /ds/dataset/quran/999/112_uP_bY_mUSLEm_38.mp3_tmp
rename /ds/dataset/quran/999/112_28.mp3 to /ds/dataset/quran/999/112_78.mp3_tmp
rename /ds/dataset/quran/999/112_38.mp3 to /ds/dataset/quran/999/112_86.mp3_tmp
rename /ds/dataset/quran/999/112-Al-ikhlas_88.mp3 to /ds/dataset/quran/999/112_68.mp3_tmp
rename /ds/dataset/quran/999/112_208.mp3 to /ds/dataset/quran/999/dataset.mp3_tmp
rename /ds/dataset/quran/999/112_86.mp3 to /ds/dataset/quran/999/112_208.mp3_tmp
rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_58.mp3 to /ds/dataset/quran/999/112_uP_bY_mUSLEm_8.mp3_tm

###### to be continued
from here we will continue by adding more features
-  generate a csv description file for test to be used kaggle 

##### Generate the files / lables csv file
finally generate the csv file that holds the correct mapping of the files and thier correct labels this file should be used as a benchmark against files submitted

In [64]:
def generate_test_csv(test_path, csv_file_name):
    dataset = list_files_with_labels( test_path,randomize=True)
    test_dataset['fullfilename'] = test_dataset['file'] + test_dataset['ext']
    header = ["fullfilename", "label"]
    csv_file= test_path+'/'+"csv_file_name"
    test_dataset.to_csv( csv_file, index=False, quoting=3 ,columns = header)
    return

In [39]:
test_path='/ds/dataset/quran/999'
generate_test_csv(test_path,'test.csv')

#### Exmple code
the following sample code could be used to generate train / test dataset from the dataset

-- you can copy it and paste it in the code 

In [85]:
# first of all define the location of the files
print('----Start----')
path='/ds/dataset/quran/999'
train_path='/ds/dataset/quran/train'
test_path='/ds/dataset/quran/test'


# then split into train and test
print('then split into train and test')
split_into_train_test( path, train_path, test_path,10)

#rename train/test dataset
print('rename train dataset')
rename_files_dataset(train_path , 4)
print('rename test dataset')
rename_files_dataset(test_path , 4)

# shuffle test dataset
print('shuffle test dataset')
shuffle_test_dataset(test_path)

#finally genearte csv file for test dataset
print('finally genearte csv file for test dataset')
generate_test_csv(test_path,'test.csv')

----Start----
then split into train and test
rename /ds/dataset/quran/999/112_189.mp3 to /ds/dataset/quran/train/112_189.mp3
rename /ds/dataset/quran/999/112_158.mp3 to /ds/dataset/quran/train/112_158.mp3
rename /ds/dataset/quran/999/112_228.mp3 to /ds/dataset/quran/train/112_228.mp3
rename /ds/dataset/quran/999/112_uP_bY_mUSLEm_58.mp3 to /ds/dataset/quran/train/112_uP_bY_mUSLEm_58.mp3
rename /ds/dataset/quran/999/112_38.mp3 to /ds/dataset/quran/train/112_38.mp3
rename /ds/dataset/quran/999/112_80.mp3 to /ds/dataset/quran/train/112_80.mp3
rename /ds/dataset/quran/999/112_187.mp3 to /ds/dataset/quran/train/112_187.mp3
rename /ds/dataset/quran/999/112_148.mp3 to /ds/dataset/quran/train/112_148.mp3
rename /ds/dataset/quran/999/112-Al-ikhlas_28.mp3 to /ds/dataset/quran/train/112-Al-ikhlas_28.mp3
rename /ds/dataset/quran/999/112-Al-ikhlas_8.mp3 to /ds/dataset/quran/train/112-Al-ikhlas_8.mp3
rename /ds/dataset/quran/999/112_84.mp3 to /ds/dataset/quran/test/112_84.mp3
rename /ds/dataset/quran

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().