## Step 1: Filtering data
Randomise the dataset and split it into a databank consisting of roughly 70% training data, 15% validation data, and 15% testing data

Importing relevant libraries

In [17]:
# Import the libraries used throughout the notebook
import math
import os
import random
import shutil
# NOTE: distutils was removed from the standard library in Python 3.12 (PEP 632)
from distutils.dir_util import copy_tree

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle

Specify the main (source) directory and the target directory of each set
- Change the target directories as needed

In [18]:
# Paths are built with os.path.join so they are valid whatever the platform's
# path separator is. The empty last component keeps the trailing separator
# that the previous string-built paths ended with.

# Specify the main directory (source folders that get copied into the sets)
main_directory = os.path.join(os.getcwd(), 'EATD-Corpus', '')

# Specify the directories to copy files to, one per set
target_trainset = os.path.join(os.getcwd(), 'data', 'training_set', '')
target_valiset = os.path.join(os.getcwd(), 'data', 'validation_set', '')
target_testset = os.path.join(os.getcwd(), 'data', 'testing_set', '')

Split data into 70-15-15 (Training-Validation-Testing)

In [19]:
# Each integer 1..ntotal stands for the position of one folder in the corpus
# directory; the shuffled order decides which set that folder is sent to.
ntotal = 162

# Fixed seed so that re-running this cell reproduces the same split. Without
# it every run reshuffles, and folders copied by an earlier run stay behind in
# their old set directory, so the same folder can end up in two sets.
SPLIT_SEED = 0

# NOTE: the name `list` shadows the built-in; it is kept because a later cell
# reads it.
list = [i for i in range(1, ntotal + 1)]
# A private Random instance keeps the global random state untouched.
random.Random(SPLIT_SEED).shuffle(list)

# Split data into 70% training, 15% validation and 15% testing
# (sizes are rounded up, so training receives whatever remains)
n_testdata = math.ceil(ntotal * 0.15)
n_validata = math.ceil(ntotal * 0.15)

# Slice the shuffled ids: validation first, testing next, the rest is training
vali_data = list[:n_validata]
test_data = list[n_validata:n_validata + n_testdata]
train_data = list[n_validata + n_testdata:]

In [30]:
# Sanity check of the split: shape of the training slice, then the training
# ids and the validation ids, one item per line.
print(np.shape(list[n_validata + n_testdata:]), train_data, vali_data, sep="\n")

(112,)
[52, 79, 59, 69, 141, 48, 73, 146, 56, 61, 63, 68, 88, 6, 58, 54, 89, 39, 134, 43, 103, 33, 51, 7, 129, 55, 137, 24, 130, 81, 125, 105, 71, 18, 16, 112, 120, 40, 11, 67, 10, 131, 126, 151, 132, 70, 27, 150, 53, 158, 86, 97, 119, 162, 117, 91, 135, 62, 100, 127, 157, 92, 83, 64, 115, 93, 8, 45, 85, 65, 49, 109, 102, 107, 113, 114, 149, 29, 87, 21, 123, 124, 77, 90, 22, 121, 38, 41, 108, 144, 66, 60, 26, 111, 31, 35, 23, 138, 1, 99, 14, 80, 37, 139, 44, 78, 42, 128, 152, 133, 122, 104]
[72, 76, 46, 145, 116, 15, 156, 106, 32, 94, 3, 30, 140, 84, 20, 34, 50, 13, 5, 153, 101, 36, 17, 136, 95]


Funnel data directories from EATD-Corpus to relevant folders 

In [20]:

print(main_directory)

# Set look-ups instead of scanning the id lists once per folder
vali_ids = set(vali_data)
test_ids = set(test_data)
split_roots = (target_trainset, target_valiset, target_testset)

# 1-based position of the current folder among the corpus folders; this is
# the id that the shuffled split lists refer to.
i = 1
# os.listdir returns entries in arbitrary order, so sort them to make the
# id -> folder mapping identical on every run and every machine.
for folder_name in sorted(os.listdir(main_directory)):
    # Joins folder name with the main directory
    folder_path = os.path.join(main_directory, folder_name)

    # Only folders are copied; other entries are skipped and do not use an id
    if not os.path.isdir(folder_path):
        continue

    # Pick the set this folder belongs to; anything that is neither in the
    # validation nor in the testing ids goes to training.
    if i in vali_ids:
        split_root = target_valiset
    elif i in test_ids:
        split_root = target_testset
    else:
        split_root = target_trainset
    target_folder = os.path.join(split_root, folder_name)

    # A copy left in another set by an earlier run would put the same folder
    # in two sets; report it instead of letting it go unnoticed.
    for other_root in split_roots:
        if other_root != split_root and os.path.isdir(os.path.join(other_root, folder_name)):
            print("Warning: " + folder_name + " already exists in " + other_root)

    # Copy all files from data folder to target folder. copytree creates the
    # target (and its parents) when missing; dirs_exist_ok lets a re-run
    # overwrite an existing copy instead of failing.
    shutil.copytree(folder_path, target_folder, dirs_exist_ok=True)
    i += 1

print("Segregation Complete!")

c:\Users\benny\Desktop\Y4S1\Deep_Speech_Technology\Project\EATD-Corpus\


Segregation Complete!


## Step 2: Compile dataset information in CSV

Specify directory of datasets and import relevant libraries

In [21]:
import os
import csv

# Paths are built with os.path.join so they are valid whatever the platform's
# path separator is. The empty last component keeps the trailing separator
# that the previous string-built paths ended with.

# Specify the main directory
main_data = os.path.join(os.getcwd(), 'data', '')

# Specify the directory of each set
target_trainset = os.path.join(os.getcwd(), 'data', 'training_set', '')
target_valiset = os.path.join(os.getcwd(), 'data', 'validation_set', '')
target_testset = os.path.join(os.getcwd(), 'data', 'testing_set', '')

# Label file (read from every data folder)
label_file = 'new_label.txt'
# Types of responses in each folder
responses = ['negative_out.wav', 'neutral_out.wav', 'positive_out.wav']

In [22]:
# Function to list the labelled audio files found in the folders of a directory
def list_files(directory, label_filename=None, response_names=None, threshold=53):
    """Collect ``(file name, label)`` pairs for every folder inside ``directory``.

    For each sub-folder the label file is read as a single number; the label
    is 1 when that number is at least ``threshold`` and 0 otherwise. One pair
    is produced per response file that exists in the folder; a missing
    response file is reported with ``print`` and skipped.

    Args:
        directory: Directory whose sub-folders are scanned. Entries that are
            not folders are ignored.
        label_filename: Name of the label file inside each folder. Defaults to
            the notebook-level ``label_file``.
        response_names: Names of the audio files looked up in each folder.
            Defaults to the notebook-level ``responses``.
        threshold: Lowest score that is labelled 1 (presumably the corpus'
            depression cut-off -- TODO confirm against the dataset description).

    Returns:
        A list of ``(folder + '\\' + response name, label)`` tuples, ordered
        by folder name and then by the order of ``response_names``.

    Raises:
        FileNotFoundError: If a folder has no label file.
        ValueError: If the label file content is not a number.
    """
    if label_filename is None:
        label_filename = label_file
    if response_names is None:
        response_names = responses

    folders = []
    # Sorted so the row order (and everything derived from it, such as the
    # seeded oversampling) is reproducible; os.listdir promises no ordering.
    for f in sorted(os.listdir(directory)):
        if not os.path.isdir(os.path.join(directory, f)):
            continue

        # Get label from label file
        label_input = os.path.join(directory, f, label_filename)
        if not os.path.isfile(label_input):
            raise FileNotFoundError("File not found: " + label_input)
        with open(label_input, 'r') as input_txtf:
            score = float(input_txtf.read().strip())
        label = 1 if score >= threshold else 0

        # Get all audio responses in the folder
        for response in response_names:
            if os.path.isfile(os.path.join(directory, f, response)):
                # The backslash separator is kept as-is: it is written to the
                # CSV files and whatever reads them is not visible here.
                folders.append((f + '\\' + response, label))
            else:
                print("Data file erased / Data file not found: " + os.path.join(directory, f, response))
    return folders

# Function to write (fname, label) pairs to a CSV file
def save_to_csv(fname_labs, csv_filename):
    """Write a header row followed by one ``fname,label`` row per pair.

    Args:
        fname_labs: Iterable of indexable items; element 0 is written to the
            ``fname`` column and element 1 to the ``label`` column.
        csv_filename: Path of the CSV file, created or overwritten.
    """
    with open(csv_filename, 'w', newline='') as out_handle:
        writer = csv.writer(out_handle)
        # Header first, then the data rows
        writer.writerow(['fname', 'label'])
        writer.writerows([pair[0], pair[1]] for pair in fname_labs)

In [23]:
# Main function: write one CSV of (fname, label) rows per set
if __name__ == "__main__":

    # Folder that receives the CSV files. The empty last component keeps a
    # trailing separator so the file name can be appended directly, and
    # os.path.join makes the path valid whatever the platform separator is.
    csv_path = os.path.join(os.getcwd(), 'data', '')

    # (set directory, CSV file name), processed in this order
    for split_dir, csv_name in (
        (target_trainset, 'training.csv'),
        (target_valiset, 'validation.csv'),
        (target_testset, 'testing.csv'),
    ):
        # Get the labelled files found in the folders of this set
        files_lab = list_files(split_dir)
        # Specify the CSV file name
        csv_file = csv_path + csv_name
        # Save the file names and labels to the CSV file
        save_to_csv(files_lab, csv_file)

Data file erased / Data file not found: c:\Users\benny\Desktop\Y4S1\Deep_Speech_Technology\Project\data\training_set\v_79\positive_out.wav
Data file erased / Data file not found: c:\Users\benny\Desktop\Y4S1\Deep_Speech_Technology\Project\data\validation_set\v_79\positive_out.wav


## Step 3: Oversample minority class in training

In [24]:
# Load data
# Every path derives from data_folder through os.path.join, so the cell works
# whatever the platform's path separator is.
data_folder = os.path.join(os.getcwd(), 'data')

# Directory of each set
train_path = os.path.join(data_folder, 'training_set')
vali_path = os.path.join(data_folder, 'validation_set')
test_path = os.path.join(data_folder, 'testing_set')

# Names of the entries currently present in each set directory
audio_train_folder = os.listdir(train_path)
audio_vali_folder = os.listdir(vali_path)
audio_test_folder = os.listdir(test_path)

# (fname, label) tables written in Step 2
traindf = pd.read_csv(os.path.join(data_folder, 'training.csv'))
testdf = pd.read_csv(os.path.join(data_folder, 'testing.csv'))

# Output path of the oversampled training table
train_opfp = os.path.join(data_folder, 'training_final.csv')

In [25]:
# Separate the label column from the remaining column(s) of the training table
targetcol = 'label'

fname = traindf.drop(columns=[targetcol])
label = traindf[targetcol]

# Class frequencies, computed once and reused below
class_counts = label.value_counts()
majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

# Rows of the most frequent and of the least frequent class
majority_data = traindf[label == majority_class]
minority_data = traindf[label == minority_class]

# The minority class is grown to the size of the majority class
resam_min_size = class_counts.max()

# Randomly duplicate minority rows until that size is reached
# (fixed random_state, so the same rows are drawn on every run)
oversampler = RandomOverSampler(
    sampling_strategy={minority_class: resam_min_size},
    random_state=0,
)
fname_resam, label_resam = oversampler.fit_resample(fname, label)

# Put the resampled file names and labels back into one table and save it
balanced_data = fname_resam[['fname']].assign(**{targetcol: label_resam})

balanced_data.to_csv(train_opfp, index=False)