# Preprocessing - LEVEL 2
## Imports and Constants

In [74]:
import os
import librosa
import soundfile as sf
import pandas as pd
import uuid
import shutil

# Directory that receives the sliced audio clips.
DATA_DIR = 'heart_audio'
# Directory meant to hold original recordings after processing
# (the move step in process_audio is currently commented out).
BACKUP_DIR = 'backup'

# Source metadata table. Missing cells are replaced by -1 so that absent
# audio paths can be detected later with a plain `!= -1` comparison.
CSV_DF = pd.read_csv("output.csv").fillna(-1)
SLICE_LENGTH = 5  # seconds

# Output dataset, filled one row per audio clip by append_to_df.
DATASET_DF = pd.DataFrame(columns=['age', 'sex', 'audio', 'chest_pain', 'bp', 'palpitations', 'other_disease'])

# Create the working directories. exist_ok=True makes the call idempotent and
# avoids the check-then-create race of `if not exists: mkdir`.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(BACKUP_DIR, exist_ok=True)

## Process Audio Files 
Slice each audio file into clips of `SLICE_LENGTH` seconds, store the clips in `DATA_DIR`, and return their file paths.

In [75]:
def process_audio(audio_files):
    """Cut every audio file into consecutive clips and save them in DATA_DIR.

    Each input file is loaded with ``librosa.load`` and split into windows of
    ``SLICE_LENGTH`` seconds. The last window of a file holds whatever samples
    remain, so it may be shorter than the others. Every clip is written as a
    ``.wav`` file with a random UUID name.

    Args:
        audio_files: iterable of paths to the audio files to slice.

    Returns:
        List of the clip paths that were written, in creation order.

    Raises:
        FileNotFoundError: if a clip is missing on disk right after writing.
    """
    # NOTE(review): librosa.load is called with its default arguments, so the
    # sample rate used below is whatever librosa returns -- confirm that any
    # default resampling is intended.
    clip_paths = []
    for source_path in audio_files:
        samples, sample_rate = librosa.load(source_path)
        samples_per_clip = sample_rate * SLICE_LENGTH
        for start in range(0, len(samples), samples_per_clip):
            # Random UUID keeps clip names unique across all input files.
            clip_path = f"{DATA_DIR}/{uuid.uuid4()}.wav"
            sf.write(clip_path, samples[start:start + samples_per_clip], sample_rate)
            # Only report clips that really exist on disk.
            if not os.path.exists(clip_path):
                raise FileNotFoundError(f"File <{clip_path}> not exists")
            clip_paths.append(clip_path)
        # The original recording stays where it is; moving it to BACKUP_DIR
        # is currently disabled.
    return clip_paths

## Create DF
Create the dataset dataframe by appending the file names returned by `process_audio` along with the other data from each row.

In [76]:
def append_to_df(row):
    """Add one DATASET_DF row per audio clip produced from a metadata record.

    The record's 'pulmonic', 'erbs' and 'tricuspid' entries are treated as
    audio file paths; entries equal to -1 (the fill value for missing cells)
    are skipped. The remaining files are sliced by ``process_audio`` and each
    resulting clip path is stored together with the record's 'age', 'sex',
    'chest_pain', 'bp', 'palpitations' and 'other_disease' values.

    Args:
        row: mapping-like record (e.g. a pandas Series) indexed by column name.

    Returns:
        None. DATASET_DF is extended in place.
    """
    recordings = [
        path
        for path in (row['pulmonic'], row['erbs'], row['tricuspid'])
        if path != -1
    ]
    if not recordings:
        # No audio attached to this record: nothing to add.
        return

    before_audio = ('age', 'sex')
    after_audio = ('chest_pain', 'bp', 'palpitations', 'other_disease')
    for clip_path in process_audio(recordings):
        # Value order must match the DATASET_DF column order.
        DATASET_DF.loc[len(DATASET_DF)] = [
            *(row[key] for key in before_audio),
            clip_path,
            *(row[key] for key in after_audio),
        ]

## Main

In [77]:
if __name__ == "__main__":
    # iterrows() yields (index, Series) pairs; only the Series is needed.
    for _, record in CSV_DF.iterrows():
        append_to_df(record)
    # Persist the assembled dataset without the positional index column.
    DATASET_DF.to_csv('heart_dataset.csv', index=False)
    print("<<<<<< Dataset Saved Successfully >>>>> ")


<<<<<< Dataset Saved Successfully >>>>> 
