# Preprocessing of Dataset
## Imports and Constants



In [21]:
import os
import pandas as pd
import re

# Directory containing the audio recordings, and a snapshot of its entries.
# AUDIO_FILES is mutated later: append_to_df removes each file it assigns.
AUDIO_DIR = "audio_files"
AUDIO_FILES = os.listdir(AUDIO_DIR)

# Form responses sorted by name; reset_index makes labels equal positions,
# so the indices stored in NAMES can be used with CSV_DF.iloc.
CSV_DF = (
    pd.read_csv("Heartbeat_Cusat - Form responses 1.csv")
    .sort_values(by='Name')
    .reset_index(drop=True)
)

DIGIT_PATTERN = re.compile(r"\d+")
# A single trailing underscore or hyphen left over after cutting at the digits.
CHAR_PATTERN = re.compile(r"_$|-$")

# Distinct name prefixes of the audio files: everything before the first run
# of digits, minus one trailing "_"/"-" and surrounding whitespace.
# Derived from the AUDIO_FILES snapshot (not a second directory listing) so
# both always describe the same set of files.
FILE_NAMES = list({
    CHAR_PATTERN.sub("", DIGIT_PATTERN.split(file)[0]).strip()
    for file in AUDIO_FILES
})
# (row index, name) pairs in CSV_DF order.
NAMES = list(CSV_DF['Name'].items())


# Output table, filled one row at a time by append_to_df.
result_df = pd.DataFrame(columns=['name', 'age', 'sex', 'chest_pain', 'bp', 'palpitations', 'other_disease', 'pulmonic', 'erbs', 'tricuspid'])



In [22]:
# Scratch check of the yes/no regex: the negative lookahead consumes nothing,
# so match() succeeds (with an empty match) for any text that does NOT start
# with "no" or "nil", and returns None for text that does.
pattern = re.compile(r'(?!(no|nil))', flags=re.IGNORECASE)

pattern.match('ot') is not None


True

## Find File Name 

In [23]:
def find_filename(name):
    """Return the single FILE_NAMES entry composed only of words from *name*.

    A candidate qualifies when deleting every (case-insensitive) occurrence
    of the words of *name*, and then every "-", "_" and space, leaves
    nothing behind.

    Args:
        name: Person name as a string of whitespace-separated words.

    Returns:
        The matching entry of FILE_NAMES when exactly one candidate
        qualifies; otherwise None, after appending the list of candidates
        to ``activity_log.txt``.
    """
    # split() without an argument drops the empty tokens that repeated or
    # trailing spaces produce; an empty "()" alternative would match first
    # at every position and keep later words from ever being removed.
    # Longest words go first so a short word that is a prefix of a longer
    # one cannot pre-empt it in the alternation.
    words = sorted(name.split(), key=len, reverse=True)
    # re.escape keeps punctuation in names (".", "(", "+") literal.
    pattern_str = "|".join(f"({re.escape(word)})" for word in words)

    name_pattern = re.compile(pattern_str, re.IGNORECASE)
    remove_pattern = re.compile(r"-|_| ")

    # Empty candidates are skipped: they would trivially "match" every name.
    files = [
        candidate
        for candidate in FILE_NAMES
        if candidate
        and remove_pattern.sub("", name_pattern.sub("", candidate)) == ""
    ]
    if len(files) == 1:
        return files[0]

    # Zero or several candidates: record them. No newline is written here,
    # so the next entry appended to the log continues on the same line.
    with open('activity_log.txt', 'a') as log:
        log.write(f"*****[ {files} ]*****")
    return None

## Find Files

In [24]:
def find_files(file_name):
    """Return the entries of AUDIO_FILES that contain *file_name*.

    The test is a literal, case-sensitive substring match, so characters
    that are special in regular expressions (".", "(", "+", ...) are
    compared as plain text instead of being interpreted as a pattern.

    Args:
        file_name: Name prefix to look for, e.g. a value returned by
            find_filename.

    Returns:
        A new list of matching file names, in AUDIO_FILES order.
    """
    # Same string coercion the original f-string pattern applied.
    needle = f"{file_name}"
    # NOTE(review): a substring test also accepts files whose name merely
    # contains this one (a shorter name inside a longer one) - confirm that
    # such collisions cannot occur in the data set.
    return [file for file in AUDIO_FILES if needle in file]

## Append to DF

In [25]:
def _encode_flag(pattern, value):
    """Return 1 or 0 for whether *pattern* matches at the start of *value*.

    Missing cells (NaN/None) yield None instead of raising, and surrounding
    whitespace is ignored so that e.g. " No" is still read as a "no".
    """
    if pd.isna(value):
        return None
    return int(bool(pattern.match(str(value).strip())))


def append_to_df(files, index):
    """Append one row to result_df for the CSV record at position *index*.

    The row holds the name and age as given and the sex, chest pain, BP,
    palpitations and other-disease answers encoded as 1/0 (None when the
    cell is empty). Each entry of *files* is then assigned to the
    'pulmonic', 'tricuspid' or 'erbs' column according to the letter that
    follows its number, and removed from AUDIO_FILES.

    Args:
        files: Audio file names belonging to this record.
        index: Position of the record in CSV_DF.

    Side effects:
        Mutates result_df and AUDIO_FILES; appends to ``activity_log.txt``
        when fewer than three files are given or a file matches none of
        the three letters.
    """
    row = CSV_DF.iloc[index]

    if (length := len(files)) < 3:
        with open('activity_log.txt', 'a') as log:
            log.write(f">>>> MISSING FILES (only {length})<<<<{row['Name']} at index {index}\n")

    # Succeeds for any text that does not start with "no"/"nil".
    y_n_pattern = re.compile(r'(?!(no|nil))', re.IGNORECASE)
    m_f_pattern = re.compile(r"^male", re.IGNORECASE)
    # Checked in this order; the first pattern that matches wins.
    site_patterns = (
        ('pulmonic', re.compile(r"\d+ *(_|-)* *p", re.IGNORECASE)),
        ('tricuspid', re.compile(r"\d+ *(_|-)* *t", re.IGNORECASE)),
        ('erbs', re.compile(r"\d+ *(_|-)* *e", re.IGNORECASE)),
    )

    crr_index = len(result_df)
    result_df.loc[crr_index] = [
        row['Name'],
        row['Age'],
        _encode_flag(m_f_pattern, row['Sex']),
        _encode_flag(y_n_pattern, row['Chest Pain']),
        _encode_flag(y_n_pattern, row['BP']),
        _encode_flag(y_n_pattern, row['Palpitations']),
        _encode_flag(y_n_pattern, row['other_disease']),
        None,
        None,
        None,
    ]

    for file in files:
        for column, site_pattern in site_patterns:
            if site_pattern.search(file):
                result_df.loc[crr_index, column] = f"{AUDIO_DIR}/{file}"
                # Consumed files are dropped so later records cannot claim them.
                AUDIO_FILES.remove(file)
                break
        else:
            with open('activity_log.txt', 'a') as log:
                log.write(f">>>> NO PET FILES<<<<{row['Name']} at index {index}\n")


## Main Function

In [27]:

if __name__ == "__main__":
    # Start a fresh log; everything afterwards is appended to it.
    with open('activity_log.txt', 'w') as log:
        log.write("__________LOSS__________\n\n")

    itr_count = 0
    for index, name in NAMES:
        itr_count += 1

        # Resolve the record's name to a single audio-file name prefix.
        file_name = find_filename(name)
        if not file_name:
            with open('activity_log.txt', 'a') as log:
                log.write(f"==> {name} at index {index}\n")
            continue

        # Collect the recordings carrying that prefix.
        files = find_files(file_name)
        if not files:
            with open('activity_log.txt', 'a') as log:
                log.write(f"==>NO FILES FOUND<== {name} at index {index}\n")
            continue

        append_to_df(files, index)

        


## Save Output

In [28]:

# Write the assembled table to disk, leaving out the DataFrame index column.
result_df.to_csv(path_or_buf='output.csv', index=False)
