In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip install openpyxl noisereduce

In [None]:
import os
from pathlib import Path

In [None]:
# Root of the dataset tree; replace the placeholder with the real location.
BASE_PATH = Path("YOUR_ROOT")

# Raw inputs: one audio directory and one label directory per condition.
Peripheral_Neuropathy_AUDIO = BASE_PATH.joinpath("Training", "audio_data", "TS01", "Peripheral_Neuropathy")
Peripheral_Neuropathy_LABEL = BASE_PATH.joinpath("Training", "label_data", "TL01", "Peripheral_Neuropathy")

Cerebral_Palsy_AUDIO = BASE_PATH.joinpath("Validation", "audio_data", "VS01", "Cerebral_Palsy_disease")
Cerebral_Palsy_LABEL = BASE_PATH.joinpath("Validation", "label_data", "VL01", "Cerebral_Palsy_disease")

Stroke_AUDIO = BASE_PATH.joinpath("Validation", "audio_data", "VS01", "Stroke")
Stroke_LABEL = BASE_PATH.joinpath("Validation", "label_data", "VL01", "Stroke")

# Destinations for the preprocessed audio, one directory per condition.
OUTPUT_BASE = BASE_PATH.joinpath("Preprocessed")
Peripheral_Neuropathy_OUTPUT = OUTPUT_BASE.joinpath("Peripheral_Neuropathy_dataset")
Cerebral_Palsy_OUTPUT = OUTPUT_BASE.joinpath("Cerebral_Palsy_dataset")
Stroke_OUTPUT = OUTPUT_BASE.joinpath("Stroke_dataset")

In [None]:
# Create every per-condition output directory (parents included); existing ones are left untouched.
for _output_dir in (Peripheral_Neuropathy_OUTPUT, Cerebral_Palsy_OUTPUT, Stroke_OUTPUT):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Echo where each condition's preprocessed audio will be written.
for _condition, _output_dir in (
    ("Peripheral_Neuropathy", Peripheral_Neuropathy_OUTPUT),
    ("Cerebral_Palsy", Cerebral_Palsy_OUTPUT),
    ("Stroke", Stroke_OUTPUT),
):
    print(f"{_condition} →", _output_dir)

In [None]:
# Print, per condition, how many directory entries the raw audio and label folders contain.
for _condition, _audio_dir, _label_dir in (
    ("Peripheral_Neuropathy", Peripheral_Neuropathy_AUDIO, Peripheral_Neuropathy_LABEL),
    ("Cerebral_Palsy", Cerebral_Palsy_AUDIO, Cerebral_Palsy_LABEL),
    ("Stroke", Stroke_AUDIO, Stroke_LABEL),
):
    print(f"[{_condition}]")
    print("  🎧 Audio_File_Count: ", len(os.listdir(_audio_dir)))
    print("  🏷️ Label_File_Count: ", len(os.listdir(_label_dir)))

In [None]:
!pip install librosa

In [None]:
from pydub.utils import which
# Show what pydub's `which` helper resolves for "ffmpeg".
# NOTE(review): presumably None means ffmpeg is not on PATH — confirm against pydub.
print("ffmpeg location:", which("ffmpeg"))

In [None]:
import pandas as pd
import numpy as np
from pydub.utils import db_to_float
import itertools
from pydub import AudioSegment

import IPython.display as ipd
from pydub import AudioSegment
import torch
import librosa
from IPython.display import Audio
import noisereduce as nr

In [None]:
# Raw audio data pre-processing

import os
import glob
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from pydub import AudioSegment
import noisereduce as nr
    
# VAD
def vad_segment_by_energy(audio_segment, frame_ms=500, silence_duration_sec=5, alpha=0.3):
    """Split audio into voiced spans using a per-frame RMS energy threshold.

    The audio is cut into consecutive frames of ``frame_ms``. A frame is "silent"
    when its RMS is below ``alpha`` times the mean RMS of all frames. A voiced span
    ends where a run of at least ``silence_duration_sec`` worth of silent frames
    begins, and the next span starts at the first non-silent frame after that run.
    Shorter silent stretches stay inside the surrounding span.

    Args:
        audio_segment: Object supporting ``len()``, slicing and a ``.rms`` attribute.
            Callers in this notebook pass a pydub ``AudioSegment``, whose length and
            slice offsets are in milliseconds.
        frame_ms: Frame length; must be a positive integer.
        silence_duration_sec: Minimum silence length, in seconds, that splits spans.
        alpha: Fraction of the mean frame RMS used as the silence threshold.

    Returns:
        List of ``(start, end)`` offsets in the same unit as ``len(audio_segment)``.
        Empty when the input is empty or no voiced span is found.

    Raises:
        ValueError: If ``frame_ms`` is not positive.
    """
    if frame_ms <= 0:
        raise ValueError("frame_ms must be a positive number of milliseconds")

    total_len = len(audio_segment)
    if total_len == 0:
        # Nothing to analyse; without this guard the mean below divides by zero.
        print("VAD Failed")
        return []

    energy_values = [
        audio_segment[offset:offset + frame_ms].rms
        for offset in range(0, total_len, frame_ms)
    ]

    mean_energy = sum(energy_values) / len(energy_values)
    threshold = mean_energy * alpha
    silence_flags = [rms < threshold for rms in energy_values]

    # Work in whole milliseconds: dividing seconds by a fractional frame duration
    # can land just below an integer (0.3 / 0.1 == 2.9999999999999996) and truncate.
    min_silence_frames = int(round(silence_duration_sec * 1000) // frame_ms)

    segments = []
    is_silent = False
    start = 0

    for idx, silent in enumerate(silence_flags):
        if not is_silent and silent:
            # Only a sufficiently long, uninterrupted silent run closes the span.
            silence_run = silence_flags[idx:idx + min_silence_frames]
            if len(silence_run) == min_silence_frames and all(silence_run):
                end = idx * frame_ms
                if end > start:
                    segments.append((start, end))
                is_silent = True
        elif is_silent and not silent:
            start = idx * frame_ms
            is_silent = False

    # Flush the span that is still open when the audio ends.
    if not is_silent and start < total_len:
        segments.append((start, total_len))

    if not segments:
        print("VAD Failed")

    return segments

# Preprocessing
def extract_vad_segments_for_all_files(Stroke_AUDIO, Stroke_OUTPUT, frame_ms=500, silence_duration_sec=5, alpha=0.3):
    """Run energy-based VAD over every ``*.wav`` in a folder and export the voiced segments.

    Input file names must look like ``ID-<2 digits>-<2 digits>-N-<person_code>.wav``.
    Segment ``i`` of such a file is written to ``output_PN_<person_code>_<i>.wav``.
    Files that do not match the pattern or cannot be decoded are reported and skipped.

    Args:
        Stroke_AUDIO: Directory containing the source ``.wav`` recordings.
        Stroke_OUTPUT: Directory receiving the exported segments (created if missing).
        frame_ms: Frame length passed to ``vad_segment_by_energy``.
        silence_duration_sec: Minimum splitting silence passed to ``vad_segment_by_energy``.
        alpha: Energy ratio passed to ``vad_segment_by_energy``.

    Returns:
        None. Progress and per-segment results are printed.
    """
    # Sorted so repeated runs visit the recordings in the same order.
    Stroke_audio_files = sorted(glob.glob(os.path.join(Stroke_AUDIO, "*.wav")))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(Stroke_OUTPUT, exist_ok=True)

    for Stroke_audio_file in tqdm(Stroke_audio_files, desc="Is VAD processing..."):
        Stroke_base_filename = os.path.basename(Stroke_audio_file)

        if not os.path.exists(Stroke_audio_file):
            print(f"File not exist: {Stroke_audio_file}")
            continue

        # Person_code extraction (ex: ID-02-26-N-AJH-01-01-F-45-SU.wav → AJH-01-01-F-45-SU)
        match = re.match(r"ID-\d{2}-\d{2}-N-(.+)\.wav", Stroke_base_filename)
        if not match:
            print(f"Regular expression matching failed: {Stroke_base_filename}")
            continue

        person_code = match.group(1)

        # A single undecodable file must not abort the whole batch.
        try:
            Stroke_audio = AudioSegment.from_file(Stroke_audio_file)
        except Exception as e:
            print(f"Fail pre-processing: {Stroke_base_filename} - {e}")
            continue

        # Split Silence/non-Silence part
        segments = vad_segment_by_energy(Stroke_audio, frame_ms, silence_duration_sec, alpha)

        # Save each voiced span as its own wav file.
        for i, (start, end) in enumerate(segments):
            segment = Stroke_audio[start:end]
            Stroke_output_filename = f"output_PN_{person_code}_{i}.wav"
            Stroke_output_path = os.path.join(Stroke_OUTPUT, Stroke_output_filename)
            segment.export(Stroke_output_path, format="wav")
            print(f"✅ File {Stroke_base_filename} - Segment {i}: {start / 1000:.2f}s ~ {end / 1000:.2f}s → {Stroke_output_filename}")

In [None]:
# First VAD pass over every Stroke recording: 500 ms frames, a silent run of at
# least 5 s splits segments, and frames below 0.3 x the mean RMS count as silent.
extract_vad_segments_for_all_files(
    Stroke_AUDIO=Stroke_AUDIO,
    Stroke_OUTPUT=Stroke_OUTPUT,
    frame_ms=500,
    silence_duration_sec=5,
    alpha=0.3,
)

In [None]:
# os.path.join with a single argument just returns the output directory as a string path.
SEGMENT = os.path.join(Stroke_OUTPUT)

# Counts every entry in the directory, not only .wav files.
print("Number of silence removal audio files: ", len(os.listdir(SEGMENT)))

In [None]:
# Label Data pre-processing

import re
import os
import pandas as pd

Stroke_df_list = []

# List and sort the label directory once instead of on every iteration, and skip
# non-JSON entries, which pd.read_json cannot parse.
Stroke_label_names = sorted(name for name in os.listdir(Stroke_LABEL) if name.endswith(".json"))

# Split each transcript into text segments and collect one DataFrame per label file.
for Stroke_label_name in Stroke_label_names:
    # Load Script
    Stroke_label_file = os.path.join(Stroke_LABEL, Stroke_label_name)

    Stroke_meta = pd.read_json(Stroke_label_file, orient='columns')
    # Only the first value of the 'Transcript' column is used.
    Stroke_transcript = str(Stroke_meta['Transcript'].iloc[0]).strip()

    # Delimiter priority: "/" first, then sentence punctuation, then whitespace.
    if "/" in Stroke_transcript:
        Stroke_pieces = Stroke_transcript.split("/")
    elif re.search(r"[\.?!]", Stroke_transcript):
        Stroke_pieces = re.split(r"[\.?!]", Stroke_transcript)
    else:
        Stroke_pieces = Stroke_transcript.split()

    # Drop empty pieces and surrounding whitespace.
    Stroke_segments = [s.strip() for s in Stroke_pieces if s.strip()]

    # Convert DataFrame and add to list
    if Stroke_segments:
        Stroke_df = pd.DataFrame(Stroke_segments, columns=['text'])
        Stroke_df_list.append(Stroke_df)

if Stroke_df_list:
    Stroke_text_df = pd.concat(Stroke_df_list, axis=0, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; keep an empty frame with the same column.
    print(f"No transcript segments found in: {Stroke_LABEL}")
    Stroke_text_df = pd.DataFrame(columns=['text'])

print(Stroke_text_df)

In [None]:
# Since there is a difference in the number of voice data and text data, the number of segments is estimated by comparing text and voice files.
# Then, post-processing is performed on files that have differences between predicted and actual values.

!pip install torch torchaudio

In [None]:
# Comparison of the number of differences between voice-labeling

import os
import json
import re
import pandas as pd

def get_text_segment_count(json_path):
    """Count the non-empty text segments in a label file's "Transcript" value.

    The transcript is split on "/" when it contains one, otherwise on the
    punctuation marks ".", "?" and "!" when any is present, otherwise on whitespace.

    Args:
        json_path: Path of a UTF-8 JSON file with a top-level "Transcript" key.

    Returns:
        Number of pieces that are non-empty after stripping whitespace.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        transcript = str(json.load(handle)["Transcript"]).strip()

    if "/" in transcript:
        pieces = transcript.split("/")
    elif re.search(r"[\.?!]", transcript):
        pieces = re.split(r"[\.?!]", transcript)
    else:
        pieces = transcript.split()

    return sum(1 for piece in pieces if piece.strip())

def count_audio_files(audio_folder, person_code):
    """Count the exported segment files that belong to exactly one person_code.

    A file counts only when its full name is ``output_PN_<person_code>_<digits>.wav``.
    A bare prefix test would also count files of a longer code that starts with
    this one (for example ``A_B`` when counting ``A``).

    Args:
        audio_folder: Directory containing the exported segments.
        person_code: Code embedded in the segment file names.

    Returns:
        Number of matching files.
    """
    prefix = f"output_PN_{person_code}_"
    extension = ".wav"

    count = 0
    for name in os.listdir(audio_folder):
        if not (name.startswith(prefix) and name.endswith(extension)):
            continue
        # What remains between prefix and extension must be the numeric segment index.
        index_part = name[len(prefix):len(name) - len(extension)]
        if index_part.isdecimal():
            count += 1
    return count

def analyze_by_count_only(audio_folder, label_folder, output_csv="review_targets_VAD_Stroke.csv", threshold=0):
    """Compare transcript segment counts with exported audio segment counts per label file.

    For every ``.json`` in ``label_folder`` (sorted by name) the person_code is taken
    from the file name, the transcript segments and matching audio files are counted,
    and the absolute difference is recorded. Any exception while handling a file is
    captured as a placeholder row (code "N/A", counts -1) carrying the error text.

    Args:
        audio_folder: Directory holding the exported segment wav files.
        label_folder: Directory holding the label JSON files.
        output_csv: Path of the CSV report to write.
        threshold: Rows whose gap exceeds this value are flagged for review.

    Returns:
        DataFrame with one row per label file; also written to ``output_csv``.
    """
    code_pattern = re.compile(r"ID-\d{2}-\d{2}-N-(.+)\.json")
    rows = []

    label_names = [name for name in sorted(os.listdir(label_folder)) if name.endswith(".json")]
    for label_name in label_names:
        label_path = os.path.join(label_folder, label_name)

        try:
            found = code_pattern.match(label_name)
            if found is None:
                raise ValueError("Fail extracting person_code")
            code = found.group(1)

            n_text = get_text_segment_count(label_path)
            n_audio = count_audio_files(audio_folder, code)
            difference = abs(n_text - n_audio)

            row = {
                "file": label_name,
                "person_code": code,
                "text_segments": n_text,
                "audio_files": n_audio,
                "gap": difference,
                "flag_for_review": difference > threshold
            }
        except Exception as e:
            # Keep the run going and make the failure visible in the report.
            row = {
                "file": label_name,
                "person_code": "N/A",
                "text_segments": -1,
                "audio_files": -1,
                "gap": -1,
                "flag_for_review": True,
                "error": str(e)
            }
        rows.append(row)

    report = pd.DataFrame(rows)
    report.to_csv(output_csv, index=False)
    print(f"\n SAVE: {output_csv}")
    return report

In [None]:
# Compare transcript segment counts with exported audio segment counts for the Stroke set.
AUDIO_FOLDER = Stroke_OUTPUT
LABEL_FOLDER = Stroke_LABEL

# threshold=0 flags every label file whose two counts differ at all.
result_df = analyze_by_count_only(AUDIO_FOLDER, LABEL_FOLDER, threshold=0)

In [None]:
import pandas as pd

def get_review_target_list(csv_path, save_list_path="review_targets_VAD_Stroke.txt"):
    """Extract the flagged person_codes from an analysis CSV and save them one per line.

    Rows whose ``person_code`` is empty or the placeholder ``"N/A"`` (written by the
    analysis step for files it could not process) are skipped, since they do not
    name a person.

    Args:
        csv_path: CSV with at least ``person_code`` and ``flag_for_review`` columns.
        save_list_path: Text file to write, one person_code per line.

    Returns:
        List of flagged person_code strings, in CSV order.
    """
    # keep_default_na=False keeps "N/A" as text instead of a float NaN, and the str
    # dtype keeps all-digit codes as text. Either would otherwise break `code + '\n'`.
    df = pd.read_csv(csv_path, dtype={"person_code": str}, keep_default_na=False)

    # Accept the flag whether pandas parsed it as bool or left it as text.
    is_flagged = df["flag_for_review"].astype(str).str.strip().str.lower() == "true"
    flagged_codes = df.loc[is_flagged, "person_code"].tolist()

    review_targets = [code for code in flagged_codes if code and code != "N/A"]
    skipped = len(flagged_codes) - len(review_targets)

    with open(save_list_path, 'w', encoding='utf-8') as f:
        for code in review_targets:
            f.write(code + '\n')

    print(f"🚩 person_code {len(review_targets)} saved: {save_list_path}")
    if skipped:
        print(f"Skipped {skipped} flagged rows without a usable person_code")
    return review_targets

In [None]:
# Collect the flagged person_codes from the first-pass report; the list is also
# written to the function's default text file.
targets = get_review_target_list("review_targets_VAD_Stroke.csv")

In [None]:
# Delete preprocessed audio files for file names that do not match 1:1
# Delete after checking for existence

import os

def delete_audio_files_by_person_code(audio_folder, review_list):
    """Delete the exported segment files of the given person_codes.

    Only files named exactly ``output_PN_<person_code>_<digits>.wav`` are removed.
    A bare prefix test would also delete segments of a longer code that starts
    with a listed one (for example ``A_B`` when ``A`` is listed). Duplicate codes
    in ``review_list`` are harmless.

    Args:
        audio_folder: Directory containing the exported segments.
        review_list: Iterable of person_codes whose segments should be removed.

    Returns:
        List of deleted file names, in directory-listing order.
    """
    deleted_files = []
    missing_files = []

    # A set gives constant-time lookup and collapses duplicate codes.
    target_codes = {str(code) for code in review_list}
    head, extension = "output_PN_", ".wav"

    for file in os.listdir(audio_folder):
        if not (file.startswith(head) and file.endswith(extension)):
            continue

        # Split "<person_code>_<index>" at the last underscore.
        code, separator, index_part = file[len(head):-len(extension)].rpartition("_")
        if not separator or not index_part.isdecimal() or code not in target_codes:
            continue

        file_path = os.path.join(audio_folder, file)
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # The file vanished between listing and removal.
            print(f"Fail Delete (File not exist): {file_path}")
            missing_files.append(file)
        else:
            deleted_files.append(file)

    print(f"Delete {len(deleted_files)} files.")
    if missing_files:
        print(f" Can't delete {len(missing_files)} files.")

    return deleted_files

In [None]:
# Remove the exported segments of every flagged person_code so they can be re-segmented.
deleted = delete_audio_files_by_person_code(AUDIO_FOLDER, targets)

In [None]:
# Second preprocessing for deleted audio files

import os
import glob
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from pydub import AudioSegment
import noisereduce as nr

# RMS based VAD
def vad_segment_by_energy(audio_segment, frame_ms=500, silence_duration_sec=5, alpha=0.3):
    """Split audio into voiced spans using a per-frame RMS energy threshold.

    The audio is cut into consecutive frames of ``frame_ms``. A frame is "silent"
    when its RMS is below ``alpha`` times the mean RMS of all frames. A voiced span
    ends where a run of at least ``silence_duration_sec`` worth of silent frames
    begins, and the next span starts at the first non-silent frame after that run.

    The defaults keep calls that rely on default arguments working once this cell
    redefines the function.

    Args:
        audio_segment: Object supporting ``len()``, slicing and a ``.rms`` attribute.
            Callers in this notebook pass a pydub ``AudioSegment``, whose length and
            slice offsets are in milliseconds.
        frame_ms: Frame length; must be a positive integer.
        silence_duration_sec: Minimum silence length, in seconds, that splits spans.
        alpha: Fraction of the mean frame RMS used as the silence threshold.

    Returns:
        List of ``(start, end)`` offsets in the same unit as ``len(audio_segment)``.
        Empty when the input is empty or no voiced span is found.

    Raises:
        ValueError: If ``frame_ms`` is not positive.
    """
    if frame_ms <= 0:
        raise ValueError("frame_ms must be a positive number of milliseconds")

    total_len = len(audio_segment)
    if total_len == 0:
        # Nothing to analyse; without this guard the mean below divides by zero.
        return []

    energy_values = [
        audio_segment[offset:offset + frame_ms].rms
        for offset in range(0, total_len, frame_ms)
    ]

    mean_energy = sum(energy_values) / len(energy_values)
    threshold = mean_energy * alpha
    silence_flags = [rms < threshold for rms in energy_values]

    # Work in whole milliseconds: dividing seconds by a fractional frame duration
    # can land just below an integer (0.3 / 0.1 == 2.9999999999999996) and truncate.
    min_silence_frames = int(round(silence_duration_sec * 1000) // frame_ms)

    segments = []
    is_silent = False
    start = 0

    for idx, silent in enumerate(silence_flags):
        if not is_silent and silent:
            # Only a sufficiently long, uninterrupted silent run closes the span.
            silence_run = silence_flags[idx:idx + min_silence_frames]
            if len(silence_run) == min_silence_frames and all(silence_run):
                end = idx * frame_ms
                if end > start:
                    segments.append((start, end))
                is_silent = True
        elif is_silent and not silent:
            start = idx * frame_ms
            is_silent = False

    # Flush the span that is still open when the audio ends.
    if not is_silent and start < total_len:
        segments.append((start, total_len))

    return segments

# Process only target person_code
def extract_vad_segments_by_review_targets(Stroke_audio_folder, Stroke_output_base_dir, review_target_codes, frame_ms, silence_duration_sec, alpha):
    """Re-run energy-based VAD for the recordings of selected person_codes only.

    Input file names must look like ``ID-<2 digits>-<2 digits>-N-<person_code>.wav``.
    Recordings whose person_code is not in ``review_target_codes`` are ignored.
    Segment ``i`` of a processed file is written to ``output_PN_<person_code>_<i>.wav``.

    Args:
        Stroke_audio_folder: Directory containing the source ``.wav`` recordings.
        Stroke_output_base_dir: Directory receiving the segments (created if missing).
        review_target_codes: Iterable of person_codes to process.
        frame_ms: Frame length passed to ``vad_segment_by_energy``.
        silence_duration_sec: Minimum splitting silence passed to ``vad_segment_by_energy``.
        alpha: Energy ratio passed to ``vad_segment_by_energy``.

    Returns:
        None. Progress and per-segment results are printed.
    """
    # Sorted so repeated runs visit the recordings in the same order.
    Stroke_audio_files = sorted(glob.glob(os.path.join(Stroke_audio_folder, "*.wav")))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(Stroke_output_base_dir, exist_ok=True)

    # Set lookup instead of scanning the whole list once per recording.
    wanted_codes = set(review_target_codes)

    for Stroke_audio_file in tqdm(Stroke_audio_files, desc="Preprocessing VAD file with segment gap"):
        Stroke_base_filename = os.path.basename(Stroke_audio_file)

        match = re.match(r"ID-\d{2}-\d{2}-N-(.+)\.wav", Stroke_base_filename)
        if not match:
            print(f"Regular expression matching failed: {Stroke_base_filename}")
            continue

        person_code = match.group(1)

        if person_code not in wanted_codes:
            continue

        # Load audio
        if not os.path.exists(Stroke_audio_file):
            print(f"File not exist: {Stroke_audio_file}")
            continue

        try:
            Stroke_audio = AudioSegment.from_file(Stroke_audio_file)
        except Exception as e:
            # A single undecodable file must not abort the whole batch.
            print(f"Fail pre-processing: {Stroke_base_filename} - {e}")
            continue

        # Split Silence/non-Silence part
        segments = vad_segment_by_energy(Stroke_audio, frame_ms, silence_duration_sec, alpha)

        if not segments:
            print(f"failed VAD: {Stroke_base_filename}")
            continue

        # Save each voiced span as its own wav file.
        for i, (start, end) in enumerate(segments):
            segment = Stroke_audio[start:end]
            Stroke_output_filename = f"output_PN_{person_code}_{i}.wav"
            Stroke_output_path = os.path.join(Stroke_output_base_dir, Stroke_output_filename)
            segment.export(Stroke_output_path, format="wav")
            print(f"✅ {Stroke_base_filename} → Segment {i}: {start / 1000:.2f}s ~ {end / 1000:.2f}s → {Stroke_output_filename}")

In [None]:
# Load the list of person_codes to be preprocessed

# Read back the flagged person_codes, one per line, with surrounding whitespace removed.
with open("review_targets_VAD_Stroke.txt", "r", encoding="utf-8") as f:
    review_target_codes = [raw_line.strip() for raw_line in f]

Stroke_audio_folder = Stroke_AUDIO
Stroke_output_base_dir = Stroke_OUTPUT

# Second pass over the flagged recordings only: 600 ms frames, a 3 s silence
# window and a 0.4 energy ratio.
extract_vad_segments_by_review_targets(
    Stroke_audio_folder=Stroke_audio_folder,
    Stroke_output_base_dir=Stroke_output_base_dir,
    review_target_codes=review_target_codes,
    frame_ms=600,
    silence_duration_sec=3,
    alpha=0.4,
)

In [None]:
# Comparison of the number of differences between voice-labeling

import os
import json
import re
import pandas as pd

def get_text_segment_count(json_path):
    """Count the non-empty text segments in a label file's "Transcript" value.

    The transcript is split on "/" when it contains one, otherwise on the
    punctuation marks ".", "?" and "!" when any is present, otherwise on whitespace.

    Args:
        json_path: Path of a UTF-8 JSON file with a top-level "Transcript" key.

    Returns:
        Number of pieces that are non-empty after stripping whitespace.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        transcript = str(json.load(handle)["Transcript"]).strip()

    if "/" in transcript:
        pieces = transcript.split("/")
    elif re.search(r"[\.?!]", transcript):
        pieces = re.split(r"[\.?!]", transcript)
    else:
        pieces = transcript.split()

    return sum(1 for piece in pieces if piece.strip())

def count_audio_files(audio_folder, person_code):
    """Count the exported segment files that belong to exactly one person_code.

    A file counts only when its full name is ``output_PN_<person_code>_<digits>.wav``.
    A bare prefix test would also count files of a longer code that starts with
    this one (for example ``A_B`` when counting ``A``).

    Args:
        audio_folder: Directory containing the exported segments.
        person_code: Code embedded in the segment file names.

    Returns:
        Number of matching files.
    """
    prefix = f"output_PN_{person_code}_"
    extension = ".wav"

    count = 0
    for name in os.listdir(audio_folder):
        if not (name.startswith(prefix) and name.endswith(extension)):
            continue
        # What remains between prefix and extension must be the numeric segment index.
        index_part = name[len(prefix):len(name) - len(extension)]
        if index_part.isdecimal():
            count += 1
    return count

def analyze_by_count_only(audio_folder, label_folder, output_csv="review_targets_VAD_Stroke_v2.csv", threshold=0):
    """Compare transcript segment counts with exported audio segment counts per label file.

    For every ``.json`` in ``label_folder`` (sorted by name) the person_code is taken
    from the file name, the transcript segments and matching audio files are counted,
    and the absolute difference is recorded. Any exception while handling a file is
    captured as a placeholder row (code "N/A", counts -1) carrying the error text.

    Args:
        audio_folder: Directory holding the exported segment wav files.
        label_folder: Directory holding the label JSON files.
        output_csv: Path of the CSV report to write.
        threshold: Rows whose gap exceeds this value are flagged for review.

    Returns:
        DataFrame with one row per label file; also written to ``output_csv``.
    """
    code_pattern = re.compile(r"ID-\d{2}-\d{2}-N-(.+)\.json")
    rows = []

    label_names = [name for name in sorted(os.listdir(label_folder)) if name.endswith(".json")]
    for label_name in label_names:
        label_path = os.path.join(label_folder, label_name)

        try:
            found = code_pattern.match(label_name)
            if found is None:
                raise ValueError("Failed to extract person_code from file name")
            code = found.group(1)

            n_text = get_text_segment_count(label_path)
            n_audio = count_audio_files(audio_folder, code)
            difference = abs(n_text - n_audio)

            row = {
                "file": label_name,
                "person_code": code,
                "text_segments": n_text,
                "audio_files": n_audio,
                "gap": difference,
                "flag_for_review": difference > threshold
            }
        except Exception as e:
            # Keep the run going and make the failure visible in the report.
            row = {
                "file": label_name,
                "person_code": "N/A",
                "text_segments": -1,
                "audio_files": -1,
                "gap": -1,
                "flag_for_review": True,
                "error": str(e)
            }
        rows.append(row)

    report = pd.DataFrame(rows)
    report.to_csv(output_csv, index=False)
    print(f"\n Save: {output_csv}")
    return report

In [None]:
# Repeat the count comparison after the second VAD pass.
AUDIO_FOLDER = Stroke_OUTPUT
LABEL_FOLDER = Stroke_LABEL

# threshold=0 flags every label file whose two counts still differ; the report goes
# to the default CSV path of whichever analyze_by_count_only definition is current.
result_df = analyze_by_count_only(AUDIO_FOLDER, LABEL_FOLDER, threshold=0)

In [None]:
# Preview the first rows that are still flagged for review.
result_df[result_df['flag_for_review'] == True].head()

In [None]:
import os
import re

def rename_audio_files_sequentially(Stroke_OUTPUT):
    """Renumber each person's segment files so their indices run 0, 1, 2, ... without gaps.

    Files named ``output_PN_<person_code>_<index>.wav`` are grouped by person_code,
    ordered by their numeric index, and renamed in that order to consecutive indices.
    Files already carrying the right name are left alone; other files are ignored.

    Args:
        Stroke_OUTPUT: Directory containing the segment files.

    Returns:
        None. Each rename is printed.
    """
    name_re = re.compile(r"output_PN_(.+)_(\d+)\.wav")

    # person_code -> list of (numeric index, current file name)
    grouped = {}
    for entry in os.listdir(Stroke_OUTPUT):
        found = name_re.match(entry) if entry.endswith(".wav") else None
        if found is None:
            continue
        grouped.setdefault(found.group(1), []).append((int(found.group(2)), entry))

    for code, indexed_names in grouped.items():
        # Ascending order means a target name has always been vacated before it is reused.
        for position, (_, current_name) in enumerate(sorted(indexed_names)):
            target_name = f"output_PN_{code}_{position}.wav"
            if current_name == target_name:
                continue
            print(f"Renaming: {current_name} → {target_name}")
            os.rename(
                os.path.join(Stroke_OUTPUT, current_name),
                os.path.join(Stroke_OUTPUT, target_name),
            )

# Close any index gaps in the Stroke segment file names, in place.
rename_audio_files_sequentially(Stroke_OUTPUT)

In [None]:
import re
import os
import pandas as pd

Stroke_text_df_list = []

# List and sort the label directory once instead of on every iteration, and skip
# non-JSON entries, which pd.read_json cannot parse.
Stroke_label_names = sorted(name for name in os.listdir(Stroke_LABEL) if name.endswith(".json"))

for Stroke_label_name in Stroke_label_names:
    Stroke_label_file = os.path.join(Stroke_LABEL, Stroke_label_name)

    Stroke_meta = pd.read_json(Stroke_label_file, orient='columns')
    # Only the first value of the 'Transcript' column is used.
    Stroke_transcript = str(Stroke_meta['Transcript'].iloc[0]).strip()

    # Delimiter priority: "/" first, then sentence punctuation, then whitespace.
    if "/" in Stroke_transcript:
        Stroke_pieces = Stroke_transcript.split("/")
    elif re.search(r"[\.?!]", Stroke_transcript):
        Stroke_pieces = re.split(r"[\.?!]", Stroke_transcript)
    else:
        Stroke_pieces = Stroke_transcript.split()

    # Drop empty pieces and surrounding whitespace.
    Stroke_segments = [s.strip() for s in Stroke_pieces if s.strip()]

    if Stroke_segments:
        Stroke_df = pd.DataFrame(Stroke_segments, columns=['text'])
        Stroke_text_df_list.append(Stroke_df)

if Stroke_text_df_list:
    Stroke_text_df = pd.concat(Stroke_text_df_list, axis=0, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; keep an empty frame with the same column.
    print(f"No transcript segments found in: {Stroke_LABEL}")
    Stroke_text_df = pd.DataFrame(columns=['text'])

print(Stroke_text_df)

In [None]:
# os.path.join with a single argument just returns the output directory as a string path.
SEGMENT = os.path.join(Stroke_OUTPUT)

# Counts every entry in the directory, not only .wav files.
print("Audio File Count: ", len(os.listdir(SEGMENT)))

In [None]:
# Create the output folder for the per-segment transcript text files.

TEXT_OUTPUT_BASE = BASE_PATH / "Text_Preprocessed"
Stroke_TEXT_OUTPUT = TEXT_OUTPUT_BASE / "Stroke_dataset"

# parents=True creates missing parent directories; exist_ok=True makes re-running the cell safe.
Stroke_TEXT_OUTPUT.mkdir(parents=True, exist_ok=True)

print("Stroke →", Stroke_TEXT_OUTPUT)

In [None]:
import re
import os
import pandas as pd

Stroke_text_df_list = []

# Build one (filename, text) row per transcript segment, visiting label files in name order.
for filename in sorted(os.listdir(Stroke_LABEL)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(Stroke_LABEL, filename)
    meta = pd.read_json(file_path, orient='columns')
    # Only the first value of the 'Transcript' column is used.
    transcript = str(meta['Transcript'].iloc[0]).strip()

    # Delimiter priority: "/" first, then sentence punctuation, then whitespace.
    if "/" in transcript:
        segments = transcript.split("/")
    elif re.search(r"[\.?!]", transcript):
        segments = re.split(r"[\.?!]", transcript)
    else:
        segments = transcript.split()

    segments = [s.strip() for s in segments if s.strip()]

    if segments:
        df = pd.DataFrame({
            'filename': [filename] * len(segments),
            'text': segments
        })
        Stroke_text_df_list.append(df)

if Stroke_text_df_list:
    Stroke_text_df = pd.concat(Stroke_text_df_list, axis=0, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; keep an empty frame with the same columns.
    print(f"No transcript segments found in: {Stroke_LABEL}")
    Stroke_text_df = pd.DataFrame(columns=['filename', 'text'])

In [None]:
# Running segment index per person_code: the Nth transcript segment of a person is
# written to output_PN_<person_code>_<N>.txt.
filename_counter = {}

for label_filename, segment_text in zip(Stroke_text_df['filename'], Stroke_text_df['text']):
    # Label file name without its extension, e.g. "ID-02-26-N-AJH-01-01-F-45-SU"
    full_name = os.path.splitext(label_filename)[0]

    match = re.match(r'ID-\d{2}-\d{2}-N-(.+)', full_name)
    if not match:
        print(f"⚠️ Code pattern does not match: {full_name}")
        continue
    person_code = match.group(1)  # ex: AJH-01-01-F-45-SU

    count = filename_counter.get(person_code, 0)
    filename_counter[person_code] = count + 1

    new_filename = f"output_PN_{person_code}_{count}.txt"
    output_path = Stroke_TEXT_OUTPUT / new_filename

    # Strip annotation and punctuation characters before saving the text.
    cleaned_text = re.sub(r"[+\*\(\)\?!,\.~\-']", "", segment_text)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

print("✅ TextData is Saved:", Stroke_TEXT_OUTPUT)