In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from pydub.utils import db_to_float
import itertools
from pydub import AudioSegment

import IPython.display as ipd
from pydub import AudioSegment
import torch
import librosa
from IPython.display import Audio
import noisereduce as nr

In [None]:
# Print the path that pydub resolves for the ffmpeg executable, as a quick
# environment check before any audio handling.
from pydub.utils import which
print("ffmpeg location:", which("ffmpeg"))

In [None]:
# Root of the dataset tree. "YOUR_ROOT" is a placeholder and must be replaced
# with the real dataset location before running the notebook.
BASE_PATH = Path("YOUR_ROOT")

# Raw audio / label directories, one pair per condition.
Peripheral_Neuropathy_AUDIO = BASE_PATH / "Training/audio_data/TS01/Peripheral_Neuropathy"
Peripheral_Neuropathy_LABEL = BASE_PATH / "Training/label_data/TL01/Peripheral_Neuropathy"

Cerebral_Palsy_AUDIO = BASE_PATH / "Validation/audio_data/VS01/Cerebral_Palsy_disease"
Cerebral_Palsy_LABEL = BASE_PATH / "Validation/label_data/VL01/Cerebral_Palsy_disease"

Stroke_AUDIO = BASE_PATH / "Validation/audio_data/VS01/Stroke"
Stroke_LABEL = BASE_PATH / "Validation/label_data/VL01/Stroke"

# Per-condition output directories under BASE_PATH / "Preprocessed".
# These are only path objects here; nothing in this cell creates them on disk.
OUTPUT_BASE = BASE_PATH / "Preprocessed"
Peripheral_Neuropathy_OUTPUT = OUTPUT_BASE / "Peripheral_Neuropathy_dataset"
Cerebral_Palsy_OUTPUT = OUTPUT_BASE / "Cerebral_Palsy_dataset"
Stroke_OUTPUT = OUTPUT_BASE / "Stroke_dataset"

In [None]:
# Echo the resolved per-condition output directories for a visual sanity check.
print("Peripheral_Neuropathy →", Peripheral_Neuropathy_OUTPUT)
print("Cerebral_Palsy →", Cerebral_Palsy_OUTPUT)
print("Stroke →", Stroke_OUTPUT)

In [None]:
# Directory for text-level preprocessing output of the Peripheral Neuropathy set.
TEXT_OUTPUT_BASE = BASE_PATH / "Text_Preprocessed"
Peripheral_Neuropathy_TEXT_OUTPUT = TEXT_OUTPUT_BASE / "Peripheral_Neuropathy_dataset"

# Create the directory (and any missing parents); a no-op when it already exists.
Peripheral_Neuropathy_TEXT_OUTPUT.mkdir(parents=True, exist_ok=True)

print("Peripheral_Neuropathy →", Peripheral_Neuropathy_TEXT_OUTPUT)

In [None]:
# Directory holding the fluency clips that the rest of this notebook analyses.
# NOTE(review): 'Fluency_Metirx' looks like a misspelling of "Metrix"/"Metrics", but it is
# the literal on-disk directory name, so renaming it here would point at a different folder.
Fluency_OUTPUT_BASE = BASE_PATH / 'Fluency_Metirx'
Peripheral_Neuropathy_Fluency = Fluency_OUTPUT_BASE / 'Peripheral_Neuropathy_Fluency'

# Create the directory (and any missing parents); a no-op when it already exists.
Peripheral_Neuropathy_Fluency.mkdir(parents=True, exist_ok=True)

In [None]:
from pydub.utils import mediainfo

# Load the metadata table; the path is relative to the notebook's working directory.
df = pd.read_csv("preprocessing.csv")

# 'Merged_File_ids' holds comma-separated file names: split each cell into a list,
# then explode so every file name gets its own row (other columns are repeated).
# Pieces are not whitespace-stripped here.
df['Merged_File_ids'] = df['Merged_File_ids'].str.split(',')
df = df.explode('Merged_File_ids').reset_index(drop=True)

df.head(10)

In [None]:
import os
import re
import pandas as pd

# Names (not full paths) of every .wav file directly inside the fluency directory.
Fluency_files = [f for f in os.listdir(Peripheral_Neuropathy_Fluency) if f.endswith(".wav")]

def extract_person_code(filename):
    """Return the speaker code embedded in a fluency clip file name.

    The name is expected to contain ``output_PN_<code>_<digits>.wav``; the
    ``<code>`` part is returned with surrounding whitespace removed so that it
    compares equal to the codes produced by the later redefinition of this
    helper in the notebook (which also strips).

    Args:
        filename: File name (not a path) of a fluency clip.

    Returns:
        The speaker code as a string, or ``None`` when the name does not match.
    """
    match = re.search(r'output_PN_(.+)_\d+\.wav', filename)
    return match.group(1).strip() if match else None

# Speaker code of every fluency clip whose name matched the expected pattern.
# One entry per clip, so a code appears once for each clip of that speaker.
person_codes_in_output = [
    code
    for code in map(extract_person_code, Fluency_files)
    if code is not None
]

def count_matches_per_file(row):
    """Count the fluency clips that belong to the speaker named in a metadata row.

    The speaker code is parsed from ``row['Merged_File_ids']`` (expected to
    contain ``ID-dd-dd-X-<code>.wav``) and looked up in the module-level
    ``person_codes_in_output`` list.

    Args:
        row: Mapping / Series with a ``'Merged_File_ids'`` entry.

    Returns:
        Number of entries in ``person_codes_in_output`` equal to the parsed
        code; ``0`` when the cell is not a string (e.g. NaN left behind by
        ``str.split`` + ``explode`` on a missing value) or does not match.
    """
    fname = row['Merged_File_ids']
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(fname, str):
        return 0

    match = re.search(r'ID-\d{2}-\d{2}-[A-Z]-(.+?)\.wav', fname)
    if match is None:
        return 0

    # Strip so the comparison agrees with how the clip-side codes are normalised.
    return person_codes_in_output.count(match.group(1).strip())

# Row-wise: number of fluency clips found for the speaker parsed from each row's file name.
df['File_Count'] = df.apply(count_matches_per_file, axis=1)

In [None]:
df[(df['Disease'] == 2) & (df['File_Count'] != 0)]

In [None]:
df[(df['Disease'] == 2) & (df['File_Count'] == 0)]

In [None]:
# Drop the Disease == 2 rows for which no fluency clip was found (File_Count == 0);
# rows with any other Disease value are kept regardless of File_Count.
df = df[~((df['Disease'] == 2) & (df['File_Count'] == 0))].reset_index(drop=True)

In [None]:
df[(df['Disease'] == 2) & (df['File_Count'] == 0)].head(10)

In [None]:
df[(df['Disease'] == 2)].head(10)

In [None]:
import os
import re
import pandas as pd

# Names (not full paths) of every .wav file directly inside the fluency directory.
# Sorted because os.listdir returns entries in arbitrary order, and this order
# decides the row order produced when 'File_ids' is exploded further down.
Fluency_files = sorted(f for f in os.listdir(Peripheral_Neuropathy_Fluency) if f.endswith(".wav"))

def extract_person_code(filename):
    """Parse the speaker code out of a fluency clip file name.

    Looks for ``output_PN_<code>_<digits>.wav`` anywhere in ``filename`` and
    returns ``<code>`` with leading/trailing whitespace removed, or ``None``
    when the pattern is not present.
    """
    found = re.search(r'output_PN_(.+)_\d+\.wav', filename)
    if found is None:
        return None
    return found.group(1).strip()

# Group the fluency clip file names by the speaker code parsed from each name.
# Clips whose name yields no (or an empty) code are left out.
person_code_to_files = {}
for f in Fluency_files:
    code = extract_person_code(f)
    if not code:
        continue
    if code not in person_code_to_files:
        person_code_to_files[code] = []
    person_code_to_files[code].append(f)

def get_matching_fluency_files(row):
    """Return the fluency clip names that belong to the speaker named in a row.

    The speaker code is parsed from ``row['Merged_File_ids']`` (expected to
    contain ``ID-dd-dd-X-<code>.wav``), stripped, and looked up in the
    module-level ``person_code_to_files`` mapping.

    Args:
        row: Mapping / Series with a ``'Merged_File_ids'`` entry.

    Returns:
        The list stored in ``person_code_to_files`` for that code (the same
        list object, not a copy), or an empty list when the cell is not a
        string (e.g. NaN from a missing value), does not match the pattern,
        or the code has no clips.
    """
    fname = row['Merged_File_ids']
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(fname, str):
        return []

    match = re.search(r'ID-\d{2}-\d{2}-[A-Z]-(.+?)\.wav', fname)
    if match is None:
        return []

    person_code = match.group(1).strip()
    return person_code_to_files.get(person_code, [])

# Row-wise: list of fluency clip names for the speaker parsed from each row (possibly empty).
df['File_ids'] = df.apply(get_matching_fluency_files, axis=1)

In [None]:
# One row per (metadata row, fluency clip) pair; rows whose list was empty keep a single
# row with a missing File_ids value.
df = df.explode('File_ids').reset_index(drop=True)

In [None]:
df[df['Disease'] == 2]

In [None]:
df[df['Disease'] == 2].count()

In [None]:
!pip install soundfile

In [None]:
## 'Duration', 'Syllable' extraction

import os
import librosa
import numpy as np
import pandas as pd

def analyze_syllable_rate_for_files(file_list, folder_path):
    """Sum clip durations and detected-onset counts over a set of audio files.

    Each file is loaded at its native sample rate. The "syllable" count of a
    file is the number of onsets reported by librosa's onset detector, i.e. an
    acoustic approximation rather than a linguistic syllable count.

    Args:
        file_list: Iterable of file names located in ``folder_path``.
        folder_path: Directory containing the files.

    Returns:
        ``(total_duration_seconds, total_onset_count)``. A file that fails at
        any step is reported on stdout and contributes nothing to either total.
    """
    duration_sum = 0
    onset_sum = 0

    for filename in file_list:
        wav_path = os.path.join(folder_path, filename)
        try:
            samples, rate = librosa.load(wav_path, sr=None)
            clip_seconds = librosa.get_duration(y=samples, sr=rate)
            envelope = librosa.onset.onset_strength(y=samples, sr=rate)
            onset_times = librosa.onset.onset_detect(onset_envelope=envelope, sr=rate, units='time')
            onset_count = len(onset_times)

            # Totals are only touched once every step for this file succeeded.
            duration_sum += clip_seconds
            onset_sum += onset_count
        except Exception as e:
            print(f"Error: {filename} - {e}")

    return duration_sum, onset_sum

In [None]:
def apply_full_syllable_analysis(row):
    """Row adapter: compute 'Duration' and 'Syllable' for the clip(s) in ``row['File_ids']``.

    ``File_ids`` may be a single file name (str) or a list of names; any other
    value (e.g. a missing value) is treated as "no files". With no files both
    outputs stay at 0.0; otherwise the totals returned by
    ``analyze_syllable_rate_for_files`` for the fluency directory are used.

    Returns:
        ``pd.Series`` with keys ``'Duration'`` and ``'Syllable'``.
    """
    raw_ids = row['File_ids']
    if isinstance(raw_ids, str):
        wav_names = [raw_ids]
    elif isinstance(raw_ids, list):
        wav_names = raw_ids
    else:
        wav_names = []

    duration, syllables = 0.0, 0.0
    if wav_names:
        duration, syllables = analyze_syllable_rate_for_files(wav_names, Peripheral_Neuropathy_Fluency)

    return pd.Series({'Duration': duration, 'Syllable': syllables})

# Work on a copy of the Disease == 2 rows so the row-wise analysis leaves df untouched until the end.
df_disease2 = df[df['Disease'] == 2].copy()

# Analyse each row's File_ids entry; the two returned values become new columns of the copy.
df_disease2[['Duration', 'Syllable']] = df_disease2.apply(apply_full_syllable_analysis, axis=1)

# Default every row of df to 0.0 first; rows outside Disease == 2 keep this value.
df[['Duration', 'Syllable']] = 0.0

# Copy the computed values back into df in place (DataFrame.update aligns on the index,
# which is why the earlier reset_index calls matter).
df.update(df_disease2)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
# 'Speak_Time' extraction

import librosa
import os
import numpy as np

def estimate_speaking_ratio(file_list, folder_path, vad_threshold_db=-30):
    """Return the total non-silent time, in seconds, across a set of audio files.

    Despite the name, no ratio is computed: the value returned is the summed
    length of the non-silent intervals found by ``librosa.effects.split``.
    (The previous version also accumulated the total clip duration but never
    used it; that dead computation has been removed.)

    Args:
        file_list: Iterable of file names located in ``folder_path``.
        folder_path: Directory containing the files.
        vad_threshold_db: Silence threshold in dB; its absolute value is passed
            to ``librosa.effects.split`` as ``top_db``.

    Returns:
        Total speaking time in seconds (``0`` for an empty ``file_list``). A
        file that fails to load or split is reported on stdout and skipped.
    """
    total_speaking_time = 0

    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        try:
            # sr=None keeps the file's native sample rate.
            y, sr = librosa.load(file_path, sr=None)

            # Each interval is a (start_sample, end_sample) pair of a non-silent stretch.
            intervals = librosa.effects.split(y, top_db=abs(vad_threshold_db))
            speaking_samples = sum((end - start) for start, end in intervals)
            total_speaking_time += speaking_samples / sr  # samples -> seconds
        except Exception as e:
            print(f"Error: {filename} - {e}")

    return total_speaking_time

In [None]:
def apply_speaking_ratio(row):
    """Row adapter: compute 'Speak_Time' for the clip(s) in ``row['File_ids']``.

    ``File_ids`` may be a single file name (str) or a list of names; any other
    value is treated as "no files" and yields 0.0. Otherwise the value returned
    by ``estimate_speaking_ratio`` for the fluency directory is used.

    Returns:
        ``pd.Series`` with the single key ``'Speak_Time'``.
    """
    raw_ids = row['File_ids']
    if isinstance(raw_ids, str):
        wav_names = [raw_ids]
    elif isinstance(raw_ids, list):
        wav_names = raw_ids
    else:
        wav_names = []

    speak_time = 0.0
    if wav_names:
        speak_time = estimate_speaking_ratio(wav_names, Peripheral_Neuropathy_Fluency)

    return pd.Series({'Speak_Time': speak_time})

# Work on a copy of the Disease == 2 rows so the row-wise analysis leaves df untouched until the end.
df_disease2 = df[df['Disease'] == 2].copy()

# Analyse each row's File_ids entry; the returned value becomes a new column of the copy.
df_disease2[['Speak_Time']] = df_disease2.apply(apply_speaking_ratio, axis=1)

# Default every row of df to 0.0 first; rows outside Disease == 2 keep this value.
df[['Speak_Time']] = 0.0

# Copy the computed values back into df in place (DataFrame.update aligns on the index).
df.update(df_disease2)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
# Pause_Time, Pause_Count extraction

import librosa
import os
import numpy as np

def estimate_pause_ratio(file_list, folder_path, pause_threshold_sec=0.5, top_db=30):
    """Return total pause time and pause count across a set of audio files.

    A pause is a gap between two consecutive non-silent intervals (as found by
    ``librosa.effects.split``) lasting at least ``pause_threshold_sec``.
    Silence before the first or after the last interval is not counted.
    Despite the name, no ratio is computed. (The previous version also
    accumulated the total clip duration but never used it; that dead
    computation has been removed.)

    Args:
        file_list: Iterable of file names located in ``folder_path``.
        folder_path: Directory containing the files.
        pause_threshold_sec: Minimum gap length, in seconds, to count as a pause.
        top_db: Silence threshold passed to ``librosa.effects.split``.

    Returns:
        ``(total_pause_seconds, pause_count)``. A file that fails to load or
        split is reported on stdout; pauses already counted for it are kept.
    """
    total_pause_time = 0.0
    total_pause_count = 0

    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        try:
            # sr=None keeps the file's native sample rate.
            y, sr = librosa.load(file_path, sr=None)

            intervals = librosa.effects.split(y, top_db=top_db)  # [[start, end], ...] in samples

            # Walk consecutive interval pairs and measure the silence between them.
            for previous, current in zip(intervals[:-1], intervals[1:]):
                gap_duration = (current[0] - previous[1]) / sr  # samples -> seconds

                if gap_duration >= pause_threshold_sec:
                    total_pause_time += gap_duration
                    total_pause_count += 1
        except Exception as e:
            print(f"Error: {filename} - {e}")

    return total_pause_time, total_pause_count

In [None]:
def apply_pause_ratio(row):
    """Row adapter: compute 'Pause_Time' and 'Pause_Count' for the clip(s) in ``row['File_ids']``.

    ``File_ids`` may be a single file name (str) or a list of names; any other
    value is treated as "no files" and yields 0.0 for both outputs. Otherwise
    the pair returned by ``estimate_pause_ratio`` for the fluency directory is used.

    Returns:
        ``pd.Series`` with keys ``'Pause_Time'`` and ``'Pause_Count'``.
    """
    raw_ids = row['File_ids']
    if isinstance(raw_ids, str):
        wav_names = [raw_ids]
    elif isinstance(raw_ids, list):
        wav_names = raw_ids
    else:
        wav_names = []

    pause_time, pause_count = 0.0, 0.0
    if wav_names:
        pause_time, pause_count = estimate_pause_ratio(wav_names, Peripheral_Neuropathy_Fluency)

    return pd.Series({'Pause_Time': pause_time, 'Pause_Count': pause_count})

# Work on a copy of the Disease == 2 rows so the row-wise analysis leaves df untouched until the end.
df_disease2 = df[df['Disease'] == 2].copy()

# Analyse each row's File_ids entry; the two returned values become new columns of the copy.
df_disease2[['Pause_Time', 'Pause_Count']] = df_disease2.apply(apply_pause_ratio, axis=1)

# Default every row of df to 0.0 first; rows outside Disease == 2 keep this value.
df[['Pause_Time', 'Pause_Count']] = 0.0

# Copy the computed values back into df in place (DataFrame.update aligns on the index).
df.update(df_disease2)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
# Speak_Count extraction

import librosa
import os
import numpy as np

def estimate_continuity(file_list, folder_path, top_db=30, pause_threshold_sec=0.5):
    """Return the number of continuous speech segments across a set of audio files.

    Within one file, non-silent intervals (from ``librosa.effects.split``) that
    are separated by less than ``pause_threshold_sec`` belong to the same
    segment; every gap of at least that length starts a new segment. A file
    with no non-silent interval contributes zero segments. (The previous
    version also accumulated the total speaking time but never used it; that
    dead computation has been removed.)

    Args:
        file_list: Iterable of file names located in ``folder_path``.
        folder_path: Directory containing the files.
        top_db: Silence threshold passed to ``librosa.effects.split``.
        pause_threshold_sec: Minimum gap length, in seconds, that separates segments.

    Returns:
        Total segment count over all files. A file that fails to load or split
        is reported on stdout and contributes nothing.
    """
    total_speech_segments = 0

    for filename in file_list:
        file_path = os.path.join(folder_path, filename)
        try:
            # sr=None keeps the file's native sample rate.
            y, sr = librosa.load(file_path, sr=None)

            intervals = librosa.effects.split(y, top_db=top_db)  # [[start, end], ...] in samples
            if len(intervals) == 0:
                continue

            # The first interval opens segment 1; each long enough gap opens another.
            segment_count = 1
            for previous, current in zip(intervals[:-1], intervals[1:]):
                gap = (current[0] - previous[1]) / sr  # samples -> seconds
                if gap >= pause_threshold_sec:
                    segment_count += 1

            total_speech_segments += segment_count
        except Exception as e:
            print(f"Error: {filename} - {e}")

    return total_speech_segments

In [None]:
def apply_continuity(row):
    """Row adapter: compute 'Speak_Count' for the clip(s) in ``row['File_ids']``.

    ``File_ids`` may be a single file name (str) or a list of names; any other
    value is treated as "no files" and yields 0.0. Otherwise the segment count
    returned by ``estimate_continuity`` for the fluency directory is used.

    Returns:
        ``pd.Series`` with the single key ``'Speak_Count'``.
    """
    raw_ids = row['File_ids']
    if isinstance(raw_ids, str):
        wav_names = [raw_ids]
    elif isinstance(raw_ids, list):
        wav_names = raw_ids
    else:
        wav_names = []

    speak_count = 0.0
    if wav_names:
        speak_count = estimate_continuity(wav_names, Peripheral_Neuropathy_Fluency)

    return pd.Series({'Speak_Count': speak_count})

# Work on a copy of the Disease == 2 rows so the row-wise analysis leaves df untouched until the end.
df_disease2 = df[df['Disease'] == 2].copy()

# Analyse each row's File_ids entry; the returned value becomes a new column of the copy.
df_disease2[['Speak_Count']] = df_disease2.apply(apply_continuity, axis=1)

# Default every row of df to 0.0 first; rows outside Disease == 2 keep this value.
df[['Speak_Count']] = 0.0

# Copy the computed values back into df in place (DataFrame.update aligns on the index).
df.update(df_disease2)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
!pip install librosa praat-parselmouth

In [None]:
import os
import pandas as pd
import parselmouth
from parselmouth.praat import call
import librosa
import numpy as np

def safe_get_mean(obj, method, *args):
    """Run a Praat command via ``call`` and return its value, or 0.0 on failure.

    Despite the name, this is used for any scalar Praat query in this notebook
    (means, jitter, shimmer), not only means.

    Args:
        obj: Object (or list of objects) the Praat command is applied to.
        method: Praat command name, e.g. ``"Get mean"``.
        *args: Positional arguments forwarded to the command.

    Returns:
        The command's result; ``0.0`` when the result is NaN or when the call
        (or the NaN check) raises an ordinary exception.
    """
    try:
        val = call(obj, method, *args)
        return 0.0 if np.isnan(val) else val
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit must
        # still propagate so a long feature-extraction run can be interrupted.
        return 0.0

def extract_core_acoustic_features(file_path):
    """Extract a fixed set of voice-quality and spectral features from one audio file.

    Praat (via parselmouth) provides mean pitch, local jitter, local shimmer,
    mean harmonics-to-noise ratio and the means of formants 1-3; librosa
    provides the per-coefficient mean of 13 MFCCs. Praat queries go through
    ``safe_get_mean``, so a failed or NaN query yields 0.0 for that feature.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        ``pd.Series`` keyed, in order, by ``Pitch_Mean``, ``Jitter_Local``,
        ``Shimmer_Local``, ``HNR``, ``formant1_mean``..``formant3_mean`` and
        ``mfcc_1``..``mfcc_13``.
    """
    sound = parselmouth.Sound(file_path)
    features = {}

    # Mean fundamental frequency over the whole clip (search range 50-600 Hz).
    pitch_track = sound.to_pitch(pitch_floor=50, pitch_ceiling=600)
    features['Pitch_Mean'] = safe_get_mean(pitch_track, "Get mean", 0, 0, "Hertz")

    # Glottal pulse train (75-500 Hz), shared by the jitter and shimmer queries.
    pulses = call(sound, "To PointProcess (periodic, cc)", 75, 500)
    features['Jitter_Local'] = safe_get_mean(
        pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3
    )
    features['Shimmer_Local'] = safe_get_mean(
        [sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6
    )

    # Mean harmonics-to-noise ratio.
    features['HNR'] = safe_get_mean(sound.to_harmonicity_cc(), "Get mean", 0, 0)

    # Mean frequency of the first three formants (Burg method).
    formants = sound.to_formant_burg(
        time_step=0.01,
        max_number_of_formants=5,
        maximum_formant=5500,
        window_length=0.025,
        pre_emphasis_from=50,
    )
    for number in (1, 2, 3):
        features[f'formant{number}_mean'] = safe_get_mean(
            formants, "Get mean", number, 0, 0, "Hertz"
        )

    # 13 MFCCs at the native sample rate, averaged over time.
    samples, rate = librosa.load(file_path, sr=None)
    coefficient_means = np.mean(
        librosa.feature.mfcc(y=samples, sr=rate, n_mfcc=13), axis=1
    )
    for position in range(13):
        features[f'mfcc_{position+1}'] = coefficient_means[position]

    return pd.Series(features)

In [None]:
# Names of the per-file acoustic feature columns, in the order the extractor emits them:
# four voice-quality measures, three formant means, thirteen MFCC means.
feature_columns = ['Pitch_Mean', 'Jitter_Local', 'Shimmer_Local', 'HNR']
feature_columns += [f'formant{k}_mean' for k in (1, 2, 3)]
feature_columns += [f'mfcc_{k}' for k in range(1, 14)]

# Start every feature at 0.0 for all rows; only selected rows are filled in afterwards.
for col in feature_columns:
    df[col] = 0.0

def apply_acoustic_features_by_file(row):
    """Row adapter: acoustic features for the single clip named in ``row['File_ids']``.

    When ``File_ids`` is a string naming an existing file inside the fluency
    directory, the features come from ``extract_core_acoustic_features``. In
    every other case (non-string value, or file not found) a Series of 0.0 for
    each name in ``feature_columns`` is returned.
    """
    file_name = row['File_ids']
    fpath = None
    if isinstance(file_name, str):
        fpath = os.path.join(Peripheral_Neuropathy_Fluency, file_name)

    if fpath is not None and os.path.isfile(fpath):
        return extract_core_acoustic_features(fpath)

    # Fallback keeps the same index as the extractor output so assignment lines up.
    return pd.Series(dict.fromkeys(feature_columns, 0.0))

# Fill the feature columns for the Disease == 2 rows only; other rows keep the 0.0 default.
# The row-wise apply returns one Series per row, so the result is a frame whose columns
# are the feature names; .loc assignment aligns it with df by index and column label.
mask = df['Disease'] == 2
df.loc[mask, feature_columns] = df.loc[mask].apply(
    apply_acoustic_features_by_file, axis=1
)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
from sklearn.decomposition import PCA

# Column groups that are each collapsed into a single summary score below.
mfcc_cols = [f"mfcc_{i}" for i in range(1, 14)]
formant_cols = ["formant1_mean", "formant2_mean", "formant3_mean"]

# One single-component PCA per group.
mfcc_pca = PCA(n_components=1)
formant_pca = PCA(n_components=1)

# NOTE(review): both PCAs are fit on every row of df. Rows outside Disease == 2 still hold
# the 0.0 defaults for these columns, so they pull the fitted component towards the origin;
# the columns are also not standardised first. Confirm that both are intended.
df["MFCC"] = mfcc_pca.fit_transform(df[mfcc_cols]).ravel()
df["Formant"] = formant_pca.fit_transform(df[formant_cols]).ravel()

# Share of each group's variance captured by its first component.
print("MFCC PC1 explained variance ratio:", mfcc_pca.explained_variance_ratio_[0])
print("Formant PC1 explained variance ratio:", formant_pca.explained_variance_ratio_[0])

In [None]:
# Explode each formant / MFCC column in turn, renumbering the index after every step.
# Cells that are not list-like are returned unchanged by explode, so this only has an
# effect if a feature cell holds a list-like value.
for _feature_col in (
    [f'formant{k}_mean' for k in (1, 2, 3)]
    + [f'mfcc_{k}' for k in range(1, 14)]
):
    df = df.explode(_feature_col).reset_index(drop=True)

In [None]:
# Explode the metadata columns one after another, renumbering the index after every step.
# Cells that are not list-like are returned unchanged by explode.
for _meta_col in ('Initial', 'Area', 'Merged_File_ids', 'File_Count', 'Duration'):
    df = df.explode(_meta_col).reset_index(drop=True)

In [None]:
df[df['Disease'] == 2].head(10)

In [None]:
# Write the whole table (every row of df, not only Disease == 2) to the working directory.
# 'utf-8-sig' prepends a byte-order mark to the file.
df.to_csv("Peripheral_Neuropathy_feature.csv", index=False, encoding='utf-8-sig')