In [5]:
cd /home/ec2-user/SageMaker/VIP-Dementia

/home/ec2-user/SageMaker/VIP-Dementia


In [6]:
pip install -r requirements.txt

Collecting librosa (from -r requirements.txt (line 5))
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting opensmile (from -r requirements.txt (line 8))
  Downloading opensmile-2.5.0-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audioread>=2.1.9 (from librosa->-r requirements.txt (line 5))
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa->-r requirements.txt (line 5))
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl.metadata (14 kB)
Collecting pooch>=1.0 (from librosa->-r requirements.txt (line 5))
  Downloading pooch-1.8.1-py3-none-any.whl.metadata (9.5 kB)
Collecting soxr>=0.3.2 (from librosa->-r requirements.txt (line 5))
  Downloading soxr-0.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting audobject>=0.6.1 (from opensmile->-r requirements.txt (line 8))
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Coll

In [1]:
cd /home/ec2-user/SageMaker/VIP-Dementia/audio_processing

/home/ec2-user/SageMaker/VIP-Dementia/audio_processing


In [2]:
import os

# Assuming the directory setup remains unchanged
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
RAW_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/raw')

# Calculate total number of .wav files in each partition (AD and CN)
total_ad_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'ad')) if name.endswith('.wav')])
total_cn_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'cn')) if name.endswith('.wav')])

print("Total AD files:", total_ad_files)
print("Total CN files:", total_cn_files)
print("Total files:", total_ad_files + total_cn_files)

Total AD files: 87
Total CN files: 79
Total files: 166


In [11]:
import os
import pandas as pd
import librosa
import opensmile
from tqdm import tqdm

# Assuming the directory setup remains unchanged
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
RAW_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/raw')
PROCESSED_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/processed')
FEATURES_DIR = os.path.join(PROCESSED_DATA_DIR, 'features')
LABELS_DIR = os.path.join(PROCESSED_DATA_DIR, 'labels')

# Label mapping for binary classification
label_dict = {'CN': 0, 'AD': 1}

def get_features(audio_path):
    """
    Extract features for the entire audio file.

    :param audio_path: Path to the audio file.
    :return: DataFrame containing extracted features.
    """
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

    try:
        # Load the entire audio file
        y, sr = librosa.load(audio_path, sr=None)
        # Extract features using OpenSMILE
        features_df = smile.process_signal(y, sr)
        return features_df
    except Exception as e:
        print(f"Error processing file {audio_path}: {e}")
        return pd.DataFrame()

def load_audio_data():
    X, y = [], []
    partitions = ['ad', 'cn']
    total_ad_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'ad')) if name.endswith('.wav')])
    total_cn_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'cn')) if name.endswith('.wav')])
    total_files = total_ad_files + total_cn_files
    processed_count = 0

    for partition in partitions:
        audio_dir = os.path.join(RAW_DATA_DIR, 'audio', partition)
        audio_files = [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')]
        
        # Using tqdm to display progress
        with tqdm(total=len(audio_files), desc=f"Processing {partition.upper()} files") as pbar:
            for audio_path in audio_files:
                features_df = get_features(audio_path)
                if not features_df.empty:
                    X.append(features_df)
                    y.append(label_dict[partition.upper()])
                processed_count += 1
                pbar.update(1)

    print(f"\nFinished loading audio data. Successfully processed {processed_count} audio files out of {total_files} files.")
    return X, y

In [12]:
X, y = load_audio_data()

Processing AD files: 100%|██████████| 87/87 [13:30<00:00,  9.32s/it]
Processing CN files: 100%|██████████| 79/79 [08:50<00:00,  6.71s/it]


Finished loading audio data. Successfully processed 166 audio files out of 166 files.





In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

features_df = pd.concat(X, ignore_index=True)
labels_series = pd.Series(y)
print(f'\nAudio samples represented: {len(X)}')  # Total number of audio files processed
print(f'Numerical features extracted per sample: {features_df.shape[1]}')
print(f'Unique labels in dataset: {labels_series.unique()}')

# Display the first few rows of the features DataFrame to check the data
features_df.head()

# Assuming 'features_df' and 'labels_series' are already defined and contain all your data
X = features_df
y = labels_series

# First, split into a training and a temp set (combining validation and test) with an 80/20 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Then split the temp set equally into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Display sizes of the datasets
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_valid.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Specify the directory where you want to save the files
save_dir = PROCESSED_DATA_DIR

# Function to save datasets
def save_dataset(filename, dataset):
    path = os.path.join(save_dir, filename)
    np.save(path, dataset)
    print(f"Saved {filename} to {save_dir}")

# Saving each dataset
save_dataset('X_train.npy', X_train)
save_dataset('X_valid.npy', X_valid)
save_dataset('X_test.npy', X_test)
save_dataset('y_train.npy', y_train)
save_dataset('y_valid.npy', y_valid)
save_dataset('y_test.npy', y_test)



Audio samples represented: 166
Numerical features extracted per sample: 88
Unique labels in dataset: [1 0]
Training set size: 132
Validation set size: 17
Test set size: 17
Saved X_train.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved X_valid.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved X_test.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_train.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_valid.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_test.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed


In [14]:
import os
import numpy as np

# Correct directory path setup
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
SPLIT_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/processed')

# Function to load datasets
def load_dataset(filename):
    path = os.path.join(SPLIT_DATA_DIR, filename)
    if os.path.exists(path):
        return np.load(path, allow_pickle=True)
    else:
        print(f"File {filename} not found in {SPLIT_DATA_DIR}. Please check the directory and try again.")
        return None

# Loading each dataset
X_train = load_dataset('X_train.npy')
X_valid = load_dataset('X_valid.npy')
X_test = load_dataset('X_test.npy')
y_train = load_dataset('y_train.npy')
y_valid = load_dataset('y_valid.npy')
y_test = load_dataset('y_test.npy')

# Verifying the shapes of the loaded datasets (if they were successfully loaded)
if X_train is not None and y_train is not None:
    print(f"Loaded X_train: {X_train.shape}, y_train: {y_train.shape}")
if X_valid is not None and y_valid is not None:
    print(f"Loaded X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
if X_test is not None and y_test is not None:
    print(f"Loaded X_test: {X_test.shape}, y_test: {y_test.shape}")


Loaded X_train: (132, 88), y_train: (132,)
Loaded X_valid: (17, 88), y_valid: (17,)
Loaded X_test: (17, 88), y_test: (17,)


In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classification_models = [
    KNeighborsClassifier(),#(3),
    SVC(kernel='linear'),#, C=0.025),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(),#max_depth=5),
    RandomForestClassifier(),#max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

scores = []
for model in classification_models:
    try:
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        model_name = type(model).__name__
        if isinstance(model, SVC) and model.kernel == 'rbf':
            model_name += ' RBF kernel'
        scores.append((model_name, f'{100*score:.2f}%'))
    except ValueError as e:
        print(f"Error training {type(model).__name__}: {str(e)}")




In [16]:
# Make it pretty
scores_df = pd.DataFrame(scores,columns=['Classifier','Accuracy Score'])
scores_df.sort_values(by='Accuracy Score',axis=0,ascending=False)

Unnamed: 0,Classifier,Accuracy Score
3,DecisionTreeClassifier,70.59%
4,RandomForestClassifier,70.59%
5,AdaBoostClassifier,70.59%
7,QuadraticDiscriminantAnalysis,70.59%
1,SVC,64.71%
6,GaussianNB,64.71%
2,SVC RBF kernel,52.94%
0,KNeighborsClassifier,47.06%
