In [1]:
cd /home/ec2-user/SageMaker/VIP-Dementia/audio_processing

/home/ec2-user/SageMaker/VIP-Dementia/audio_processing


In [3]:
import os
import pandas as pd
import librosa
import opensmile
from tqdm import tqdm

# Assuming the directory setup remains unchanged
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
RAW_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/raw')
PROCESSED_DATA_DIR = os.path.join(HOME_DIRECTORY, 'data/processed')
FEATURES_DIR = os.path.join(PROCESSED_DATA_DIR, 'features')
LABELS_DIR = os.path.join(PROCESSED_DATA_DIR, 'labels')

# Label mapping for binary classification
label_dict = {'CN': 0, 'AD': 1}

def get_features(audio_path, sample_rate=16000):
    """
    Extract features for the entire audio file at a specified sampling rate.

    :param audio_path: Path to the audio file.
    :param sample_rate: Desired sampling rate (in Hz).
    :return: DataFrame containing extracted features.
    """
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPS,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

    try:
        # Load the entire audio file at the specified sampling rate
        y, sr = librosa.load(audio_path, sr=sample_rate)
        # Extract features using OpenSMILE
        features_df = smile.process_signal(y, sr)
        return features_df
    except Exception as e:
        print(f"Error processing file {audio_path}: {e}")
        return pd.DataFrame()

def load_audio_data(sample_rate=16000):
    X, y = [], []
    partitions = ['ad', 'cn']
    total_ad_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'ad')) if name.endswith('.wav')])
    total_cn_files = len([name for name in os.listdir(os.path.join(RAW_DATA_DIR, 'audio', 'cn')) if name.endswith('.wav')])
    total_files = total_ad_files + total_cn_files
    processed_count = 0

    for partition in partitions:
        audio_dir = os.path.join(RAW_DATA_DIR, 'audio', partition)
        audio_files = [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.wav')]
        
        # Using tqdm to display progress
        with tqdm(total=len(audio_files), desc=f"Processing {partition.upper()} files") as pbar:
            for audio_path in audio_files:
                features_df = get_features(audio_path, sample_rate)
                if not features_df.empty:
                    X.append(features_df)
                    y.append(label_dict[partition.upper()])
                processed_count += 1
                pbar.update(1)

    print(f"\nFinished loading audio data. Successfully processed {processed_count} audio files out of {total_files} files.")
    return X, y

In [19]:
X, y = load_audio_data(sample_rate=16000) 

Processing AD files: 100%|██████████| 87/87 [05:18<00:00,  3.67s/it]
Processing CN files: 100%|██████████| 79/79 [03:41<00:00,  2.81s/it]


Finished loading audio data. Successfully processed 166 audio files out of 166 files.





In [21]:
from sklearn.model_selection import train_test_split


features_df = pd.concat(X, ignore_index=True)
labels_series = pd.Series(y)
print(f'\nAudio samples represented: {len(X)}')  # Total number of audio files processed
print(f'Numerical features extracted per sample: {features_df.shape[1]}')
print(f'Unique labels in dataset: {labels_series.unique()}')

# Display the first few rows of the features DataFrame to check the data
features_df.head()

# Assuming 'features_df' and 'labels_series' are already defined and contain all your data
X = features_df
y = labels_series

# First, split into a training and a temp set (combining validation and test) with an 80/20 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Then split the temp set equally into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Display sizes of the datasets
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_valid.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")



Audio samples represented: 166
Numerical features extracted per sample: 88
Unique labels in dataset: [1 0]
Training set size: 132
Validation set size: 17
Test set size: 17


In [23]:
import numpy as np

# Specify the directory where you want to save the files
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
save_dir = os.path.join(HOME_DIRECTORY, 'data/processed')

# Function to save datasets
def save_dataset(filename, dataset):
    path = os.path.join(save_dir, filename)
    np.save(path, dataset)
    print(f"Saved {filename} to {save_dir}")

# Saving each dataset
save_dataset('X_train.npy', X_train)
save_dataset('X_valid.npy', X_valid)
save_dataset('X_test.npy', X_test)
save_dataset('y_train.npy', y_train)
save_dataset('y_valid.npy', y_valid)
save_dataset('y_test.npy', y_test)


Saved X_train.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved X_valid.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved X_test.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_train.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_valid.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed
Saved y_test.npy to /home/ec2-user/SageMaker/VIP-Dementia/data/processed


In [5]:
import os
import numpy as np

# Specify the directory where you want to save the files
HOME_DIRECTORY = os.path.expanduser("~/SageMaker/VIP-Dementia")
save_dir = os.path.join(HOME_DIRECTORY, 'data/processed/16000')
SPLIT_DATA_DIR = save_dir

# Function to load datasets
def load_dataset(filename):
    path = os.path.join(SPLIT_DATA_DIR, filename)
    if os.path.exists(path):
        return np.load(path, allow_pickle=True)
    else:
        print(f"File {filename} not found in {SPLIT_DATA_DIR}. Please check the directory and try again.")
        return None

# Loading each dataset
X_train = load_dataset('X_train.npy')
X_valid = load_dataset('X_valid.npy')
X_test = load_dataset('X_test.npy')
y_train = load_dataset('y_train.npy')
y_valid = load_dataset('y_valid.npy')
y_test = load_dataset('y_test.npy')

# Verifying the shapes of the loaded datasets (if they were successfully loaded)
if X_train is not None and y_train is not None:
    print(f"Loaded X_train: {X_train.shape}, y_train: {y_train.shape}")
if X_valid is not None and y_valid is not None:
    print(f"Loaded X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
if X_test is not None and y_test is not None:
    print(f"Loaded X_test: {X_test.shape}, y_test: {y_test.shape}")


File X_train.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.
File X_valid.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.
File X_test.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.
File y_train.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.
File y_valid.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.
File y_test.npy not found in /home/ec2-user/SageMaker/VIP-Dementia/data/processed. Please check the directory and try again.


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classification_models = [
    KNeighborsClassifier(),#(3),
    SVC(kernel='linear'),#, C=0.025),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(),#max_depth=5),
    RandomForestClassifier(),#max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

scores = []
for model in classification_models:
    try:
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        model_name = type(model).__name__
        if isinstance(model, SVC) and model.kernel == 'rbf':
            model_name += ' RBF kernel'
        scores.append((model_name, f'{100*score:.2f}%'))
    except ValueError as e:
        print(f"Error training {type(model).__name__}: {str(e)}")




In [26]:
# Make it pretty
scores_df = pd.DataFrame(scores,columns=['Classifier','Accuracy Score'])
scores_df.sort_values(by='Accuracy Score',axis=0,ascending=False)

Unnamed: 0,Classifier,Accuracy Score
4,RandomForestClassifier,88.24%
5,AdaBoostClassifier,82.35%
6,GaussianNB,76.47%
1,SVC,70.59%
3,DecisionTreeClassifier,70.59%
2,SVC RBF kernel,52.94%
7,QuadraticDiscriminantAnalysis,52.94%
0,KNeighborsClassifier,35.29%


In [1]:
# Importing necessary libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Other imports for the confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns


def plot_confusion_matrix(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title(f'Confusion Matrix for {type(clf).__name__}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Example of plotting confusion matrix for RandomForestClassifier
plot_confusion_matrix(RandomForestClassifier().fit(X_train, y_train), X_test, y_test)

Matplotlib is building the font cache; this may take a moment.


NameError: name 'RandomForestClassifier' is not defined