# NeuroVocal AI: Vocal Biomarker Classifier (Training Pipeline)

This notebook provides a complete, end-to-end pipeline for training the ensemble model used in the NeuroVocal AI web application. It trains two separate models directly from `.wav` files:

1.  **CRNN Model**: A Convolutional Recurrent Neural Network trained on **Mel Spectrograms**.
2.  **Random Forest Model**: A classic machine learning model trained on a **vector of summary statistics** (jitter, shimmer, MFCC means, etc.).

This script automatically scans a directory of audio files, extracts both feature sets, trains the models, and saves all necessary components for the Flask backend (`app.py`).

## 1. Imports and Configuration

First, we import all the necessary libraries and set up the configuration for our project. **You only need to update the `DATA_DIR` variable to point to your main dataset folder.**

In [4]:
import os
import numpy as np
import pandas as pd
import librosa
import parselmouth
from parselmouth.praat import call
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Dropout, BatchNormalization, Reshape
from tensorflow.keras.utils import to_categorical
import warnings

warnings.filterwarnings('ignore')

# --- Configuration ---
class Config:
    """Central settings for feature extraction and model training.

    All values are class attributes; the module-level ``config`` instance
    created below is what the rest of the notebook reads.
    """

    # --- USER ACTION REQUIRED: Update this path ---
    # Main directory with subfolders for each class (e.g., 'healthy', 'parkinsons').
    # Setting the NEUROVOCAL_DATA_DIR environment variable overrides the
    # default below, so the notebook can run on another machine unedited.
    DATA_DIR = os.environ.get(
        'NEUROVOCAL_DATA_DIR',
        r'C:\Users\yajna\Downloads\fiftydataset',
    )

    # Audio & Feature Parameters
    SAMPLE_RATE = 44100  # Rate passed to librosa.load
    N_MELS_CRNN = 128  # Number of Mel bands in the CRNN spectrogram
    MAX_PAD_LEN_CRNN = 250  # Max length for spectrograms
    N_MFCC_RF = 13  # Number of MFCCs for the Random Forest features

    # Model & Training parameters
    TEST_SIZE = 0.2  # Fraction of samples held out by train_test_split
    RANDOM_STATE = 42  # Seed shared by the split, SMOTE and the Random Forest
    N_ESTIMATORS_RF = 100
    EPOCHS_CRNN = 50
    BATCH_SIZE_CRNN = 32


config = Config()

## 2. Dual Feature Extraction

We define two separate feature extraction functions, mirroring the `feature_extractor_advanced.py` file used by the Flask application.

- `extract_features_for_rf`: Creates a single row of summary statistics for the Random Forest model.
- `extract_spectrogram_for_crnn`: Creates a 2D Mel Spectrogram image for the CRNN model.

In [5]:
def extract_features_for_rf(wav_path):
    """Build the summary-statistics feature row used by the Random Forest.

    The result maps feature name to a scalar, inserted in this order: local
    jitter and shimmer, mean harmonicity, F0 mean and standard deviation,
    mean intensity, per-coefficient MFCC mean/std, spectral centroid and
    spectral rolloff. NaN and infinite values are replaced by 0.

    Returns None (after printing the error) if any extraction step raises.
    """
    feats = {}
    try:
        signal, rate = librosa.load(wav_path, sr=config.SAMPLE_RATE)
        snd = parselmouth.Sound(wav_path)

        # Perturbation measures derived from Praat's periodic point process
        pulses = call(snd, "To PointProcess (periodic, cc)", 75, 500)
        feats['jitter_local'] = call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
        feats['shimmer_local'] = call([snd, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # Mean of the harmonicity track
        hnr_track = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        feats['hnr'] = call(hnr_track, "Get mean", 0, 0)

        # Pitch and intensity statistics
        f0_track = snd.to_pitch(None, 75, 600)
        loudness = snd.to_intensity()
        feats['mean_f0'] = call(f0_track, "Get mean", 0, 0, "Hertz")
        feats['std_dev_f0'] = call(f0_track, "Get standard deviation", 0, 0, "Hertz")
        feats['mean_intensity'] = call(loudness, "Get mean", 0, 0, "energy")

        # One mean/std pair per MFCC coefficient (feature names are 1-based)
        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=config.N_MFCC_RF)
        for number in range(1, config.N_MFCC_RF + 1):
            coefficient = mfcc_matrix[number - 1]
            feats[f'mfcc_{number}_mean'] = np.mean(coefficient)
            feats[f'mfcc_{number}_std'] = np.std(coefficient)

        # Frame-averaged spectral descriptors
        centroid = librosa.feature.spectral_centroid(y=signal, sr=rate)
        feats['spectral_centroid'] = np.mean(centroid)
        rolloff = librosa.feature.spectral_rolloff(y=signal, sr=rate)
        feats['spectral_rolloff'] = np.mean(rolloff)

        # Replace undefined measurements with 0, keeping the key order intact
        return {
            name: 0 if (np.isnan(val) or np.isinf(val)) else val
            for name, val in feats.items()
        }
    except Exception as err:
        print(f"Error processing {os.path.basename(wav_path)} for RF features: {err}")
        return None


def extract_spectrogram_for_crnn(wav_path):
    """Extract a fixed-size Mel spectrogram (in dB) for the CRNN model.

    The audio is loaded at ``config.SAMPLE_RATE``, converted to a Mel
    spectrogram with ``config.N_MELS_CRNN`` bands, expressed in dB relative
    to its own maximum, then truncated or right-padded along the time axis
    to exactly ``config.MAX_PAD_LEN_CRNN`` frames.

    Returns:
        A 2D array of shape (N_MELS_CRNN, MAX_PAD_LEN_CRNN), or None when the
        file cannot be processed. Returning None (rather than an all-zero
        array) lets callers skip the file instead of learning from a fake
        sample.
    """
    try:
        y, sr = librosa.load(wav_path, sr=config.SAMPLE_RATE)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.N_MELS_CRNN)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Pad or truncate spectrogram to a fixed length
        if mel_spec_db.shape[1] > config.MAX_PAD_LEN_CRNN:
            mel_spec_db = mel_spec_db[:, :config.MAX_PAD_LEN_CRNN]
        else:
            pad_width = config.MAX_PAD_LEN_CRNN - mel_spec_db.shape[1]
            # NOTE(review): the pad value is 0, which on this dB scale
            # (ref=np.max) corresponds to the loudest bin rather than silence.
            # Left unchanged so training and inference inputs stay identical.
            mel_spec_db = np.pad(mel_spec_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
        return mel_spec_db
    except Exception as e:
        print(f"Error processing {os.path.basename(wav_path)} for CRNN spectrogram: {e}")
        return None

## 3. Data Loading and Preprocessing

This function scans the subdirectories in `DATA_DIR`, using the folder names as labels. It extracts both sets of features for each `.wav` file it finds. Files whose feature extraction fails are skipped.

In [15]:
def load_and_prepare_data_from_folders():
    """Scan ``config.DATA_DIR`` and extract both feature sets for every audio file.

    Each immediate subfolder is treated as one class and its name is used as
    the label. A file contributes a sample only if both the Random Forest
    features and the CRNN spectrogram were extracted (neither is None).

    Returns:
        A tuple ``(df_rf, X_crnn, y_encoded, label_encoder)``:
        the RF feature DataFrame, the stacked spectrogram array, the integer
        labels, and the fitted LabelEncoder. All four are None when the data
        directory is missing or no file could be processed.
    """
    print("Step 1: Loading data and extracting features from audio folders...")

    # isdir (not exists) so a path that points at a file is rejected here
    # instead of failing later inside os.listdir.
    if not os.path.isdir(config.DATA_DIR):
        print(f"\n--- ERROR: Data directory '{config.DATA_DIR}' not found. ---")
        return None, None, None, None

    rf_features_list = []
    crnn_features_list = []
    labels = []

    # Iterate through each subfolder (which represents a class/label)
    for label in sorted(os.listdir(config.DATA_DIR)):
        class_dir = os.path.join(config.DATA_DIR, label)
        if not os.path.isdir(class_dir):
            continue

        print(f"Processing files for class: {label}")
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order reproducible across runs and machines.
        for filename in sorted(os.listdir(class_dir)):
            # Case-insensitive so '.WAV' recordings are not silently ignored
            if not filename.lower().endswith('.wav'):
                continue
            file_path = os.path.join(class_dir, filename)

            rf_features = extract_features_for_rf(file_path)
            if rf_features is None:
                # No point computing the spectrogram for a file already rejected
                continue

            crnn_features = extract_spectrogram_for_crnn(file_path)
            if crnn_features is None:
                continue

            rf_features_list.append(rf_features)
            crnn_features_list.append(crnn_features)
            labels.append(label)

    if not rf_features_list:
        print("\n--- ERROR: No .wav files were found and processed. Check your DATA_DIR path and folder structure. ---")
        return None, None, None, None

    df_rf = pd.DataFrame(rf_features_list)
    X_crnn = np.array(crnn_features_list)

    print(f"\nSuccessfully loaded and processed {len(df_rf)} files.")

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(labels)

    return df_rf, X_crnn, y_encoded, label_encoder

## 4. Model Architecture (Deeper)

This function defines the **deeper** CRNN model architecture: three `Conv2D` blocks, two `GRU` layers, and one hidden `Dense` layer before the softmax output. The extra `Conv2D` block and extra `Dense` layer were added to increase its learning capacity.

In [16]:

def create_crnn_model(input_shape, num_classes):
    """Build and compile the CRNN classifier.

    Args:
        input_shape: 2D shape of one spectrogram, without a channel axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers
    height, width = input_shape[0], input_shape[1]

    model = Sequential()

    # Append a single channel axis so the 2D input can feed Conv2D
    model.add(Reshape((height, width, 1), input_shape=input_shape))

    # Three identical conv blocks that differ only in filter count
    for n_filters in (32, 64, 128):
        model.add(layers.Conv2D(n_filters, kernel_size=(3, 3), activation='relu', padding="same"))
        model.add(layers.MaxPooling2D(pool_size=(2, 2)))
        model.add(layers.BatchNormalization())

    # Collapse the feature maps into a sequence of 128-wide steps for the RNN
    model.add(layers.Reshape((-1, 128)))

    # Recurrent stack: the first GRU emits a sequence, the second its last state
    model.add(layers.GRU(128, return_sequences=True))
    model.add(layers.GRU(64))

    # Classification head
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


## 5. Main Training and Evaluation Pipeline

This is the main execution block. It runs the entire pipeline step-by-step:
1. Loads the data directly from audio folders.
2. Splits data into training and testing sets.
3. **Scales the Random Forest features and applies SMOTE** to the training set to handle class imbalance.
4. Trains the Random Forest model.
5. Trains the CRNN model on the spectrograms.
6. Combines the predictions from both models (ensemble) and evaluates the final performance.

In [17]:
# Main training cell: loads the data, trains the Random Forest and the CRNN,
# and evaluates the averaged (ensemble) prediction on the held-out split.
# The variables created here (rf_model, crnn_model, scaler_rf, label_encoder,
# X_rf_df) are the ones the saving cell writes to disk.
print("--- Starting NeuroVocal AI Training Pipeline ---")

# Step 1: Load data
X_rf_df, X_crnn, y, label_encoder = load_and_prepare_data_from_folders()

# The loader returns four Nones on failure, so this one check gates everything.
if X_rf_df is not None:
    num_classes = len(label_encoder.classes_)
    X_rf = X_rf_df.values

    # Step 2: Split data
    # Both feature sets and the labels go through one call so that the RF rows
    # and CRNN spectrograms of a sample land in the same split.
    print("\nStep 2: Splitting data...")
    X_rf_train, X_rf_test, X_crnn_train, X_crnn_test, y_train, y_test = train_test_split(
        X_rf, X_crnn, y, test_size=config.TEST_SIZE, random_state=config.RANDOM_STATE, stratify=y
    )

    # Step 3: Scale RF features and apply SMOTE
    print("Step 3: Scaling RF features and applying SMOTE...")
    # The scaler is fitted on the training split only and reused for the test split.
    scaler_rf = StandardScaler()
    X_rf_train_scaled = scaler_rf.fit_transform(X_rf_train)
    X_rf_test_scaled = scaler_rf.transform(X_rf_test)
    
    # Only the RF training features are resampled; the test split and the
    # CRNN inputs are left as they are.
    smote = SMOTE(random_state=config.RANDOM_STATE)
    X_rf_train_resampled, y_train_resampled = smote.fit_resample(X_rf_train_scaled, y_train)

    # Step 4: Train Random Forest Model
    print("\nStep 4: Training Random Forest model...")
    rf_model = RandomForestClassifier(n_estimators=config.N_ESTIMATORS_RF, random_state=config.RANDOM_STATE)
    rf_model.fit(X_rf_train_resampled, y_train_resampled)

    # Step 5: Train CRNN Model
    print("\nStep 5: Training CRNN model...")
    # We use the original y_train for CRNN to match X_crnn_train
    y_train_categorical = to_categorical(y_train, num_classes=num_classes)
    y_test_categorical = to_categorical(y_test, num_classes=num_classes)
    
    crnn_model = create_crnn_model(input_shape=(config.N_MELS_CRNN, config.MAX_PAD_LEN_CRNN), num_classes=num_classes)
    crnn_model.summary() # Print model details
    # NOTE(review): validation_data is the test split, so the per-epoch val_*
    # metrics are computed on the same samples as the final report below.
    # NOTE(review): the spectrograms are fed in as extracted (no scaling step
    # in this cell) - confirm that is intended.
    crnn_model.fit(X_crnn_train, y_train_categorical, 
                   epochs=config.EPOCHS_CRNN, batch_size=config.BATCH_SIZE_CRNN, 
                   validation_data=(X_crnn_test, y_test_categorical), verbose=1)

    # Step 6: Ensemble Evaluation
    print("\nStep 6: Evaluating the ensemble model...")
    rf_probs = rf_model.predict_proba(X_rf_test_scaled)
    crnn_probs = crnn_model.predict(X_crnn_test)
    
    # Unweighted mean of the two models' class probabilities.
    # NOTE(review): assumes the columns of rf_probs follow the same class
    # order as the encoded labels used for the CRNN - verify via rf_model.classes_.
    ensemble_probs = (crnn_probs + rf_probs) / 2.0
    ensemble_preds = np.argmax(ensemble_probs, axis=1)
    
    print("\n--- Final Ensemble Classification Report ---")
    print(classification_report(y_test, ensemble_preds, target_names=label_encoder.classes_))
    print(f"Final Ensemble Accuracy: {accuracy_score(y_test, ensemble_preds) * 100:.2f}%")

--- Starting NeuroVocal AI Training Pipeline ---
Step 1: Loading data and extracting features from audio folders...
Processing files for class: alzheimer
Processing files for class: depression
Processing files for class: healthy
Processing files for class: parkinson
Error processing PD1a1_LF - Copy.wav for RF features: To analyse this Sound, “minimum pitch” must not be less than 324.3243243243243 Hz.
Sound "untitled": pitch analysis not performed.
Sound "untitled": periodic pulses (cc) not computed.
Error processing PD1a1_LF.wav for RF features: To analyse this Sound, “minimum pitch” must not be less than 324.3243243243243 Hz.
Sound "untitled": pitch analysis not performed.
Sound "untitled": periodic pulses (cc) not computed.
Error processing PD1a2_LF - Copy.wav for RF features: To analyse this Sound, “minimum pitch” must not be less than 313.04347826086956 Hz.
Sound "untitled": pitch analysis not performed.
Sound "untitled": periodic pulses (cc) not computed.
Error processing PD1a2_LF

Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1s/step - accuracy: 0.4382 - loss: 1.2402 - val_accuracy: 0.4233 - val_loss: 1.5234
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 572ms/step - accuracy: 0.6401 - loss: 0.8651 - val_accuracy: 0.4180 - val_loss: 2.2968
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 631ms/step - accuracy: 0.7291 - loss: 0.6523 - val_accuracy: 0.4127 - val_loss: 2.6918
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 894ms/step - accuracy: 0.8433 - loss: 0.4595 - val_accuracy: 0.4233 - val_loss: 2.4441
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 544ms/step - accuracy: 0.9044 - loss: 0.3185 - val_accuracy: 0.5661 - val_loss: 1.5090
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 525ms/step - accuracy: 0.9031 - loss: 0.2945 - val_accuracy: 0.6085 - val_loss: 1.1258
Epoch 7/50
[1m24/24[0m 

## 6. Save Models and Objects for Deployment

This final step saves all the components needed by the `app.py` backend server. After running this cell, you will have the following files ready for your web application:
- `crnn_model.h5`: The trained Keras CRNN model.
- `random_forest_model.joblib`: The trained Scikit-learn Random Forest model.
- `audio_scaler.joblib`: The scaler fitted on the Random Forest training data.
- `label_encoder.joblib`: The label encoder to convert model outputs back to class names.
- `rf_feature_columns.joblib`: A list of the feature names in the correct order for the RF model.

In [None]:
# Persist every artifact produced by the training cell. The file names are
# fixed because the inference code loads them by these exact names.
print("\n--- Saving all components for deployment ---")

# Save the CRNN model
crnn_model.save('crnn_model.h5')
print("Saved crnn_model.h5")

# Save the Random Forest model
joblib.dump(rf_model, 'random_forest_model.joblib')
print("Saved random_forest_model.joblib")

# Save the scaler for the RF features
joblib.dump(scaler_rf, 'audio_scaler.joblib')
print("Saved audio_scaler.joblib")

# Save the label encoder
joblib.dump(label_encoder, 'label_encoder.joblib')
print("Saved label_encoder.joblib")

# Save the column order for the RF model
joblib.dump(list(X_rf_df.columns), 'rf_feature_columns.joblib')
print("Saved rf_feature_columns.joblib")

print("\nAll components are ready for the Flask application!")




--- Saving all components for deployment ---
Saved crnn_model.h5
Saved random_forest_model.joblib
Saved audio_scaler.joblib
Saved label_encoder.joblib
Saved rf_feature_columns.joblib

All components are ready for the Flask application!


In [6]:
import os
import numpy as np
import pandas as pd
import librosa
import parselmouth
from parselmouth.praat import call
import joblib
import tensorflow as tf
import warnings

# Suppress unnecessary warnings for a cleaner output
warnings.filterwarnings('ignore')
# NOTE(review): this is set after `import tensorflow` has already run, so it
# may come too late to affect TensorFlow's log level - confirm, or move it
# ahead of the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

# --- 1. Configuration (Must be identical to the training script) ---
class Config:
    # Inference-side copy of the audio/feature settings. Only the values the
    # two feature-extraction functions read are repeated here; they must match
    # the values used when the models were trained.
    SAMPLE_RATE = 44100  # Rate passed to librosa.load
    N_MELS_CRNN = 128  # Number of Mel bands in the CRNN spectrogram
    MAX_PAD_LEN_CRNN = 250  # Fixed number of spectrogram frames after pad/truncate
    N_MFCC_RF = 13  # Number of MFCC coefficients summarized for the RF features

config = Config()

# --- 2. Load All Saved Models and Assets ---
# This block simulates the startup of your web application
try:
    print("Loading saved models and assets...")
    # File names match the ones written by the deployment-saving cell.
    # NOTE(review): joblib.load unpickles objects - only load artifacts that
    # come from a trusted source.
    CRNN_MODEL = tf.keras.models.load_model('crnn_model.h5')
    RF_MODEL = joblib.load('random_forest_model.joblib')
    SCALER = joblib.load('audio_scaler.joblib')
    LABEL_ENCODER = joblib.load('label_encoder.joblib')
    RF_FEATURE_COLUMNS = joblib.load('rf_feature_columns.joblib')
    print("✅ All components loaded successfully!")
except Exception as e:
    # NOTE(review): the failure is only printed; any name not yet assigned
    # above stays undefined, so the prediction code further down would then
    # fail with a NameError - consider stopping here instead.
    print(f"❌ Error loading files: {e}")
    print("Please ensure all .h5 and .joblib files are in the same directory as the notebook.")

# --- 3. Feature Extraction Functions (Copied from your notebook) ---
def extract_features_for_rf(wav_path):
    """Build the Random Forest feature row for one audio file.

    Produces the same keys, in the same order, as the training-time
    extractor: local jitter and shimmer, mean harmonicity, F0 mean and
    standard deviation, mean intensity, per-coefficient MFCC mean/std,
    spectral centroid and spectral rolloff. NaN and infinite values are
    replaced by 0.

    Returns None if any step raises; the error is printed so the reason for
    the failure is not lost.
    """
    features = {}
    try:
        y, sr = librosa.load(wav_path, sr=config.SAMPLE_RATE)
        sound = parselmouth.Sound(wav_path)

        # Jitter & shimmer
        point_process = call(sound, "To PointProcess (periodic, cc)", 75, 500)
        features['jitter_local'] = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
        features['shimmer_local'] = call([sound, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # Harmonicity
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        features['hnr'] = call(harmonicity, "Get mean", 0, 0)

        # Pitch & intensity
        pitch = sound.to_pitch(None, 75, 600)
        intensity = sound.to_intensity()
        features['mean_f0'] = call(pitch, "Get mean", 0, 0, "Hertz")
        features['std_dev_f0'] = call(pitch, "Get standard deviation", 0, 0, "Hertz")
        features['mean_intensity'] = call(intensity, "Get mean", 0, 0, "energy")

        # MFCCs
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=config.N_MFCC_RF)
        for i in range(config.N_MFCC_RF):
            features[f'mfcc_{i+1}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i+1}_std'] = np.std(mfccs[i])

        # Spectral features
        features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))

        # Handle NaN/inf (only values are reassigned, so iterating is safe)
        for key, value in features.items():
            if np.isnan(value) or np.isinf(value):
                features[key] = 0
        return features
    except Exception as e:
        # Report the cause instead of failing silently
        print(f"Error processing {os.path.basename(wav_path)} for RF features: {e}")
        return None

def extract_spectrogram_for_crnn(wav_path):
    """Extract the fixed-size Mel spectrogram (in dB) fed to the CRNN.

    The audio is loaded at ``config.SAMPLE_RATE``, converted to a Mel
    spectrogram with ``config.N_MELS_CRNN`` bands, expressed in dB relative
    to its own maximum, then truncated or right-padded along the time axis
    to exactly ``config.MAX_PAD_LEN_CRNN`` frames.

    Returns None if any step raises; the error is printed so the reason for
    the failure is not lost.
    """
    try:
        y, sr = librosa.load(wav_path, sr=config.SAMPLE_RATE)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.N_MELS_CRNN)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Pad or truncate spectrogram to a fixed length
        if mel_spec_db.shape[1] > config.MAX_PAD_LEN_CRNN:
            mel_spec_db = mel_spec_db[:, :config.MAX_PAD_LEN_CRNN]
        else:
            pad_width = config.MAX_PAD_LEN_CRNN - mel_spec_db.shape[1]
            mel_spec_db = np.pad(mel_spec_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
        return mel_spec_db
    except Exception as e:
        # Report the cause instead of failing silently
        print(f"Error processing {os.path.basename(wav_path)} for CRNN spectrogram: {e}")
        return None

# --- 4. Main Prediction Logic ---

# --- ACTION REQUIRED: Update the path to the audio file you want to test ---
# Raw string: backslashes are literal, so each separator is written once.
file_to_predict = r'C:\Users\yajna\Downloads\preprocessing\processed_audio_wav_alzimers_recall\recall_705-0_processed.wav'

if not os.path.exists(file_to_predict):
    print(f"❌ Error: The file was not found. Please check the path: {file_to_predict}")
else:
    try:
        print(f"\n▶️  Analyzing file: '{os.path.basename(file_to_predict)}'...")

        # Step 1: Extract both sets of features
        rf_features = extract_features_for_rf(file_to_predict)
        crnn_spec = extract_spectrogram_for_crnn(file_to_predict)

        if rf_features is not None and crnn_spec is not None:
            # Step 2: Preprocess features for Random Forest
            # The DataFrame puts the features in the saved column order; the
            # values are then passed as a plain array, as during training.
            rf_df = pd.DataFrame([rf_features], columns=RF_FEATURE_COLUMNS)
            rf_scaled = SCALER.transform(rf_df.to_numpy())

            # Step 3: Preprocess features for CRNN (add the batch axis)
            crnn_reshaped = np.expand_dims(crnn_spec, axis=0)

            # Step 4: Get predictions from the loaded models
            rf_probs = RF_MODEL.predict_proba(rf_scaled)
            crnn_probs = CRNN_MODEL.predict(crnn_reshaped, verbose=0)

            # Step 5: Ensemble the predictions (unweighted mean of probabilities)
            ensemble_probs = (rf_probs + crnn_probs) / 2.0
            prediction_index = np.argmax(ensemble_probs, axis=1)

            # Step 6: Decode the result using the loaded encoder
            prediction_label = LABEL_ENCODER.inverse_transform(prediction_index)

            # Step 7: Display the final result
            print("\n" + "="*35)
            print("      ✅ FINAL PREDICTION")
            print("="*35)
            print(f"The predicted class is: {prediction_label[0].upper()}")
            print("="*35)
        else:
            print("❌ Error: Feature extraction failed. The audio file might be too short or corrupted.")

    except Exception as e:
        # Top-level boundary of the cell: report anything unexpected
        print(f"\n❌ An unexpected error occurred during prediction: {e}")

Loading saved models and assets...




✅ All components loaded successfully!

▶️  Analyzing file: 'recall_705-0_processed.wav'...

      ✅ FINAL PREDICTION
The predicted class is: ALZHIMER
