In [1]:
!pip install xgboost



In [2]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix, balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Data Load

In [3]:
TRAIN_AUDIO_DIR = 'data/processed_audio/train_processed_files'
DEV_AUDIO_DIR = 'data/processed_audio/dev_processed_files'
TEST_AUDIO_DIR = 'data/processed_audio/train_processed_files'
LABELS_FILE = 'data/ComParE2017_Cold.tsv'

In [4]:
print(f"Checking if audio files exist in: {TRAIN_AUDIO_DIR}")
audio_files_found = [f for f in os.listdir(TRAIN_AUDIO_DIR) if f.endswith(('.wav', '.mp3', '.flac'))]
if not os.path.exists(TRAIN_AUDIO_DIR):
    print(f"Error: The directory '{TRAIN_AUDIO_DIR}' does not exist. Please check the path.")
    exit() # Exit the script if the directory isn't found
elif not audio_files_found:
    print(f"Warning: No audio files (wav, mp3, flac) found in '{TRAIN_AUDIO_DIR}'.")
    print("Please ensure your audio files are in this directory.")
    # You might still want to proceed if you expect labels_map to filter out files
    # but it's good to know if the directory is empty.
else:
    print(f"Found {len(audio_files_found)} audio files in '{TRAIN_AUDIO_DIR}'.")

Checking if audio files exist in: data/processed_audio/train_processed_files
Found 9505 audio files in 'data/processed_audio/train_processed_files'.


In [5]:
# Load ground truth
def load_labels(labels_file):
    """
    Loads labels from a TSV file and maps 'C' to 1 and 'NC' to 0.
    """
    df = pd.read_csv(labels_file, sep='\t')
    labels_dict = df.set_index('file_name')['Cold (upper respiratory tract infection)'].to_dict()

    # Map 'C' to 1 and 'NC' to 0
    mapped_labels_dict = {}
    for file_name, label in labels_dict.items():
        if label == 'C':
            mapped_labels_dict[file_name] = 1
        elif label == 'NC':
            mapped_labels_dict[file_name] = 0
        else:
            # Handle unexpected labels if necessary, e.g., raise an error or assign a default
            print(f"Warning: Unexpected label '{label}' for file '{file_name}'. Skipping or handling appropriately.")
            pass # Or mapped_labels_dict[file_name] = -1 for an unknown class

    return mapped_labels_dict
print(load_labels(LABELS_FILE))

{'train_0001.wav': 1, 'train_0002.wav': 0, 'train_0003.wav': 0, 'train_0004.wav': 1, 'train_0005.wav': 0, 'train_0006.wav': 0, 'train_0007.wav': 0, 'train_0008.wav': 1, 'train_0009.wav': 0, 'train_0010.wav': 0, 'train_0011.wav': 0, 'train_0012.wav': 0, 'train_0013.wav': 0, 'train_0014.wav': 1, 'train_0015.wav': 0, 'train_0016.wav': 0, 'train_0017.wav': 0, 'train_0018.wav': 0, 'train_0019.wav': 0, 'train_0020.wav': 0, 'train_0021.wav': 0, 'train_0022.wav': 0, 'train_0023.wav': 0, 'train_0024.wav': 0, 'train_0025.wav': 0, 'train_0026.wav': 0, 'train_0027.wav': 0, 'train_0028.wav': 0, 'train_0029.wav': 0, 'train_0030.wav': 0, 'train_0031.wav': 0, 'train_0032.wav': 1, 'train_0033.wav': 0, 'train_0034.wav': 0, 'train_0035.wav': 1, 'train_0036.wav': 1, 'train_0037.wav': 0, 'train_0038.wav': 1, 'train_0039.wav': 1, 'train_0040.wav': 0, 'train_0041.wav': 0, 'train_0042.wav': 0, 'train_0043.wav': 0, 'train_0044.wav': 1, 'train_0045.wav': 1, 'train_0046.wav': 0, 'train_0047.wav': 0, 'train_0048.

# Feature Extraction

In [6]:
SR = 22050
N_FFT = 2048
HOP_LENGTH = 512
N_MFCC = 20

def extract_features(audio_path, sr, n_fft, hop_length, n_mfcc):
    """Extracts various features from an audio file."""
    y, sr = librosa.load(audio_path, sr=sr)

    features = {}

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    features['mfccs_mean'] = np.mean(mfccs, axis=1)
    features['mfccs_std'] = np.std(mfccs, axis=1)
    
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    features['mel_spectrogram_mean'] = np.mean(mel_spectrogram_db, axis=1)
    features['mel_spectrogram_std'] = np.std(mel_spectrogram_db, axis=1)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    features['chroma_stft_mean'] = np.mean(chroma_stft, axis=1)
    features['chroma_stft_std'] = np.std(chroma_stft, axis=1)

    cent = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    features['spectral_centroid_mean'] = np.mean(cent)
    features['spectral_centroid_std'] = np.std(cent)

    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    features['spectral_bandwidth_mean'] = np.mean(bandwidth)
    features['spectral_bandwidth_std'] = np.std(bandwidth)

    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    features['spectral_rolloff_mean'] = np.mean(rolloff)
    features['spectral_rolloff_std'] = np.std(rolloff)

    zcr = librosa.feature.zero_crossing_rate(y=y, hop_length=hop_length)
    features['zcr_mean'] = np.mean(zcr)
    features['zcr_std'] = np.std(zcr)

    rms = librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length)
    features['rms_mean'] = np.mean(rms)
    features['rms_std'] = np.std(rms)

    feature_vector = np.array([])
    for key in sorted(features.keys()):
        feature_vector = np.concatenate((feature_vector, np.atleast_1d(features[key])))

    return feature_vector

In [7]:
def prepare_dataset(audio_dir, labels_map, sr, n_fft, hop_length, n_mfcc):
    """Prepares feature vectors and labels for a given audio directory."""
    X = []
    y = []
    processed_files = []
    for filename in os.listdir(audio_dir):
        if filename.endswith(('.wav')):
            audio_id = filename
            if audio_id in labels_map: # Check if the mapped label exists
                audio_path = os.path.join(audio_dir, filename)
                try:
                    features = extract_features(audio_path, sr, n_fft, hop_length, n_mfcc)
                    X.append(features)
                    y.append(labels_map[audio_id])
                    processed_files.append(audio_id)
                except Exception as e:
                    print(f"Error processing {filename}: {e}")
    return np.array(X), np.array(y), processed_files

In [8]:
def plot_confusion_matrix(y_true, y_pred, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, cbar=False,
                xticklabels=classes, yticklabels=classes)
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

In [None]:
if __name__ == "__main__":
    print("Loading labels...")
    labels_map = load_labels(LABELS_FILE) # Labels will now be 0 or 1
    print(f"Loaded {len(labels_map)} labels.")

    print("Extracting features from TRAIN set...")
    X_train, y_train, train_files = prepare_dataset(TRAIN_AUDIO_DIR, labels_map, SR, N_FFT, HOP_LENGTH, N_MFCC)
    print(f"Extracted features for {len(X_train)} training samples.")

    print("Extracting features from DEV set...")
    X_dev, y_dev, dev_files = prepare_dataset(DEV_AUDIO_DIR, labels_map, SR, N_FFT, HOP_LENGTH, N_MFCC)
    print(f"Extracted features for {len(X_dev)} development samples.")

    if len(X_train) == 0 or len(X_dev) == 0:
        print("Error: No features extracted. Check paths and label file consistency.")
    else:
        unique_train, counts_train = np.unique(y_train, return_counts=True)
        print(f"\nTrain set class distribution: {dict(zip(unique_train, counts_train))}")
        unique_dev, counts_dev = np.unique(y_dev, return_counts=True)
        print(f"Dev set class distribution: {dict(zip(unique_dev, counts_dev))}")

        CLASS_NC = 0
        CLASS_C = 1
        class_labels = [CLASS_NC, CLASS_C]

        # y_dev is already binary, so no mapping needed here
        y_dev_binary_for_roc = y_dev
        
        #SWM WITH GRID SEARCH
        print("\nTraining SVM with Grid Search (with class_weight='balanced'...")
        svm_balanced = SVC(random_state=42, class_weight='balanced', probability=True)

        param_grid = {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf', 'linear']
        }

        grid_search = GridSearchCV(svm_balanced, param_grid, cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
        grid_search.fit(X_train, y_train)

        print("\nBest parameters found on training set (with class_weight='balanced'):")
        print(grid_search.best_params_)
        print(f"Best cross-validation ROC AUC on training set: {grid_search.best_score_:.4f}")

        print("\nEvaluating best model on DEVELOPMENT set (validation):")
        best_svm_model = grid_search.best_estimator_
        y_dev_pred = best_svm_model.predict(X_dev)
        # `predict_proba` returns probabilities for each class.
        # The positive class (1, 'C') is now explicitly at index `1` if classes are [0, 1]
        y_dev_proba = best_svm_model.predict_proba(X_dev)[:, best_svm_model.classes_.tolist().index(CLASS_C)]

        print("\nClassification Report on Development Set:")
        print(classification_report(y_dev, y_dev_pred, target_names=['NC', 'C']))

        try:
            roc_auc = roc_auc_score(y_dev_binary_for_roc, y_dev_proba)
            print(f"ROC AUC on Development Set: {roc_auc:.4f}")
        except ValueError as e:
            print(f"Could not calculate ROC AUC: {e}. Ensure probabilities are available and classes match.")

        plot_confusion_matrix(y_dev, y_dev_pred, classes=['NC', 'C'],
                              title='Confusion Matrix on Development Set')
        uar = balanced_accuracy_score(y_dev, y_dev_pred)
        print(f"Unweighted Average Recall (Balanced Accuracy) on Development Set: {uac:.4f}")
        
        #SWM WITH SMOTE FOR IMBALANCED DATA
        print("\n--- Training with SMOTE and class_weight='balanced' (Pipeline) ---")
        pipeline = Pipeline([
            ('smote', SMOTE(random_state=42)),
            ('svm', SVC(random_state=42, class_weight='balanced', probability=True))
        ])

        param_grid_smote = {
            'svm__C': [0.1, 1, 10],
            'svm__gamma': ['scale', 'auto'],
            'svm__kernel': ['rbf']
        }

        grid_search_smote = GridSearchCV(pipeline, param_grid_smote, cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
        grid_search_smote.fit(X_train, y_train)

        print("\nBest parameters found on training set (with SMOTE and class_weight='balanced'):")
        print(grid_search_smote.best_params_)
        print(f"Best cross-validation ROC AUC on training set: {grid_search_smote.best_score_:.4f}")

        print("\nEvaluating best SMOTE+SVM model on DEVELOPMENT set (validation):")
        best_smote_svm_model = grid_search_smote.best_estimator_
        y_dev_pred_smote = best_smote_svm_model.predict(X_dev)
        # Get probability for the positive class (CLASS_C)
        y_dev_proba_smote = best_smote_svm_model.predict_proba(X_dev)[:, best_smote_svm_model.named_steps['svm'].classes_.tolist().index(CLASS_C)]

        print("\nClassification Report on Development Set (SMOTE+SVM):")
        print(classification_report(y_dev, y_dev_pred_smote, target_names=['NC', 'C']))

        try:
            roc_auc_smote = roc_auc_score(y_dev_binary_for_roc, y_dev_proba_smote)
            print(f"ROC AUC on Development Set (SMOTE+SVM): {roc_auc_smote:.4f}")
        except ValueError as e:
            print(f"Could not calculate ROC AUC for SMOTE model: {e}. Ensure probabilities are available and classes match.")

        plot_confusion_matrix(y_dev, y_dev_pred_smote, classes=['NC', 'C'],
                              title='Confusion Matrix on Development Set (SMOTE+SVM, MFCCs included)')
        uar_smote = balanced_accuracy_score(y_dev, y_dev_pred_smote)
        print(f"Unweighted Average Recall (Balanced Accuracy) on Development Set (SMOTE+SVM): {uac_smote:.4f}")
        
        #XGBOOST
        print("\n--- Training XGBoost ---")
        xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', 
                                  scale_pos_weight=counts_train[0] / counts_train[1],  # handle imbalance
                                  random_state=42)

        param_grid_xgb = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2]
        }

        grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=5, verbose=2, n_jobs=-1, scoring='roc_auc')
        grid_search_xgb.fit(X_train, y_train)

        print("\nBest parameters for XGBoost:")
        print(grid_search_xgb.best_params_)
        print(f"Best cross-validation ROC AUC (XGBoost): {grid_search_xgb.best_score_:.4f}")

        best_xgb_model = grid_search_xgb.best_estimator_
        y_dev_pred_xgb = best_xgb_model.predict(X_dev)
        y_dev_proba_xgb = best_xgb_model.predict_proba(X_dev)[:, 1]

        print("\nClassification Report on Development Set (XGBoost):")
        print(classification_report(y_dev, y_dev_pred_xgb, target_names=['NC', 'C']))

        roc_auc_xgb = roc_auc_score(y_dev_binary_for_roc, y_dev_proba_xgb)
        print(f"ROC AUC (XGBoost): {roc_auc_xgb:.4f}")
        plot_confusion_matrix(y_dev, y_dev_pred_xgb, classes=['NC', 'C'], title='Confusion Matrix (XGBoost)')
        uar_xgb = balanced_accuracy_score(y_dev, y_dev_pred_xgb)
        print(f"UAR (XGBoost): {uar_xgb:.4f}")
        
        #ENSEMBLE STACKING 
        print("\n--- Building Stacking Ensemble: SVM (SMOTE) + XGBoost ---")

        # Reuse best_smote_svm_model and best_xgb_model
        estimators = [
            ('svm', best_smote_svm_model.named_steps['svm']),
            ('xgb', best_xgb_model)
        ]

        stacking_clf = StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression(),
            cv=5,
            n_jobs=-1,
            passthrough=True
        )

        # Recreate SMOTE+XGBoost pipeline
        from imblearn.pipeline import make_pipeline
        stacking_pipeline = make_pipeline(SMOTE(random_state=42), stacking_clf)

        print("Training stacking ensemble...")
        stacking_pipeline.fit(X_train, y_train)

        y_dev_pred_stack = stacking_pipeline.predict(X_dev)
        y_dev_proba_stack = stacking_pipeline.predict_proba(X_dev)[:, 1]

        print("\nClassification Report (Stacking Ensemble):")
        print(classification_report(y_dev, y_dev_pred_stack, target_names=['NC', 'C']))
        roc_auc_stack = roc_auc_score(y_dev_binary_for_roc, y_dev_proba_stack)
        print(f"ROC AUC (Stacking): {roc_auc_stack:.4f}")
        plot_confusion_matrix(y_dev, y_dev_pred_stack, classes=['NC', 'C'], title='Confusion Matrix (Stacking)')
        uar_stack = balanced_accuracy_score(y_dev, y_dev_pred_stack)
        print(f"UAR (Stacking): {uar_stack:.4f}")

        #COMBINE train and dev for final training
        print("\n--- Final Model Training on Combined Train+Dev (using the better strategy) ---")
        X_combined = np.vstack((X_train, X_dev))
        y_combined = np.concatenate((y_train, y_dev))

        print("Retraining the best model on combined Train+Dev data...")
        #final_model_for_submission = grid_search_smote.best_estimator_ 
        final_model_for_submission = stacking_pipeline
        final_model_for_submission.fit(X_combined, y_combined)
        print("Final model trained on combined dataset.")

        print("\nReady for final submission: Extract features from TEST set and predict.")
        X_test, _, test_files = prepare_dataset(TEST_AUDIO_DIR, labels_map, SR, N_FFT, HOP_LENGTH, N_MFCC)
        if len(X_test) > 0:
            y_test_pred_final = final_model_for_submission.predict(X_test)
            submission_df = pd.DataFrame({'filename': test_files, 'predicted_label': y_test_pred_final})
            submission_df.to_csv('submission.csv', index=False)
            print("Test predictions saved to submission.csv") 
        else:
            print("No test files processed for final prediction.")

Loading labels...
Loaded 19101 labels.
Extracting features from TRAIN set...
Extracted features for 9505 training samples.
Extracting features from DEV set...
Extracted features for 9596 development samples.

Train set class distribution: {0: 8535, 1: 970}
Dev set class distribution: {0: 8585, 1: 1011}

Training SVM with Grid Search (with class_weight='balanced'...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
