<br/>
<div align="center" >

![ENSC logo](../images/ENSC.png)

# <u> ENSC Parcours IA </u>
## Data Challenge - Détection de clics d'odontocètes

</div>

As part of the [Artificial Intelligence specialization](https://3aia.notion.site/3aia/Parcours-3A-IA-2023-9917027c682b457dae71fea68c067ad1) at the [ENSC](https://ensc.bordeaux-inp.fr/fr), we took part in a data challenge proposed by the University of Toulon on the [ChallengeData](https://challengedata.ens.fr/) website.

The goal of this challenge is to detect the presence of odontocete clicks in underwater audio recordings made in the Caribbean Sea.

## Import necessary libraries

In [None]:
import pandas as pd
from pathlib import Path
import os
import librosa
import librosa.display
import librosa.feature as feat
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import soundfile
from scipy import signal
import seaborn as sns
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [None]:
# Location of the downloaded challenge data and of its training-label CSV
download_path = Path.cwd() / "../" / ".dataset"
labels_file = download_path / "Y_train_ofTdMHi.csv"

# Load the labels, replace the bare file id by a path inside X_train,
# rename the target column and keep only (path, label), in that order.
df = (
    pd.read_csv(labels_file)
    .assign(relative_path=lambda frame: str(download_path) + "/X_train/" + frame["id"])
    .drop(columns=["id"])
    .rename(columns={"pos_label": "label"})
    [["relative_path", "label"]]
)
display(ipd.Markdown(f"### There are **_{len(df)}_** audio files in the dataset."))

# Count each class once and reuse the result for both table rows
label_counts = df["label"].value_counts()
table = f"""
Here is the split into good and bad signals:
| Label   | Count   |
|:-------:|:-------:|
| 0       | {label_counts[0]} |
| 1       | {label_counts[1]} |"""
display(ipd.Markdown(table))

Load two audio files with librosa and compare their waveforms

In [None]:
# Load the first two recordings of the dataset at their native sample rate
file_paths = [df.loc[row, "relative_path"] for row in (0, 1)]
file_labels = [df.loc[row, "label"] for row in (0, 1)]
audio_data1, sr1 = librosa.load(file_paths[0], sr=None)
audio_data2, sr2 = librosa.load(file_paths[1], sr=None)
print(f"Audio data shape: {audio_data1.shape}, Sample rate: {sr1}")

# Draw both waveforms side by side on a shared amplitude axis.
# NOTE(review): the panel titles are hard-coded and are not derived from
# file_labels — confirm rows 0 and 1 really are a "wrong" and a "good" signal.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharey=True)
fig.suptitle('Waveforms for good and wrong signal', fontsize=16)
panels = zip(
    ax,
    (audio_data1, audio_data2),
    ('Waveform for wrong signal', 'Waveform for good signal'),
)
for axis, samples, panel_title in panels:
    axis.set_xlabel("Time")
    axis.set_ylabel("Amplitude")
    axis.plot(samples)
    axis.title.set_text(panel_title)

plt.show()

In [None]:
class AudioUtil:
    """Namespace of static helpers for loading audio and computing features.

    Every method is a ``@staticmethod``; the class holds no state and does
    not need to be instantiated.
    """

    @staticmethod
    def open(audio_file: str, sr=256000):
        """Load an audio file with librosa.

        Args:
            audio_file: Path of the file to load.
            sr: Target sample rate handed to ``librosa.load``; ``None`` keeps
                the file's native rate. Defaults to 256000, the value that
                used to be hard-coded here.

        Returns:
            Tuple ``(sig, sr)``: the samples as a NumPy array and the sample
            rate actually used.
        """
        sig, sr = librosa.load(audio_file, sr=sr)
        return (sig, sr)

    @staticmethod
    def get_audio_duration(sig, sr):
        """Return the duration of an audio signal in seconds."""
        # Keyword form: recent librosa releases reject positional arguments here.
        return librosa.get_duration(y=sig, sr=sr)

    @staticmethod
    def mel_spectro_gram(sig: np.ndarray, sr: int, n_mels=32, n_fft=1024):
        """Compute a mel spectrogram expressed in decibels.

        Args:
            sig: Audio samples.
            sr: Sample rate of ``sig``.
            n_mels: Number of mel bands.
            n_fft: FFT window length.

        Returns:
            The dB-scaled mel spectrogram as a NumPy array.
        """
        # n_mels / n_fft are forwarded (they used to be silently ignored).
        spec = librosa.feature.melspectrogram(y=sig, sr=sr, n_mels=n_mels, n_fft=n_fft)
        # melspectrogram returns a power spectrogram by default, so the
        # matching conversion is power_to_db rather than amplitude_to_db.
        spec = librosa.power_to_db(spec)
        return spec

    @staticmethod
    def extract_mfccs(file_path, n_mfcc=40):
        """Return the time-averaged MFCC vector of an audio file.

        Args:
            file_path: Path of the audio file, loaded at its native rate.
            n_mfcc: Number of coefficients to compute (default 40).

        Returns:
            1-D NumPy array of length ``n_mfcc``: each coefficient averaged
            over all frames.
        """
        audio_data, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=n_mfcc)
        # Rows are coefficients, columns are frames: average across frames.
        mfccs_scaled_features = np.mean(mfccs.T, axis=0)
        return mfccs_scaled_features

    @staticmethod
    def get_audio_specs_size(spec):
        """Return the shape of a spectrogram array."""
        return spec.shape

    @staticmethod
    def plot_mel_spectro_gram(spec: np.ndarray, sr: int):
        """Plot a mel spectrogram on a new figure.

        ``spec`` is converted with ``librosa.power_to_db`` before plotting.
        NOTE(review): this means ``spec`` should be a linear power
        spectrogram; passing the dB output of ``mel_spectro_gram`` would
        convert twice — confirm the intended input.
        """
        fig, ax = plt.subplots()
        S_dB = librosa.power_to_db(spec, ref=np.max)
        img = librosa.display.specshow(S_dB, x_axis='time',
                                       y_axis='mel', sr=sr,
                                       ax=ax)
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        ax.set(title='Mel-frequency spectrogram')

In [None]:
def save_mfccs(nb_files=None):
    """Compute averaged MFCCs for the first ``nb_files`` rows of ``df`` and save them.

    Writes ``numpy_data/audio_mfccs.npy`` (one MFCC vector per file) and
    ``numpy_data/label_files.npy`` (the matching labels, stored as floats),
    then prints the shape and head of a summary frame.

    Args:
        nb_files: Number of leading rows of ``df`` to process. ``None``
            (default) means every row; the length is read when the function
            is called, not when it is defined. Values larger than ``len(df)``
            are clipped to the available rows.
    """
    if nb_files is None:
        nb_files = len(df)
    subset = df.iloc[:nb_files]
    labels = subset["label"]

    print("Starting mfccs generation...")
    # AudioUtil only has static helpers, so no instance is needed.
    audio_mfccs = [
        AudioUtil.extract_mfccs(file_path)
        for file_path in tqdm(subset["relative_path"], total=len(subset),
                              unit="file", desc="Generating mfccs")
    ]
    # Built in one shot instead of np.append per row (which copies the whole
    # array each time); float dtype matches the previously saved format.
    label_files = labels.astype(int).to_numpy(dtype=float)
    print("Mfccs generated !", end='\n\n')

    print("Saving mfccs...")
    os.makedirs("numpy_data", exist_ok=True)
    np.save(os.path.join("numpy_data", "audio_mfccs.npy"), audio_mfccs)
    np.save(os.path.join("numpy_data", "label_files.npy"), label_files)
    print("Mfccs saved !")
    features_and_labels = pd.DataFrame(list(zip(audio_mfccs, labels)), columns=["mfccs", "label"])

    print("Global shape : ", features_and_labels.shape)
    print(features_and_labels.head())

In [None]:
# Generate and save the MFCCs (and labels) for the dataset under numpy_data/
save_mfccs()

In [None]:
def save_labels(nb_files=None):
    """Save the labels of the first ``nb_files`` rows of ``df``.

    Writes ``numpy_data/label_files.npy`` as a float array.

    Args:
        nb_files: Number of leading rows of ``df`` to export. ``None``
            (default) means every row; the length is read when the function
            is called, not when it is defined. Values larger than ``len(df)``
            are clipped to the available rows.
    """
    if nb_files is None:
        nb_files = len(df)
    print("Saving labels...")
    # One vectorised conversion replaces the per-row np.append loop (and its
    # progress bar); int-then-float keeps the values that loop produced.
    label_files = df["label"].iloc[:nb_files].astype(int).to_numpy(dtype=float)
    os.makedirs("numpy_data", exist_ok=True)
    np.save(os.path.join("numpy_data", "label_files.npy"), label_files)
    print("Labels saved !")

In [None]:
# Write the label array to numpy_data/label_files.npy
save_labels()

In [None]:
def extract_features(audio_file, low_hz=5000, high_hz=100000, order=6):
    """Extract 16 summary features from a band-pass-filtered recording.

    The signal is read with ``soundfile``, filtered with a zero-phase
    Butterworth band-pass, and four librosa descriptors are computed: RMS
    energy, spectral centroid, spectral bandwidth and spectral flatness.
    Each descriptor is reduced to its mean, standard deviation, minimum and
    maximum.

    Args:
        audio_file: Path of the audio file.
        low_hz: Lower edge of the pass band in Hz (default 5000).
        high_hz: Upper edge of the pass band in Hz (default 100000). SciPy
            raises ``ValueError`` unless it is below half the sample rate.
        order: Butterworth filter order (default 6).

    Returns:
        List of 16 values ordered as
        ``[rms, centroid, bandwidth, flatness] x [mean, std, min, max]``.
    """
    sig, sr = soundfile.read(audio_file)

    # NOTE(review): filtering runs along the last axis, which presumes a
    # mono (1-D) signal — confirm for multi-channel files.
    sos = signal.butter(order, [low_hz, high_hz], 'bandpass', fs=sr, output='sos')
    sig = signal.sosfiltfilt(sos, sig)

    descriptors = (
        feat.rms(y=sig),
        feat.spectral_centroid(y=sig, sr=sr),
        feat.spectral_bandwidth(y=sig, sr=sr),
        feat.spectral_flatness(y=sig),
    )
    statistics = (np.mean, np.std, np.min, np.max)
    # Descriptor-major order: all four statistics of RMS first, then centroid, ...
    features = [stat(track) for track in descriptors for stat in statistics]
    return features

def save_features(nb_files=None):
    """Extract features for the first ``nb_files`` rows of ``df`` and save them.

    Prints the shape and head of the resulting (features, label) frame and
    writes the ``features`` column to ``numpy_data/features.npy``.

    Args:
        nb_files: Number of leading rows of ``df`` to process. ``None``
            (default) means every row; the length is read when the function
            is called, not when it is defined. Values larger than ``len(df)``
            are clipped to the available rows.
    """
    if nb_files is None:
        nb_files = len(df)
    subset = df.iloc[:nb_files]

    rows = zip(subset["relative_path"], subset["label"])
    features_and_labels = [
        (extract_features(file_path), label_file)
        for file_path, label_file in tqdm(rows, total=len(subset),
                                          unit="file", desc="Generating features")
    ]

    features_and_labels = pd.DataFrame(features_and_labels, columns=["features", "label"])
    print("Global shape : ", features_and_labels.shape)
    print(features_and_labels.head())

    print("Saving features...")
    os.makedirs("numpy_data", exist_ok=True)
    # The column holds Python lists, so NumPy stores it as a pickled object
    # array; it has to be read back with allow_pickle=True.
    np.save(os.path.join("numpy_data", "features.npy"), features_and_labels["features"])
    print("Features saved !")

In [None]:
# Extract the features of the dataset and save them to numpy_data/features.npy
save_features()

In [None]:
def get_mfccs_from_file(file_path):
    """Load a saved MFCC array and rebuild it from plain Python values."""
    stored = np.load(file_path)
    as_python = stored.tolist()
    return np.array(as_python)

def get_labels_from_file(file_path):
    """Load a saved label array and rebuild it from plain Python values."""
    stored = np.load(file_path)
    as_python = stored.tolist()
    return np.array(as_python)

def get_features_from_file(file_path):
    """Load a saved (pickled) feature array and densify it into a regular array."""
    # allow_pickle executes pickled content: only load files you produced yourself.
    stored = np.load(file_path, allow_pickle=True)
    rows = stored.tolist()
    return np.array(rows)

# Load the saved feature matrix and labels from ./numpy_data
X = get_features_from_file(os.path.join(os.getcwd(),"numpy_data", "features.npy"))
# Alternative input: the averaged MFCCs instead of the hand-crafted features
# X = pd.DataFrame(get_mfccs_from_file(os.path.join(os.getcwd(),"numpy_data", "audio_mfccs.npy")))
y = get_labels_from_file(os.path.join(os.getcwd(),"numpy_data", "label_files.npy"))

# Sanity check: both arrays should have the same number of rows
print(X.shape)
print(y.shape)
# assert len(X[0]) == 16, "Wrong number of features !"

Turn the feature matrix into a DataFrame with one named column per feature

In [None]:
# Column names, in the order produced by extract_features:
# (rms, spectral centroid, spectral bandwidth, spectral flatness) x (mean, std, min, max)
features = [
    "rms_mean", "rms_std", "rms_min", "rms_max",
    "sc_mean", "sc_std", "sc_min", "sc_max",
    "sb_mean", "sb_std", "sb_min", "sb_max",
    "sf_mean", "sf_std", "sf_min", "sf_max",
]

# Build the frame in a single call instead of filling it cell by cell with
# .loc: that is much faster, gives float columns rather than object columns,
# and keeps the cell safe to re-run when X is already a DataFrame.
X_with_features_columns = pd.DataFrame(np.asarray(X, dtype=float), columns=features)
X = X_with_features_columns

Split the dataset into training, testing (and validation) datasets

In [None]:
from sklearn.model_selection import train_test_split
# Keep 75% of the rows for training and the rest for testing.
# NOTE(review): no random_state is passed, so the partition — and every score
# computed from it — changes from one run to the next.
X_train, X_test, y_train, y_test=train_test_split(X,y,train_size=0.75)
# Optional validation split carved out of the training set (disabled):
# X_train, X_validation, y_train, y_validation=train_test_split(X_train,y_train,train_size=0.8, random_state=64)

print(f"X_train contains {X_train.shape[0]} files of shape {X_train.shape[1:]}")
print(f"X_test contains {X_test.shape[0]} files of shape {X_test.shape[1:]}", end='\n\n')
# print(f"X_validation contains {X_validation.shape[0]} files of shape {X_validation.shape[1:]}")

# print(f"Features are : {X_train.columns.tolist()}") 
# X_train.head()

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Stratified k-fold cross-validation of a logistic regression.
# Fold-local names are used throughout so that the X_train / X_test /
# y_train / y_test variables of the hold-out split are left untouched.
n_fold = 2
model = LogisticRegression()
accuracy_scores = []
scaler = StandardScaler()

X_cv = np.asarray(X, dtype=float)
y_cv = np.asarray(y)

skf = StratifiedKFold(n_splits=n_fold)
# split() yields (train_indices, test_indices) pairs: row positions, not data
folds = list(skf.split(X_cv, y_cv))

print("N° of folds: ", len(folds))
print("N° of audios per fold: ", folds[0][0].shape[0])

for train_idx, test_idx in tqdm(folds, unit=" fold", desc="Training"):
    # Fit the scaler on the training part of the fold only, so that no
    # statistic of the validation rows leaks into training.
    X_fold_train = scaler.fit_transform(X_cv[train_idx])
    X_fold_test = scaler.transform(X_cv[test_idx])

    model.fit(X_fold_train, y_cv[train_idx])
    # Make predictions on the validation part of the fold
    y_pred = model.predict(X_fold_test)

    # Calculate and record the accuracy score for this fold
    acc = accuracy_score(y_cv[test_idx], y_pred)
    accuracy_scores.append(acc)

print("Accuracy scores: ", accuracy_scores)

Helper function for prediction on random sample

In [None]:
import random
def predict_on_random_sample(classifier, X_test, y_test):
    """Predict one randomly chosen test sample and report the outcome.

    Prints whether the prediction is correct, the predicted and actual
    labels, and displays the predicted class probabilities.

    The sample is drawn by position, so every row of ``X_test`` can be
    picked whatever its index labels are, and the same position is used in
    ``y_test`` so features and label always belong to the same row.

    Args:
        classifier: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: DataFrame of test features.
        y_test: Labels aligned position-wise with the rows of ``X_test``.

    Raises:
        ValueError: If ``X_test`` is empty.
    """
    position = random.randrange(len(X_test))
    sample_label = X_test.index[position]
    sample = X_test.iloc[position]
    actual = np.asarray(y_test)[position]

    # Query the model once and reuse the results below
    predicted = classifier.predict([sample])[0]
    probabilities = classifier.predict_proba([sample])[0]

    print("Correct ✅" if predicted == actual else "Incorrect ❌")
    print(f'Prediction for sample {sample_label:4}: {int(predicted)}')
    print(f'Actual label              : {int(actual)}')

    prob_table = f"""
    Probabilities for each class:
     _________________________
    | Label   | Probability   |
    |---------|---------------|
    | 0       | {f"{probabilities[0]:.3f}":13} |
    | 1       | {f"{probabilities[1]:.3f}":13} |
    |_________|_______________|"""
    display(ipd.Markdown(prob_table))


We will want to display the confusion matrices after fitting the models.

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

def plot_confusion_matrix(classifier, X_test, y_test, cmap=plt.cm.Blues):
    """Plot the row-normalised confusion matrix of a fitted classifier.

    The figure title shows the score returned by ``classifier.score``.

    Args:
        classifier: Fitted estimator.
        X_test: Test features.
        y_test: True labels for ``X_test``.
        cmap: Matplotlib colormap used for the matrix cells.
    """
    # Create the axes up front and hand them to scikit-learn; otherwise
    # from_estimator opens its own figure and the 6x6 one stays empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    score_classifier = classifier.score(X_test, y_test)
    ConfusionMatrixDisplay.from_estimator(
        classifier,
        X_test,
        y_test,
        display_labels=[0, 1],
        cmap=cmap,
        normalize='true',  # each row (true class) sums to 1
        ax=ax,
    )
    ax.set_ylabel('Actual label', fontsize=17)
    ax.set_xlabel('Predicted label', fontsize=17)
    ax.set_title(f'Accuracy Score: {score_classifier:.3f}', size=17)
    ax.tick_params(labelsize=15)

## Scale data

For some models (e.g. Logistic Regression), data needs to be scaled. In order to do that we will use scikit-learn's StandardScaler.

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. The scaled frames are rebuilt from arrays, so they get a
# fresh 0..n-1 index instead of the original row labels.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# NOTE(review): this previews the unscaled frame; X_train_scaled.head() may
# have been intended — confirm.
X_train.head()

## <div align="center"> ---------- Logistic Regression ---------- </div>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a default logistic regression on the standardised training features
# and measure its accuracy on the standardised test features.
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")

#### Try a prediction on a random sample

In [None]:
# Spot-check the logistic regression on one random row of the scaled test set
predict_on_random_sample(log_reg, X_test_scaled, y_test)

#### Score of this model

In [None]:
# Keep the test score in a variable: it is reused in the recap table
score_log_reg = log_reg.score(X_test_scaled, y_test)
print(f"Logistic Regression score: {score_log_reg:.3f}")

#### Print the confusion matrix on test set

In [None]:
# Confusion matrix of the logistic regression on the scaled test set (default Blues colormap)
plot_confusion_matrix(log_reg, X_test_scaled, y_test)

#### Check the ROC AUC score

In [None]:
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the positive-class probabilities rather than from thresholded labels.
y_proba_log_reg = log_reg.predict_proba(X_test_scaled)[:, 1]
print(f"ROC AUC Score with Logistic Regression : {roc_auc_score(y_test, y_proba_log_reg)}")
# F1 is a classification metric: it uses the hard predictions.
print(f"F1 Score with Logistic Regression      : {f1_score(y_test, y_pred_log_reg)}")

## <div align="center"> ---------- Decision Tree ---------- </div>

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree with a fixed seed, trained on X_train (not the
# standardised copy). `.values` passes plain arrays, without column names.
tree_clf = DecisionTreeClassifier(max_depth=15, random_state=0)
tree_clf.fit(X_train.values, y_train)
y_pred_tree = tree_clf.predict(X_test.values)
print(f"Accuracy: {tree_clf.score(X_test.values, y_test)}")

#### Try a prediction on a random sample

In [None]:
# Spot-check the decision tree on one random row of the test set
predict_on_random_sample(tree_clf, X_test, y_test)

#### Measure the model performance

In [None]:
# Keep the test score in a variable: it is reused in the recap table
score_tree_clf = tree_clf.score(X_test.values, y_test)
print(f"Decision Tree score: {score_tree_clf:.3f}")

#### Try to find the optimal `max_depth`

In [None]:
# List of values to try for max_depth:
max_depth_range = list(range(1, 20))

# Test-set accuracy obtained for each value of max_depth:
accuracy = []

for depth in tqdm(max_depth_range, unit='depth', desc='Testing max_depth'):
    
    clf = DecisionTreeClassifier(max_depth = depth, 
                             random_state = 0)
    clf.fit(X_train, y_train)

    # NOTE(review): the depths are compared on the test split itself, so the
    # best value found here is tuned to that split — consider a validation set.
    score = clf.score(X_test, y_test)
    accuracy.append(score)

In [None]:
from matplotlib.ticker import FormatStrFormatter

# Accuracy on the test set as a function of the tree's max_depth
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

ax.plot(max_depth_range, accuracy, lw=2, color='k')

# Span exactly the depths that were tested (not merely how many there are)
ax.set_xlim([max_depth_range[0], max_depth_range[-1]])
ax.set_ylim([.50, 1.00])
ax.grid(True, axis='both', zorder=0, linestyle=':', color='k')

# Two-decimal tick labels through a formatter: overwriting the label texts
# without pinning the tick positions lets labels and ticks drift apart (and
# makes Matplotlib emit a warning).
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.tick_params(labelsize=18)
ax.set_xlabel('max_depth', fontsize=24)
ax.set_ylabel('Accuracy', fontsize=24)
fig.tight_layout()

#### Visualize the decision tree

In [None]:
from sklearn import tree
# Caution on the max_depth parameter, it makes the tree unreadable if too big
# tree.plot_tree(tree_clf) 

#### Plot the confusion matrix

In [None]:
# Confusion matrix of the decision tree on the test set (Greens colormap)
plot_confusion_matrix(tree_clf, X_test.values, y_test, cmap=plt.cm.Greens)

#### Check the ROC AUC & F1 scores

In [None]:
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the positive-class probabilities rather than from thresholded labels.
y_proba_tree = tree_clf.predict_proba(X_test.values)[:, 1]
print(f"ROC AUC Score with Decision Tree : {roc_auc_score(y_test, y_proba_tree):.3f}")
# F1 is a classification metric: it uses the hard predictions.
print(f"F1 Score with Decision Tree      : {f1_score(y_test, y_pred_tree):.3f}")

## <div align="center"> ---------- Bagged Tree ---------- </div>

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble with scikit-learn's default settings.
# NOTE(review): no random_state is set, so results vary between runs.
bag_clf = BaggingClassifier()

bag_clf.fit(X_train.values, y_train)
y_pred_bag = bag_clf.predict(X_test.values)
print(f"Accuracy: {bag_clf.score(X_test.values, y_test)}")

#### Try prediction on a random sample

In [None]:
# Spot-check the bagging classifier on one random row of the test set
predict_on_random_sample(bag_clf, X_test, y_test)

#### Score of this model

In [None]:
# Keep the test score in a variable: it is reused in the recap table
score_bag = bag_clf.score(X_test.values, y_test)
print(f"Bagged tree score: {score_bag:.3f}")

#### Confusion Matrix

In [None]:
# Confusion matrix of the bagging classifier on the test set (Purples colormap)
plot_confusion_matrix(bag_clf, X_test.values, y_test, cmap=plt.cm.Purples)

#### Check ROC AUC & F1 scores

In [None]:
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the positive-class probabilities rather than from thresholded labels.
y_proba_bag = bag_clf.predict_proba(X_test.values)[:, 1]
print(f"ROC AUC Score with Bagged Tree : {roc_auc_score(y_test, y_proba_bag):.3f}")
# F1 is a classification metric: it uses the hard predictions.
print(f"F1 Score with Bagged Tree      : {f1_score(y_test, y_pred_bag):.3f}")

## <div align="center"> ---------- Random Forest ---------- </div>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees.
# NOTE(review): no random_state is set, so results vary between runs.
forest_clf = RandomForestClassifier(n_estimators=100)
forest_clf.fit(X_train.values, y_train)
y_pred_forest = forest_clf.predict(X_test.values)
print(f"Accuracy: {accuracy_score(y_test, y_pred_forest)}")

#### Try predicting on a random sample

In [None]:
# Spot-check the random forest on one random row of the test set
predict_on_random_sample(forest_clf, X_test, y_test)

In [None]:
# Keep the test score in a variable: it is reused in the recap table
score_forest = forest_clf.score(X_test.values, y_test)
print(f"Random forest score: {score_forest:.3f}")

#### Confusion Matrix

In [None]:
# Confusion matrix of the random forest on the test set (Oranges colormap)
plot_confusion_matrix(forest_clf, X_test.values, y_test, cmap=plt.cm.Oranges)

#### Check the ROC AUC & F1 scores

In [None]:
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the positive-class probabilities rather than from thresholded labels.
y_proba_forest = forest_clf.predict_proba(X_test.values)[:, 1]
print(f"ROC AUC Score with Random Forest : {roc_auc_score(y_test, y_proba_forest):.3f}")
# F1 is a classification metric: it uses the hard predictions.
print(f"F1 Score with Random Forest      : {f1_score(y_test, y_pred_forest):.3f}")

#### Get the importance of the features

In [None]:
# Pair every feature name with the forest's importance for it (rounded to
# three decimals), sort from most to least important and show the top rows.
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(forest_clf.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)
importances.head()

## <div align="center"> ---------- XGBClassifier ---------- </div>

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with XGBoost's default hyper-parameters, trained on
# plain arrays (`.values`) like the other tree-based models.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train.values, y_train)
y_pred_xgb = xgb_clf.predict(X_test.values)
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb)}")


### Predict on a random sample

In [None]:
# Spot-check the XGBoost classifier on one random row of the test set
predict_on_random_sample(xgb_clf, X_test, y_test)

### Score of this model

In [None]:
# Keep the test score in a variable: it is reused in the recap table
score_xgb = xgb_clf.score(X_test.values, y_test)
print(f"XGB score: {score_xgb:.3f}")

### Print the confusion Matrix

In [None]:
# Confusion matrix of the XGBoost classifier on the test set (Reds colormap)
plot_confusion_matrix(xgb_clf, X_test.values, y_test, cmap=plt.cm.Reds)

#### Check ROC AUC & F1 scores

In [None]:
# ROC AUC measures how well the scores rank the samples, so it is computed
# from the positive-class probabilities rather than from thresholded labels.
y_proba_xgb = xgb_clf.predict_proba(X_test.values)[:, 1]
print(f"ROC AUC Score with XGB : {roc_auc_score(y_test, y_proba_xgb):.3f}")
# F1 is a classification metric: it uses the hard predictions.
print(f"F1 Score with XGB      : {f1_score(y_test, y_pred_xgb):.3f}")

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Cross-validate XGBoost on the training split. The features are passed as
# plain arrays (`.values`), exactly as in every other XGBoost call of this
# notebook, so the estimator sees the same kind of input everywhere.
X_train_values = X_train.values

# 5-fold cross-validation with scikit-learn's default splitter
scores = cross_val_score(xgb_clf, X_train_values, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

# 10-fold cross-validation with shuffling.
# NOTE(review): no random_state is set, so the folds differ between runs.
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgb_clf, X_train_values, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

## Recap of previous models and their respective scores

In [None]:
# Quick inspection of the fitted logistic regression. Only the value of the
# last expression of a cell is displayed, so the parameters computed on the
# first line are discarded and only the type is shown.
log_reg.get_params()
type(log_reg)

In [None]:
# Rank the classifiers by test score (best first) and show them in a Markdown table
classifiers_names = ["Logistic Regression", "Decision Tree", "Bagging", "Random Forest", "XGBClassifier"]
classifiers_scores = [score_log_reg, score_tree_clf, score_bag, score_forest, score_xgb]

classifiers_f1_scores = [
    f1_score(y_test, hard_predictions)
    for hard_predictions in (y_pred_log_reg, y_pred_tree, y_pred_bag, y_pred_forest, y_pred_xgb)
]

classifiers_and_scores = sorted(
    zip(classifiers_names, classifiers_scores, classifiers_f1_scores),
    key=lambda entry: entry[1],
    reverse=True,
)

scores_lines = [
    f"| **{name}** | **{score:.3f}** | **{f1:.3f}** |\n"
    for name, score, f1 in classifiers_and_scores
]

# A Markdown table needs a delimiter row right below its header, and the
# body rows must follow without a blank line; otherwise it renders as text.
scores_table = f"""
<div align="center">
    Classifiers scores

| Classifier   | Score  | F1 Score   |
|:------------:|:------:|:----------:|
{"".join(scores_lines)}
</div>
"""

display(ipd.Markdown(scores_table))

The best classifier seems to be the XGB classifier, so we use it to make predictions on the test waveforms.

In [None]:
# Location of the downloaded challenge data and of the test metadata CSV
download_path = Path.cwd() / ".." / ".dataset"
labels_file = download_path / "Y_random_Xwjr6aB.csv"

# Load the metadata, replace the bare file id by a path inside X_test,
# rename the target column and keep only (path, label), in that order.
df_test = (
    pd.read_csv(labels_file)
    .assign(relative_path=lambda frame: str(download_path) + "/X_test/" + frame["id"])
    .drop(columns=["id"])
    .rename(columns={"pos_label": "label"})
    [["relative_path", "label"]]
)
print(f"There are ** {len(df_test)} ** audio files in the test dataset.", end="\n\n")
df_test.head()

In [None]:
from datetime import datetime

# Extract the features of every test recording
test_paths = df_test["relative_path"].tolist()
test_features = np.array([
    extract_features(test_audio_file)
    for test_audio_file in tqdm(test_paths, unit="file", desc="Predicting labels")
])

# One batched call instead of one call per file. Column 1 of predict_proba is
# the probability of the positive class (label 1), which is what "pos_label"
# must contain; max(p0, p1) would only give the model's confidence in
# whichever class it picked, and could never fall below 0.5.
positive_probabilities = xgb_clf.predict_proba(test_features)[:, 1]

predictions = pd.DataFrame({
    # The submission is keyed by the bare file name, not by the full path
    "id": [Path(test_audio_file).name for test_audio_file in test_paths],
    "pos_label": positive_probabilities,
})
print(predictions.head())

# Timestamp the file name so that successive submissions never overwrite each other
now = datetime.now()
date = now.strftime("%d-%m-%Y_%H-%M-%S")
predictions.to_csv(f"Y_predict_{date}.csv", index=False)