<br/>
<div align="center" >

![ENSC](../images/ENSC.png)

# <u> ENSC Parcours IA </u>
## Data Challenge - Détection de clics d'odontocètes

</div>

As part of the [Artificial Intelligence specialization](https://3aia.notion.site/3aia/Parcours-3A-IA-2023-9917027c682b457dae71fea68c067ad1) at the [ENSC](https://ensc.bordeaux-inp.fr/fr), we participated in a data challenge provided by the University of Toulon on the [ChallengeData](https://challengedata.ens.fr/) website.

The goal of this challenge is to detect the presence of odontocete (toothed whale) clicks in underwater audio recordings made in the Caribbean Sea.

In [None]:
import pandas as pd
from pathlib import Path
import os
import librosa
import librosa.display
import librosa.feature as feat
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from scipy import signal
import reservoirpy as rp
import seaborn as sns
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm import tqdm

import sys

sys.path.append('../')
from utils import retrieve_hyper_params_from_json, load_and_preprocess_data, load_and_preprocess_data_augmented

%matplotlib inline

In [None]:
#! ===== Set parameters ======
# NOTE: despite its name, this is the direct parent of the current working
# directory (parents[0]), not its grandparent.
grandparent_dir = Path.cwd().parents[0]
print(grandparent_dir)
# Test and training clips live in a hidden ".dataset" folder next to the
# notebook's own folder.
test_directory = grandparent_dir / ".dataset" / "X_test"
train_directory = grandparent_dir / ".dataset" / "X_train"

# Set the path to the downloaded data
download_path = grandparent_dir / ".dataset"

# Audio parameters
# Samples per second; multiplied by the duration below to get the clip length.
sample_rate = 256000
audio_duration_seconds = 0.2  # clip duration in seconds

In [None]:
#! ====== Load and preprocess data ======
# The label file has one row per training clip: a file name ("id") and its
# target ("pos_label").
labels_file = download_path / "Y_train_ofTdMHi.csv"
df = pd.read_csv(labels_file)

# Turn each file name into a path under X_train.
df["relative_path"] = Path(download_path) / "X_train" / df["id"]

# Expose the target as "label" and keep only (relative_path, label), in that
# order; selecting the two columns also discards the original "id" column.
df = df.rename(columns={"pos_label": "label"})[["relative_path", "label"]]
print(f"### There are {len(df)} audio files in the dataset.")

# Count each class once and reuse the result for both table rows.
label_counts = df["label"].value_counts()
table = f"""
Here is the split into good and bad signals:
| Label   | Count   |
|:-------:|:-------:|
| 0       | {label_counts[0]:7} |
| 1       | {label_counts[1]:7} |"""
print(table, end="\n\n")
print("### Here is a sample of the data:")

In [None]:
# Number of samples per clip: sampling rate times clip duration.
target_length = int(sample_rate * audio_duration_seconds)
# X = load_and_preprocess_data(df, target_length)
# Project helper from utils; presumably returns the feature array and the
# matching labels row by row — TODO confirm against utils.
data, labels = load_and_preprocess_data_augmented(df, target_length)

In [None]:
y = labels.astype(int)
y[:5],y.shape

In [None]:
# One-hot encode the labels
y = np.eye(2)[y]
y[:5], y.shape

Prepare the training and test data & labels

In [None]:
from sklearn.model_selection import train_test_split

# Size of an 80 % training split.
# NOTE(review): this value is not passed to train_test_split below, which
# therefore uses scikit-learn's default proportions — confirm which split is
# actually wanted.
N_SAMPLES = round(data.shape[0]*0.8)

# Fix the shuffle seed so the split, and every metric and hyper-parameter
# search derived from it, is reproducible from one run to the next.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, y, random_state=42
)

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

---

In [None]:
import json
# Hyper-parameters are hard-coded below. The commented-out lines show how they
# could instead be read back from a saved hyperopt result file.
# with open("hyperopt-odontoceti-2/results/0.4222609_hyperopt_results_1call.json", "r") as f:
#     y = json.load(f)

# print(y['current_params'])
# retrieve N, iss, lr, ridge, seed, sr from y['current_params']
# N, iss, lr, ridge, seed, sr = retrieve_hyper_params_from_json("hyperopt-odontoceti-2/results/0.4222609_hyperopt_results_1call.json")
# Order: reservoir size, input scaling, leaking rate, ridge regularization,
# random seed, spectral radius (see how they are passed to Reservoir / Ridge).
N, iss, lr, ridge, seed, sr = 1000, 0.9, 0.6571588388986349, 0.002820052832209098, 1234, 0.051153084643003444

print(N, iss, lr, ridge, seed, sr)


In [None]:
from reservoirpy.nodes import Reservoir, Ridge, Input

source = Input()
# ReservoirPy names the input gain `input_scaling`. The previous `iss=iss`
# keyword is not a Reservoir parameter, so the tuned value never reached the
# reservoir (it was either rejected or silently ignored, depending on the
# library version).
reservoir = Reservoir(N, sr=sr, lr=lr, input_scaling=iss, seed=seed)
# Linear readout trained by ridge regression with the tuned regularization.
readout = Ridge(ridge=ridge)

In [None]:
train_states = reservoir.run(train_data)

In [None]:
# Fit the ridge readout on the reservoir states of the training split.
# NOTE(review): warmup=100 should make ReservoirPy discard the first 100 rows
# of the series before fitting; if each row is an independent clip this drops
# 100 arbitrary training samples — confirm this is intended.
readout.fit(train_states, train_labels, warmup=100)

In [None]:
test_states = reservoir.run(test_data)

In [None]:
Y_pred = readout.run(test_states)

In [None]:
# Clamp the raw readout outputs to [0, 1], writing the result back into Y_pred.
np.clip(Y_pred, 0, 1, out=Y_pred)

In [None]:
print(np.max(Y_pred, axis=1)[:10])


In [None]:
# Turn the two-column readout outputs and the one-hot targets into class
# indices (column 0 -> label 0, column 1 -> label 1).
Y_pred_class = np.argmax(Y_pred, axis=1)
Y_test_class = np.argmax(test_labels, axis=1)

# Calculate the accuracy
accuracy = accuracy_score(Y_test_class, Y_pred_class)
print(f"Accuracy: {accuracy:.2f}")

# Calculate the F1 score
f1 = f1_score(Y_test_class, Y_pred_class)
print(f"F1 score: {f1:.2f}")

# Calculate the ROC AUC score.
# ROC AUC ranks samples by a continuous score. Feeding it hard class labels
# collapses the curve to a single operating point, so the positive-class
# output (column 1) is used as the score instead.
roc_auc = roc_auc_score(Y_test_class, Y_pred[:, 1])
print(f"ROC AUC score: {roc_auc:.2f}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Build the matrix once and reuse it for both the text dump and the plot.
cm = confusion_matrix(Y_test_class, Y_pred_class)
print(cm)

# NOTE(review): the display labels assume class 0 is "good" and class 1 is
# "bad" — confirm against the challenge's label definition.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["good", "bad"])
disp.plot()

In [None]:
def load_test_data(folder_path, target_length):
    """Collect every ``.wav`` file under *folder_path* and preprocess it.

    The folder is searched recursively and the files are handed over in the
    order the directory scan yields them (no sorting is applied).

    Args:
        folder_path: Root folder containing the audio clips.
        target_length: Forwarded unchanged to ``load_and_preprocess_data``.

    Returns:
        Whatever ``load_and_preprocess_data`` returns for these files.
    """
    wav_files = [*Path(folder_path).rglob('*.wav')]
    return load_and_preprocess_data(wav_files, target_length)

# Points at the same folder as the `test_directory` from the parameters cell,
# spelled here with an explicit "../" component (so the printed path differs).
test_directory = Path.cwd() / "../" / ".dataset" / "X_test"
print(test_directory)
X_test = load_test_data(test_directory, target_length)
# Reuse the already-built reservoir to turn the test clips into states...
submission_states = reservoir.run(X_test)

# ...and the fitted readout to turn those states into per-class outputs.
predictions = readout.run(submission_states)

In [None]:
# Labels were one-hot encoded with np.eye(2), so column 1 of the readout
# output is the score of the positive class — the quantity to submit as
# `pos_label`. Taking the row-wise max instead would report the confidence of
# whichever class wins, giving confident negatives a high positive score.
submission_predictions = predictions[:, 1]
print(submission_predictions[:10])
f"First prediction is {submission_predictions[0]}"

In [None]:
# Second directory scan, only to recover the file names.
# NOTE(review): this relies on rglob yielding the files in the same order as
# the scan done inside load_test_data, and on the preprocessing keeping that
# order — confirm, otherwise ids and scores end up misaligned.
file_names = [file_path.name for file_path in Path(test_directory).rglob('*.wav')]

# NOTE: this rebinds `df`, which held the training labels up to this point.
df = pd.DataFrame({'id': file_names, 'pos_label': submission_predictions[:]})
df.to_csv("reservoir_submission.csv", index=False)

In [None]:
class StopExecution(Exception):
    """Exception raised to halt a "run all" at this point of the notebook.

    ``_render_traceback_`` returns no traceback lines, which is meant to keep
    the interruption from printing a stack trace in the notebook output.
    """

    def _render_traceback_(self):
        # No lines to render: stop quietly.
        return list()

raise StopExecution

---

In [None]:
rp.verbosity(0)

In [None]:
print(train_states.shape, train_labels.shape, test_states.shape, test_labels.shape)

In [None]:
dataset = (train_states, train_labels, test_states, test_labels)

In [None]:
from reservoirpy.observables import nrmse, rsquare
from reservoirpy.nodes import Reservoir, Ridge, Input
from sklearn.metrics import f1_score

# Objective functions accepted by ReservoirPy must respect some conventions:
#  - dataset and config arguments are mandatory, like the empty '*' expression.
#  - all parameters that will be used during the search must be placed after the *.
#  - the function must return a dict with at least a 'loss' key containing the result
# of the loss function. You can add any additional metrics or information with other 
# keys in the dict. See the hyperopt documentation for more information.
def objective(dataset, config, *, iss, N, sr, lr, ridge, seed):
    """Score one set of hyper-parameters for the ReservoirPy hyperopt search.

    Builds ``config["instances_per_trial"]`` models that differ only by their
    seed, fits each one on the training split and scores it on the test split.

    Args:
        dataset: ``(X_train, y_train, X_test, y_test)`` tuple. Targets and
            predictions are reduced row by row with ``argmax``, so the targets
            are expected to hold one score per class (e.g. one-hot rows).
        config: Parsed hyperopt configuration; only the
            ``"instances_per_trial"`` key is read here.
        iss: Input scaling of the reservoir.
        N: Number of reservoir units.
        sr: Spectral radius of the reservoir.
        lr: Leaking rate of the reservoir.
        ridge: Regularization strength of the ridge readout.
        seed: Seed of the first instance; instance ``n`` uses ``seed + n``.

    Returns:
        dict: ``'loss'`` (mean of ``1 - weighted F1``, the key hyperopt
        minimizes), ``'r2'`` and ``'f1_score'``, each averaged over all
        instances of the trial.
    """
    # This step may vary depending on what you put inside 'dataset'
    (X_train, y_train, X_test, y_test) = dataset

    # You can access anything you put in the config
    # file from the 'config' parameter.
    instances = config["instances_per_trial"]

    # The reference classes come from the dataset handed to this function (not
    # from a notebook global) and do not change between instances.
    Y_test_class = [np.argmax(y_t) for y_t in y_test]

    losses, r2s, f1s = [], [], []
    for n in range(instances):
        # Fresh nodes for every instance so no state is shared between models
        # or with the rest of the notebook. The seed is shifted per instance
        # to avoid a bias due to one particular initialization.
        source = Input()
        reservoir = Reservoir(N,
                              sr=sr,
                              lr=lr,
                              input_scaling=iss,
                              seed=seed + n)
        readout = Ridge(ridge=ridge)

        # The readout sees the reservoir states and the raw input.
        # NOTE(review): `source >> reservoir` is listed twice, as in the
        # original topology — confirm the duplicate branch is intended.
        # model = reservoir >> readout
        model = [source >> reservoir, source >> reservoir, source] >> readout

        # Train the model, then predict on the held-out split.
        predictions = model.fit(X_train, y_train) \
                           .run(X_test)

        Y_pred_class = [np.argmax(y_p) for y_p in predictions]

        f1 = f1_score(Y_test_class, Y_pred_class, average='weighted')
        f1s.append(f1)
        losses.append(1 - f1)
        r2s.append(rsquare(Y_test_class, Y_pred_class))

    # Return a dictionary of metrics. The 'loss' key is mandatory when
    # using hyperopt. Every metric is averaged over the instances so they all
    # describe the same set of models.
    return {'loss': np.mean(losses),
            'r2': np.mean(r2s),
            'f1_score': np.mean(f1s),}

In [None]:
# Search configuration consumed by reservoirpy.hyper.research.
hyperopt_config = {
    "exp": f"hyperopt-odontoceti-3",     # the experimentation name
    "hp_max_evals": 200,                # the number of different sets of parameters hyperopt has to try
    "hp_method": "random",              # the method hyperopt uses to choose those sets
    "seed": 42,                         # the random state seed, to ensure reproducibility
    "instances_per_trial": 3,           # how many random ESNs are tried with each set of parameters
    "hp_space": {                       # the ranges of parameters explored
        "N": ["choice", 1000],           # the number of neurons is fixed to 1000
        "sr": ["loguniform", 1e-3, 1e-1], # the spectral radius, log-uniform from 1e-3 to 1e-1
        "lr": ["loguniform", 1e-2, 1],      # the leaking rate, log-uniform from 1e-2 to 1
        "iss": ["uniform", 0.4, 0.9],         # the input scaling, uniform from 0.4 to 0.9
        "ridge": ["loguniform", 1e-4, 1e-1],  # the ridge regularization, log-uniform from 1e-4 to 1e-1
        "seed": ["choice", 1234]        # another random seed, for the ESN initialization (single value)
    }
}


import json

# As a precaution, save the configuration to a JSON file named after the
# experiment; the search is later started from that file.
with open(f"{hyperopt_config['exp']}.config.json", "w+") as f:
    json.dump(hyperopt_config, f)

In [None]:
from reservoirpy.hyper import research

# Launch the search with the objective, the dataset tuple and the saved JSON
# configuration. The last argument "." is presumably the folder where the
# reports are written (current directory) — TODO confirm.
best = research(objective, dataset, f"{hyperopt_config['exp']}.config.json", ".")