In [None]:
# Project 2 - CSS 383 Bioinformatics: Bird Soundscape Analysis Code

In [None]:
# imports
import os
import pandas as pd
import numpy as np
import seaborn as sns
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Importing csv data
# The dataset root can be overridden with the BIRDSONG_DIR environment variable so the
# notebook is not tied to one machine's directory layout; the default is the original path.
BIRDSONG_DIR = os.environ.get("BIRDSONG_DIR", "/Users/tesfashenkute/Downloads/birdsong-recognition")
train_csv = pd.read_csv(os.path.join(BIRDSONG_DIR, "train.csv"))
test_csv = pd.read_csv(os.path.join(BIRDSONG_DIR, "test.csv"))

In [None]:
# checking the data
# `head` has to be called: a bare `train_csv.head` only shows the bound-method repr.
# A cell renders only its last expression, so both previews are displayed explicitly.
ipd.display(train_csv.head(), test_csv.head())

In [None]:
# Narrowing down bird species to save on computational power
# Narrowing down to aldfly, dowwoo and hamfly

# A boolean mask replaces the row-by-row DataFrame.append loop, which was quadratic and
# no longer exists in pandas >= 2.0. The selected rows keep their original index and the
# column dtypes; .copy() lets later cells add columns without a SettingWithCopyWarning.
train_data = train_csv[train_csv['ebird_code'].isin(['aldfly', 'dowwoo', 'hamfly'])].copy()

In [None]:
# building audio data path
main_dir = '/Users/tesfashenkute/Downloads/birdsong-recognition/train_audio'
# Build each path from the filtered frame's own columns. Reading the species code from
# train_csv instead only produced the right rows through pandas index alignment.
train_data['full_path'] = main_dir + '/' + train_data['ebird_code'] + '/' + train_data['filename']


In [None]:
# Pick one recording per species code (fixed seed, so the pick is repeatable) to test for audio corruption
def _sample_path(species_code):
    """Return the full path of one seeded-random recording of the given species."""
    recordings = train_data[train_data['ebird_code'] == species_code]
    chosen = recordings.sample(1, random_state = 33)
    return chosen['full_path'].values[0]

aldfly, dowwoo, hamfly = (_sample_path(code) for code in ("aldfly", "dowwoo", "hamfly"))

In [None]:
# Generating STFT

import json 
import os
import math
import librosa
import warnings
import numpy as np
warnings.simplefilter("ignore")

# Audio root: "<dataset root>/train_audio/". The dataset root can be overridden with the
# BIRDSONG_DIR environment variable; the default is the original local path.
DATASET_PATH = os.path.join(
    os.environ.get("BIRDSONG_DIR", "/Users/tesfashenkute/Downloads/birdsong-recognition"),
    "train_audio",
    "",  # empty last component keeps the trailing separator of the original constant
)
JSON_PATH = "STFT.json"  # output file for the extracted STFT features
SAMPLE_RATE = 22050  # rate every recording is resampled to when it is loaded

TO_PROCESS = ["aldfly", "dowwoo","hamfly"] # Species we want to process (decreases computational time im on a macbook pro lol)

# The parameters differ from the MFCC extraction because STFTs are much more memory-consuming than MFCCs
# This method was found on https://www.kaggle.com/looc60/stft-audio-extraction
def save_stft(dataset_path, json_path, n_fft=512, hop_length=2048, segment_duration=4,
              species=None, sample_rate=None):
    """Extract STFT features from the audio dataset and save them, with labels, to a json file.

    Every recording found in a selected species sub-folder of ``dataset_path`` is cut into
    consecutive, non-overlapping segments of ``segment_duration`` seconds; a trailing
    remainder shorter than one segment is dropped. Each segment is turned into a dB-scaled
    magnitude spectrogram and stored as a (frames x frequency bins) nested list.

        :param dataset_path (str): Path to the dataset root, with one sub-folder per species
        :param json_path (str): Path to json file used to save STFTs
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param segment_duration (int): Length of each audio segment, in seconds
        :param species (list of str): Sub-folder names to process. Defaults to the
            module-level TO_PROCESS
        :param sample_rate (int): Rate the audio is resampled to on load. Defaults to the
            module-level SAMPLE_RATE
        :return: None. The json file holds "mapping" (species names), "labels" (for each
            stored segment, the index of its species in "mapping") and "stft" (the features)
        """
    if species is None:
        species = TO_PROCESS
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # dictionary to store mapping, labels, and STFTs
    data = {
        "mapping": [],  # species names
        "labels": [],   # index into "mapping" for each stored segment (the targets)
        "stft": []      # the inputs
    }

    # These depend only on the arguments, so compute them once instead of once per file.
    # NOTE: with the defaults hop_length (2048) is larger than n_fft (512), so successive
    # FFT windows do not overlap and the samples between them are not analysed.
    samples_per_segment = int(sample_rate * segment_duration)
    num_stft_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    root = os.fspath(dataset_path)
    file_count = 0  # number of species folders started so far (progress display only)

    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend alphabetically, so the order of
        # "mapping" (and therefore the labels) is the same on every run and machine.
        dirnames.sort()

        # Skip the dataset root itself. Compare by value: `is not` on strings tests
        # object identity and only worked by accident.
        if dirpath == root:
            continue

        # the sub-folder name is the species label
        semantic_label = os.path.basename(dirpath)

        # Proceed to data extraction only for the selected species
        if semantic_label not in species:
            continue

        # The label is this species' position in "mapping". The os.walk counter used
        # before also counted skipped folders, so it did not index into "mapping".
        label = len(data["mapping"])
        data["mapping"].append(semantic_label)
        file_count += 1
        print("\nProcessing: {}".format(semantic_label))

        # Hidden files (e.g. macOS .DS_Store) are not audio and would make librosa.load fail.
        audio_files = sorted(f for f in filenames if not f.startswith("."))

        for num_file, f in enumerate(audio_files, start=1):
            file_path = os.path.join(dirpath, f)
            signal, _ = librosa.load(file_path, sr=sample_rate)  # audio file as an array

            # librosa >= 0.10 accepts the signal only as the keyword argument `y`
            audio_duration = librosa.get_duration(y=signal, sr=sample_rate)
            # number of whole segments the recording can be cut into
            num_segments = int(audio_duration // segment_duration)

            print("processing file {} on {}, folder {} on {}".format(num_file, len(audio_files), file_count, len(species)))

            # process all segments of the audio file
            for d in range(num_segments):
                start = samples_per_segment * d        # first sample of the segment
                finish = start + samples_per_segment   # one past its last sample

                stft = np.abs(librosa.stft(signal[start:finish],
                                           n_fft=n_fft,
                                           hop_length=hop_length))
                stft = librosa.amplitude_to_db(stft, ref=np.max)
                stft = stft.T  # rows become time frames

                # store only features with the expected number of frames
                if len(stft) == num_stft_vectors_per_segment:
                    data["stft"].append(stft.tolist())
                    data["labels"].append(label)

    # save STFTs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    print("Data successfully saved !")

# Run the STFT extraction on 6-second segments; the other parameters keep their defaults
save_stft(dataset_path=DATASET_PATH, json_path=JSON_PATH, segment_duration=6)

In [None]:
# Generating MFCC

import json 
import os
import math
import librosa
import warnings
warnings.simplefilter("ignore")

# Audio root: "<dataset root>/train_audio/". The dataset root can be overridden with the
# BIRDSONG_DIR environment variable; the default is the original local path.
DATASET_PATH = os.path.join(
    os.environ.get("BIRDSONG_DIR", "/Users/tesfashenkute/Downloads/birdsong-recognition"),
    "train_audio",
    "",  # empty last component keeps the trailing separator of the original constant
)
JSON_PATH = "MFCC.json"  # output file for the extracted MFCC features
SAMPLE_RATE = 22050  # rate every recording is resampled to when it is loaded

TO_PROCESS = ["aldfly", "dowwoo","hamfly"] # Species we want to process (decreases computational time)

# main function
# This method was found on https://www.kaggle.com/looc60/mfcc-audio-extraction
def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, segment_duration=6,
              species=None, sample_rate=None):
    """Extract MFCC features from the audio dataset and save them, with labels, to a json file.

    Every recording found in a selected species sub-folder of ``dataset_path`` is cut into
    consecutive, non-overlapping segments of ``segment_duration`` seconds; a trailing
    remainder shorter than one segment is dropped. Each segment is stored as a
    (frames x num_mfcc) nested list.

        :param dataset_path (str): Path to the dataset root, with one sub-folder per species
        :param json_path (str): Path to json file used to save MFCCs
        :param num_mfcc (int): Number of coefficients to extract
        :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
        :param hop_length (int): Sliding window for FFT. Measured in # of samples
        :param segment_duration (int): Length of each audio segment, in seconds
        :param species (list of str): Sub-folder names to process. Defaults to the
            module-level TO_PROCESS
        :param sample_rate (int): Rate the audio is resampled to on load. Defaults to the
            module-level SAMPLE_RATE
        :return: None. The json file holds "mapping" (species names), "labels" (for each
            stored segment, the index of its species in "mapping") and "mfcc" (the features)
        """
    if species is None:
        species = TO_PROCESS
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],  # species names
        "labels": [],   # index into "mapping" for each stored segment (the targets)
        "mfcc": []      # the inputs
    }

    # These depend only on the arguments, so compute them once instead of once per file.
    samples_per_segment = int(sample_rate * segment_duration)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    root = os.fspath(dataset_path)

    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend alphabetically, so the order of
        # "mapping" (and therefore the labels) is the same on every run and machine.
        dirnames.sort()

        # Skip the dataset root itself. Compare by value: `is not` on strings tests
        # object identity and only worked by accident.
        if dirpath == root:
            continue

        # the sub-folder name is the species label
        semantic_label = os.path.basename(dirpath)

        # Proceed to data extraction only for the selected species
        if semantic_label not in species:
            continue

        # The label is this species' position in "mapping". The os.walk counter used
        # before also counted skipped folders, so it did not index into "mapping".
        label = len(data["mapping"])
        data["mapping"].append(semantic_label)
        print("\nProcessing: {}".format(semantic_label))

        # Hidden files (e.g. macOS .DS_Store) are not audio and would make librosa.load fail.
        audio_files = sorted(f for f in filenames if not f.startswith("."))

        for num_file, f in enumerate(audio_files, start=1):
            file_path = os.path.join(dirpath, f)
            signal, _ = librosa.load(file_path, sr=sample_rate)  # audio file as an array

            # librosa >= 0.10 accepts the signal only as the keyword argument `y`
            audio_duration = librosa.get_duration(y=signal, sr=sample_rate)
            # number of whole segments the recording can be cut into
            num_segments = int(audio_duration // segment_duration)

            print("processing file {} on {}".format(num_file, len(audio_files)), end = '\r', flush=True)

            # process all segments of the audio file
            for d in range(num_segments):
                start = samples_per_segment * d        # first sample of the segment
                finish = start + samples_per_segment   # one past its last sample

                # `y` and `sr` must be passed by keyword in librosa >= 0.10
                mfcc = librosa.feature.mfcc(y=signal[start:finish],
                                            sr=sample_rate,
                                            n_mfcc=num_mfcc,
                                            n_fft=n_fft,
                                            hop_length=hop_length)
                mfcc = mfcc.T  # rows become time frames

                # store only features with the expected number of frames
                if len(mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    print("Data successfully saved !")

# Run the MFCC extraction on 6-second segments; the other parameters keep their defaults
save_mfcc(dataset_path=DATASET_PATH, json_path=JSON_PATH, segment_duration=6)

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import tensorflow.keras as keras
import time

In [None]:
# Feature files read by the training part of the notebook
DATA_PATH_STFT = "STFT.json"
DATA_PATH_MFCC = "MFCC.json"

# Seed NumPy's global random generator
np.random.seed(seed=42)

In [None]:
# Loading data set from json file

def load_data(data_path, data_type):
    """Load features and labels from a json feature file.

    :param data_path (str): Path to the json file
    :param data_type (str): Key of the feature list to load (e.g. "mfcc" or "stft")
    :return: tuple (X, y) - X is the feature array and y the integer labels, re-indexed
        to 0..k-1 following the ascending order of the stored label values
    """
    with open(data_path, "r") as fp:
        data = json.load(fp)

    # Convert lists to numpy arrays
    X = np.array(data[data_type])
    stored_labels = np.array(data["labels"])

    print("Data loaded.")

    # Resetting indexes: map the stored label values onto 0..k-1.
    # np.unique does this in one pass. Relabelling y in place, one value at a time in
    # set-iteration order, could merge classes: a label already rewritten to i is
    # rewritten again when i is itself one of the stored values handled later.
    _, y = np.unique(stored_labels, return_inverse=True)

    # returns the inputs to be split in the following function
    return X, y

In [None]:
def prepare_datasets(test_size, validation_size):
    """Load the MFCC features and split them into train, validation and test subsets.

    ``test_size`` is passed to the first split (all samples -> rest / test) and
    ``validation_size`` to the second split (rest -> train / validation).
    Returns X_train, X_validation, X_test, y_train, y_validation, y_test.
    """
    features, labels = load_data(DATA_PATH_MFCC,"mfcc")

    # first hold out the test set ...
    X_rest, X_test, y_rest, y_test = train_test_split(features, labels, test_size=test_size)
    # ... then divide what is left into training and validation data
    X_train, X_validation, y_train, y_validation = train_test_split(X_rest, y_rest, test_size=validation_size)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [None]:
def add_axis(X_train, X_validation, X_test):
    """Return the three arrays, in the same order, each with one extra trailing axis of length 1."""
    return tuple(split[..., np.newaxis] for split in (X_train, X_validation, X_test))

In [None]:
# Building final data
# MFCC features are used for training, as they are more efficient than STFT
splits = prepare_datasets(test_size=0.25, validation_size=0.2)
X_train, X_validation, X_test, y_train, y_validation, y_test = splits

print("Dataset ready for training")

In [None]:
# Convolutional Neural Network
# Will attempt using input layer, 3 convolutional layers (all with relu activation function) and an output layer

In [None]:
# This code will build the CNN model for our training
def cnn_build(input_shape, output_shape):
    """Build the (uncompiled) convolutional network.

    Three Conv2D -> MaxPooling2D -> BatchNormalization stages feed a Flatten layer, a
    64-unit ReLU Dense layer followed by Dropout(0.5), and a softmax output layer.

    :param input_shape (tuple): Shape of one input sample, given to the first Conv2D layer
    :param output_shape (int): Number of units of the softmax output layer
    :return: the keras.Sequential model
    """
    # initializing network
    cnn_model = keras.Sequential()

    # Layer 1 (the only layer that needs input_shape)
    cnn_model.add(keras.layers.Conv2D(64, (4, 4), activation='relu', input_shape=input_shape))
    cnn_model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    cnn_model.add(keras.layers.BatchNormalization())

    # Layer 2
    cnn_model.add(keras.layers.Conv2D(32, (3, 3), activation='relu'))
    cnn_model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
    cnn_model.add(keras.layers.BatchNormalization())

    # Layer 3 - its input is the output of layer 2, so no input_shape is passed here
    cnn_model.add(keras.layers.Conv2D(64, (2, 2), activation='relu'))
    cnn_model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
    cnn_model.add(keras.layers.BatchNormalization())

    # Inputting into output layer (flattening -> dense)
    cnn_model.add(keras.layers.Flatten())
    cnn_model.add(keras.layers.Dense(64, activation='relu'))
    cnn_model.add(keras.layers.Dropout(0.5))

    # Output
    cnn_model.add(keras.layers.Dense(output_shape, activation='softmax'))

    return cnn_model


In [None]:
# Class names used to label the plots, and the matching number of output units

birds = ["aldfly", "dowwoo", "hamfly"]
output_shape = len(birds)

In [None]:
# Give every feature array the extra trailing axis used as the CNN input shape
X1_train, X1_validation, X1_test = add_axis(
    X_train=X_train,
    X_validation=X_validation,
    X_test=X_test,
)


In [None]:
# Convolutional Neural Network Training

# new input shape
input_shape = (X1_train.shape[1], X1_train.shape[2], 1)

# Network creation
model_conv = cnn_build(input_shape, output_shape)

# Network compile on adam optimizer and sparse categorical crossentropy loss function
# (the labels are integer class indices, not one-hot vectors)
adam_optimizer = keras.optimizers.Adam(learning_rate=0.0001) # Adam optimizer
model_conv.compile(optimizer=adam_optimizer,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

print("\nSummary:")
model_conv.summary()

# MODEL TRAINING
print("\nTraining:")
history_conv = model_conv.fit(X1_train, y_train, validation_data=(X1_validation, y_validation), batch_size=32, epochs=30)
print("\n")

# MODEL EVALUATION (test)
# X1_test is the test split with the added axis; the name used here before,
# X_test_new, is never defined in the notebook and raised a NameError on a fresh kernel.
test_loss_conv, test_acc_conv = model_conv.evaluate(X1_test, y_test, verbose=2)
print('Test accuracy:', test_acc_conv)


In [None]:

# Takes in the training history of a model and plots the training and validation accuracy and error per epoch

def plot_model_results(history):
    """Plot accuracy and error per epoch for the training and validation data.

    :param history: object whose ``history`` dict holds the "accuracy", "val_accuracy",
        "loss" and "val_loss" series (as returned by the model's fit call)
    """
    fig, axs = plt.subplots(2)

    # The val_* series are computed on the validation data given to fit(), not on the
    # test set, so they are labelled as validation curves.
    axs[0].plot(history.history["accuracy"], label="train accuracy")
    axs[0].plot(history.history["val_accuracy"], label="validation accuracy")
    axs[0].set_ylabel("Accuracy")
    axs[0].legend(loc="lower right")
    axs[0].set_title("Accuracy")

    axs[1].plot(history.history["loss"], label="train error")
    axs[1].plot(history.history["val_loss"], label="validation error")
    axs[1].set_ylabel("Error")
    axs[1].set_xlabel("Epoch Number")
    axs[1].legend(loc="upper right")
    axs[1].set_title("Error")

    # keeps the lower panel's title from overlapping the upper panel
    fig.tight_layout()
    plt.show()
    

In [None]:
# Takes in trained model, input data, output and list of birds to generate a confusion matrix

def plot_confusion(model, X, y, labels):
    """Plot the confusion matrix of ``model`` on the samples ``X``.

    The predicted class of a sample is the index of its highest model output. The matrix
    is normalised over the true labels ``y`` and its ticks are relabelled with ``labels``.
    """
    predicted_classes = np.argmax(model.predict(X), axis=1)
    matrix = confusion_matrix(y, predicted_classes, normalize='true')

    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(include_values=True, cmap='cividis')

    # replace the numeric tick labels with the given names on both axes
    current_axes = plt.gca()
    tick_style = {"fontsize": 10, "verticalalignment": 'center'}
    current_axes.xaxis.set_ticklabels(labels, **tick_style)
    current_axes.yaxis.set_ticklabels(labels, rotation=90, **tick_style)
    plt.show()

In [None]:
plot_model_results(history_conv)
# X1_test is the test split with the added axis; X_test_new is never defined in the notebook
plot_confusion(model_conv, X1_test, y_test, birds)