<a href="https://colab.research.google.com/github/yakovsushenok/Thesis/blob/main/BirdSongIdentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import zipfile
import os
import pandas as pd
import math, random
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import init
from google.colab import drive
from sklearn.model_selection import StratifiedShuffleSplit
drive.mount('/content/gdrive')
import time
import matplotlib.pyplot as plt
import numpy as np
import numpy.matlib
try:
    from scipy.fftpack import fft, ifft
except ImportError:
    from numpy.fft import fft, ifft
from scipy.signal import lfilter
import scipy.io as sio
from scipy import signal
import gc
import h5py
np.random.seed(0)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data

In [None]:
# extracting the training data (audio files) from the zip file (12 minutes)
zip_ref = zipfile.ZipFile('/content/gdrive/MyDrive/train_short_audio.zip', 'r')
zip_ref.extractall('/content/tmp') 
zip_ref.close()

In [2]:
# extracting the data which has audio files that are of similar length to the testing data
zip_ref = zipfile.ZipFile('/content/gdrive/MyDrive/train_soundscapes.zip', 'r')
zip_ref.extractall('/content/trainSoundscapes') 
zip_ref.close()

df = pd.read_csv("/content/gdrive/MyDrive/train_metadata.csv") # the metadata
df['relative_path'] = '/content/tmp/' + df['primary_label'] + '/' + df['filename'] 

### Subsetting the data so that we're left with subspecies which have 300+ samples and with audio rated 4.0+

In [3]:
df = df[(df['primary_label'].value_counts().reindex(df['primary_label'])>299).values & (df['rating'] > 3.5)]
df = df[['relative_path', 'primary_label']]
unique_labels = df.primary_label.unique()
mapping = dict(zip(unique_labels, range(len(unique_labels))))
df.primary_label = df.primary_label.map(mapping)

### We're going to partition our data into a training/validation/testing test with 80% being the training, 10% for validating and 10% for testing with each species category having the same distribution.

In [4]:
# splitting into train, val+test
X, y = df['relative_path'], df['primary_label']
split1 = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
train_index, val_test = next(split1.split(X, y))
# splitting into val, test
X1, y1 = df.iloc[val_test, 0], df.iloc[val_test, 1]
split2 = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=0)
val_index, test_index = next(split2.split(X1, y1))
# subsetting the datasets
df_train = df.iloc[train_index,:]
df_val = df.iloc[val_index, :]
df_test = df.iloc[test_index, :]

# Utility classes

In [5]:
### MRCG CODE
epsc = 0.000001

def mrcg_extract(sig, sampFreq = 32000): # Sample frequency is always 32,000 in our case
    # Code From: https://github.com/MoongMoong/MRCG_python/blob/master/MRCG_python_master/mrcg/MRCG.py
    
    beta = 1000 / np.sqrt(sum(map(lambda x:x*x,sig)) / len(sig))
    sig = sig*beta
    sig = sig.reshape(len(sig), 1)
    g = gammatone(sig, 64, sampFreq)
    cochlea1 = np.log10(cochleagram(g, int(sampFreq * 0.025), int(sampFreq * 0.010)))
    cochlea2 = np.log10(cochleagram(g, int(sampFreq * 0.200), int(sampFreq * 0.010)))
    cochlea1 = cochlea1[:,:]
    cochlea2 = cochlea2[:,:]
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)
    
    all_cochleas = np.concatenate([cochlea1,cochlea2,cochlea3,cochlea4],0)
    del0 = deltas(all_cochleas)
    ddel = deltas(deltas(all_cochleas, 5), 5)

    ouotput = np.concatenate((all_cochleas, del0, ddel), 0)

    return ouotput

def gammatone(insig, numChan=128, fs = 16000): 
    fRange = [50, 8000]
    filterOrder = 4
    gL = 2048
    sigLength = len(insig)
    phase = np.zeros([numChan, 1])
    erb_b = hz2erb(fRange)

    
    erb_b_diff = (erb_b[1]-erb_b[0])/(numChan-1)
    erb = np.arange(erb_b[0], erb_b[1]+epsc, erb_b_diff)
    cf = erb2hz(erb)
    b = [1.019 * 24.7 * (4.37 * x / 1000 + 1) for x in cf]
    gt = np.zeros([numChan, gL])
    tmp_t = np.arange(1,gL+1)/fs
    for i in range(numChan):
        gain = 10**((loudness(cf[i])-60)/20)/3*(2 * np.pi * b[i] / fs)**4
        tmp_temp = [gain*(fs**3)*x**(filterOrder - 1)*np.exp(-2 * np.pi * b[i] * x)*np.cos(2 * np.pi * cf[i] * x + phase[i]) for x in tmp_t]
        tmp_temp2 = np.reshape(tmp_temp, [1, gL])

        gt[i, :] = tmp_temp2

    sig = np.reshape(insig,[sigLength,1])
    gt2 = np.transpose(gt)
    resig = np.matlib.repmat(sig,1,numChan)
    r = np.transpose(fftfilt(gt2,resig,numChan))
    return r

def hz2erb(hz):  
    erb1 = 0.00437
    erb2 = np.multiply(erb1,hz)
    erb3 = np.subtract(erb2,-1)
    erb4 = np.log10(erb3)
    erb = 21.4 *erb4
    return erb

def erb2hz(erb): 
    hz = [(10**(x/21.4)-1)/(0.00437) for x in erb]
    return hz

def loudness(freq): 
    dB=60
    fmat = sio.loadmat('/content/gdrive/MyDrive/f_af_bf_cf.mat')
    af = fmat['af'][0]
    bf = fmat['bf'][0]
    cf = fmat['cf'][0]
    ff = fmat['ff'][0]
    i = 0
    while ff[i] < freq:
        i = i + 1

    afy = af[i - 1] + (freq - ff[i - 1]) * (af[i] - af[i - 1]) / (ff[i] - ff[i - 1])
    bfy = bf[i - 1] + (freq - ff[i - 1]) * (bf[i] - bf[i - 1]) / (ff[i] - ff[i - 1])
    cfy = cf[i - 1] + (freq - ff[i - 1]) * (cf[i] - cf[i - 1]) / (ff[i] - ff[i - 1])
    loud = 4.2 + afy * (dB - cfy) / (1 + bfy * (dB - cfy))
    return loud

def fftfilt(b,x,nfft): 
    fftflops = [18, 59, 138, 303, 660, 1441, 3150, 6875, 14952, 32373, 69762,
                149647, 319644, 680105, 1441974, 3047619, 6422736, 13500637, 28311786,
                59244791, 59244791*2.09]
    nb, _ = np.shape(b)
    nx, mx = np.shape(x)
    n_min = 0
    while 2**n_min < nb-1:
        n_min = n_min+1
    n_temp = np.arange(n_min, 21 + epsc, 1)
    n = np.power(2,n_temp)
    fftflops = fftflops[n_min-1:21]
    L = np.subtract(n,nb-1)
    lenL= np.size(L)
    temp_ind0 = np.ceil(np.divide(nx,L))
    temp_ind = np.multiply(temp_ind0,fftflops)
    temp_ind = np.array(temp_ind)
    ind = np.argmin(temp_ind)
    nfft=int(n[ind])
    L=int(L[ind])
    b_tr = np.transpose(b)
    B_tr = fft(b_tr,nfft)
    B = np.transpose(B_tr)
    y = np.zeros([nx, mx])
    istart = 0
    while istart < nx :
        iend = min(istart+L,nx)
        if (iend - istart) == 1 :
            X = x[0][0]*np.ones([nx,mx])
        else :
            xtr = np.transpose(x[istart:iend][:])
            Xtr = fft(xtr,nfft)
            X = np.transpose(Xtr)
        temp_Y = np.transpose(np.multiply(B,X))
        Ytr = ifft(temp_Y,nfft)
        Y = np.transpose(Ytr)
        yend = np.min([nx, istart + nfft])
        y[istart:yend][:] = y[istart:yend][:] + np.real(Y[0:yend-istart][:])

        istart = istart + L
    
    return y

def cochleagram(r, winLength = 320, winShift=160): 
    numChan, sigLength = np.shape(r)
    increment = winLength / winShift
    M = np.floor(sigLength / winShift)
    a = np.zeros([numChan, int(M)])
    rs = np.square(r)
    rsl = np.concatenate((np.zeros([numChan,winLength-winShift]),rs),1)
    for m in range(int(M)):
        temp = rsl[:,m*winShift : m*winShift+winLength]
        a[:, m] = np.sum(temp,1)

    return a

def get_avg( m , v_span, h_span): 
    nr,nc = np.shape(m)

    fil_size = (2 * v_span + 1) * (2 * h_span + 1)
    meanfil = np.ones([1+2*h_span,1+2*h_span])
    meanfil = np.divide(meanfil,fil_size)

    out = signal.convolve2d(m, meanfil, boundary='fill', fillvalue=0, mode='same')
    return out

def deltas(x, w=9) : 
    nr,nc = np.shape(x)
    if nc ==0 :
        d= x
    else :
        hlen = int(np.floor(w / 2))
        w = 2 * hlen + 1
        win=np.arange(hlen, int(-(hlen+1)), -1)
        temp = x[:, 0]
        fx = np.matlib.repmat(temp.reshape([-1,1]), 1, int(hlen))
        temp = x[:, nc-1]
        ex = np.matlib.repmat(temp.reshape([-1,1]), 1, int(hlen))
        xx = np.concatenate((fx, x, ex),1)
        d = lfilter(win, 1, xx, 1)
        d = d[:,2*hlen:nc+2*hlen]

    return d

Now that I have the MRCG code, I want to create a dataset which will hold these MRCG values. This will mean I'll not have to pre-process them every run of the model.

In [6]:
def get_mrcg_from_file(file):
  
  mid = 1937318 # zero padding so that the length is equal to the length of the longest waveform 
  waveform, sr = torchaudio.load(file)
  if len(waveform[0]) < mid:
    target = torch.zeros(mid)
    source = waveform[0]
    target[:len(source)] = source
    return mrcg_extract(target)
  else:
    waveform = waveform[0]
    return mrcg_extract(waveform[:mid])

In [None]:
df_list = [i for i in range(0,8600, 100)]
df_list[-1] = len(df_train)

for i in range(24,len(df_list)-1): # [2706, 0950, stopped at 22] <-- missed 22 (i=21+1)... need to do it.
                                   # [2806, 0906, stopped at i=43+1]
                                   # for some reason, indices i=14-19 are not being processed (for the tensors)
  print(f"currently on df {i+1}")
  df = df_train[df_list[i]:df_list[i+1]]
  #df['mrcg'] = df['relative_path'].apply(get_mrcg_from_file)
  df.to_csv(f'/content/gdrive/MyDrive/df_train{i+1}.csv') 
  x = torch.tensor(get_mrcg_from_file(df['relative_path'].iloc[0]))     
  x = x[None, : , :]
  for j in range(1,len(df)):
    x1 = torch.tensor(get_mrcg_from_file(df['relative_path'].iloc[i])) # 30 sec per sample for mrcg
    x1 = x1[None, :, :]
    x = torch.cat((x,x1), 0)
  torch.save(x, f'/content/gdrive/MyDrive/df_train_tensor{i+1}.pt')

# Data Loader

In [None]:
class SoundDS(Dataset):
  def __init__(self, df, mrcg_tensor):
    self.df = df
    self.mrcg_tensor = mrcg_tensor
    #self.df['mrcg'] = np.array([mrcg_tensor[i, :, :] for i in range(mrcg_tensor.shape[0])])
  
  def __len__(self):
    return len(self.df)    
    
  def __getitem__(self, idx):

   return (self.mrcg_tensor[idx, :, :], torch.tensor(self.df['primary_label'].iloc[idx]))
   #return (torch.tensor(self.df['mrcg'].iloc[idx]), torch.tensor(self.df['primary_label'].iloc[idx]))

# H5PY Dataloader, Dataloading
https://discuss.pytorch.org/t/save-torch-tensors-as-hdf5/39556

In [None]:

tensors = h5py.File('/content/gdrive/MyDrive/train_tensors.h5', 'w') 

In [None]:
N = 2000
data_train_predictor = tensors.create_dataset('data', shape=(N, 768, 6054), dtype=np.float32, fillvalue=0)

In [None]:
ind_range = [i for i in range(0,2100,100)]
ind_range[-1]= 1999

In [None]:
for i in range(1,len(ind_range)):
    x = torch.load(f"/content/gdrive/MyDrive/df_train_tensor{i}.pt")
    data_train_predictor[ind_range[i-1]:ind_range[i]] = x

In [None]:
ten1 = data_train_predictor[0]

In [None]:
tensors.close()

In [9]:
f = h5py.File('/content/gdrive/MyDrive/train_tensors.h5', 'r')

In [None]:
d = f['data'][:1000]

In [None]:
df_temp = df_train[:1000]

In [19]:
class H5DS(Dataset):
  def __init__(self, df, h5pyFile):
    self.data = h5pyFile
    self.df = df
  
  def __len__(self):
    return len(self.df)    
    
  def __getitem__(self, idx):

   return (self.data[idx], torch.tensor(self.df['primary_label'].iloc[idx]))


# Model

In [None]:
class AudioClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.input_fc = nn.Linear(input_dim, 50)
        self.hidden_fc = nn.Linear(50, 50)
        self.output_fc = nn.Linear(50, output_dim)

    def forward(self, x):

        # x = [batch size, height, width]
       # print(x.shape)
        batch_size = x.shape[0]


        x = x.view(batch_size, -1)
      #  print(x.shape)
        # x = [batch size, height * width]

        h_1 = F.relu(self.input_fc(x))

        # h_1 = [batch size, 250]

        h_2 = F.relu(self.hidden_fc(h_1))

        # h_2 = [batch size, 100]

        y_pred = self.output_fc(h_2)

        # y_pred = [batch size, output dim]

        return y_pred



# Training

In [None]:
# Create the model and put it on the GPU if available
myModel = AudioClassifier(768*6054, 39)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device


# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = torch.tensor(data[0]).to(device), torch.tensor(data[1]).to(device)
       # print(labels.shape)
        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs.float())
      #  print(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        #if i % 10 == 0:    # print every 10 mini-batches
        #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')

f = h5py.File('/content/gdrive/MyDrive/train_tensors.h5', 'r')
h5pyFile = f['data'][:1000]
NUM_EPOCHS = 10
#for epoch in range(NUM_EPOCHS):
for i in range(1,11):
  #print(f"Train DF: {i}")
  #df = pd.read_csv(f"/content/gdrive/MyDrive/df_train{i}.csv")
  #mrcg_tensor = torch.load(f"/content/gdrive/MyDrive/df_train_tensor{i}.pt")
  myds = H5DS(df, mrcg_tensor)

  # Random split of 80:20 between training and validation
  num_items = len(myds)
  num_train = round(num_items*(0.80))
  num_val = num_items - num_train
  train_ds, val_ds = random_split(myds, [num_train, num_val])

  # Create training and validation data loaders
  train_dl = torch.utils.data.DataLoader(train_ds, batch_size=5, shuffle=True)
  val_dl = torch.utils.data.DataLoader(val_ds, batch_size=5, shuffle=False)
    
  training(myModel, train_dl, 1)
  # Clearing RAM
  # del mrcg_tensor
  # gc.collect()
  # clearing GPU memory
  #torch.cuda.empty_cache() 


NameError: ignored

In [None]:


torch.cuda.empty_cache()

# Testing

In [None]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, val_dl):
  correct_prediction = 0
  total_prediction = 0

  # Disable gradient updates
  with torch.no_grad():
    for data in val_dl:
      # Get the input features and target labels, and put them on the GPU
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Get predictions
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]
    
  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Run inference on trained model with the validation set
inference(myModel, val_dl)

KeyboardInterrupt: ignored