<a href="https://colab.research.google.com/github/yakovsushenok/Thesis/blob/main/BirdSongIdentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile
import os
import pandas as pd
import math, random
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import init
from google.colab import drive
from sklearn.model_selection import StratifiedShuffleSplit
drive.mount('/content/gdrive')
import time
import matplotlib.pyplot as plt
import numpy as np
import numpy.matlib
try:
    from scipy.fftpack import fft, ifft
except ImportError:
    from numpy.fft import fft, ifft
from scipy.signal import lfilter
import scipy.io as sio
from scipy import signal
np.random.seed(0)

Mounted at /content/gdrive


# Data

In [None]:
# extracting the training data (audio files) from the zip file (12 minutes)
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile('/content/gdrive/MyDrive/train_short_audio.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/tmp')

In [None]:
# extracting the data which has audio files that are of similar length to the testing data
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile('/content/gdrive/MyDrive/train_soundscapes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/trainSoundscapes')

df = pd.read_csv("/content/gdrive/MyDrive/train_metadata.csv") # the metadata
# Build the path of each extracted clip: <extract dir>/<primary_label>/<filename>
df['relative_path'] = '/content/tmp/' + df['primary_label'] + '/' + df['filename'] 

### Subsetting the data so that we're left with species that have 300+ samples (counted before the rating filter) and with recordings rated above 3.5

In [None]:
# Per-row sample count of that row's label, computed on the frame *before* the rating filter.
samples_per_label = df['primary_label'].map(df['primary_label'].value_counts())
df = df[(samples_per_label > 299).values & (df['rating'] > 3.5)]
df = df[['relative_path', 'primary_label']]
# Encode labels as consecutive integers in order of first appearance.
unique_labels = df.primary_label.unique()
mapping = {label: code for code, label in enumerate(unique_labels)}
df['primary_label'] = df['primary_label'].map(mapping)

### We're going to partition our data into training/validation/test sets: 80% for training, 10% for validation and 10% for testing, stratified so that every set has the same species distribution.

In [None]:
# splitting into train, val+test
X, y = df['relative_path'], df['primary_label']
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, val_test = next(split1.split(X, y))

# splitting into val, test
# The second split runs on the 20% hold-out only, so the indices it returns are
# positions *within that hold-out frame*, not within `df`.
df_val_test = df.iloc[val_test, :]
X1, y1 = df_val_test['relative_path'], df_val_test['primary_label']
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
val_index, test_index = next(split2.split(X1, y1))

# subsetting the datasets
df_train = df.iloc[train_index, :]
# Index the hold-out frame (not `df`) with the second split's positions;
# indexing `df` here would pick unrelated rows, including training rows.
df_val = df_val_test.iloc[val_index, :]
df_test = df_val_test.iloc[test_index, :]

# Sanity checks: the three sets partition `df` without overlap.
assert len(df_train) + len(df_val) + len(df_test) == len(df)
assert df_train.index.intersection(df_val.index).empty
assert df_train.index.intersection(df_test.index).empty
assert df_val.index.intersection(df_test.index).empty

# Utility classes

In [None]:
### MRCG CODE
epsc = 0.000001

def mrcg_extract(sig, sampFreq = 32000, log_floor = 1e-6): # Sample frequency is always 32,000 in our case
    """Compute the multi-resolution cochleagram (MRCG) features of a mono signal.

    Code adapted from:
    https://github.com/MoongMoong/MRCG_python/blob/master/MRCG_python_master/mrcg/MRCG.py

    Args:
        sig: 1-D sequence of samples (NumPy array or CPU torch tensor).
        sampFreq: sampling rate in Hz.
        log_floor: lower bound applied to the cochleagram energies before
            ``log10``. Windows with zero energy (e.g. zero-padded audio)
            would otherwise produce ``-inf``, which turns into ``nan`` in the
            deltas and in any downstream mean/std normalisation.

    Returns:
        2-D float array. Rows are the four 64-channel cochleagrams stacked
        (256 rows), followed by their deltas and double deltas (768 rows in
        total); columns are 10 ms frames.
    """
    # Work on a flat float64 NumPy array so the RMS is computed vectorised
    # instead of element by element in Python.
    sig = np.asarray(sig, dtype=np.float64).reshape(-1)

    # Scale the signal to an RMS of 1000; leave an all-zero signal untouched
    # rather than dividing by zero.
    rms = np.sqrt(np.mean(np.square(sig)))
    beta = 1000 / rms if rms > 0 else 1.0
    sig = (sig * beta).reshape(len(sig), 1)

    g = gammatone(sig, 64, sampFreq)
    # 25 ms and 200 ms windows, both with a 10 ms hop.
    cochlea1 = np.log10(np.maximum(cochleagram(g, int(sampFreq * 0.025), int(sampFreq * 0.010)), log_floor))
    cochlea2 = np.log10(np.maximum(cochleagram(g, int(sampFreq * 0.200), int(sampFreq * 0.010)), log_floor))
    # Smoothed versions of the short-window cochleagram (11x11 and 23x23 mean filters).
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.concatenate([cochlea1, cochlea2, cochlea3, cochlea4], 0)
    del0 = deltas(all_cochleas)
    ddel = deltas(deltas(all_cochleas, 5), 5)

    return np.concatenate((all_cochleas, del0, ddel), 0)

def gammatone(insig, numChan=128, fs = 16000): 
    """Filter a signal through a bank of ``numChan`` 4th-order gammatone filters.

    Centre frequencies are spaced uniformly on the ERB scale between 50 Hz and
    8000 Hz (``fRange`` is fixed and does not depend on ``fs``). Each channel's
    impulse response is 2048 samples long and is scaled by a gain derived from
    ``loudness`` at that centre frequency.

    Args:
        insig: signal of ``len(insig)`` samples; it is reshaped to a column.
        numChan: number of filterbank channels.
        fs: sampling rate in Hz.

    Returns:
        Array of shape ``(numChan, len(insig))`` with one filtered signal per row.
    """
    fRange = [50, 8000]
    filterOrder = 4
    gL = 2048  # impulse-response length in samples
    sigLength = len(insig)
    phase = np.zeros([numChan, 1])  # every channel uses zero phase
    erb_b = hz2erb(fRange)

    
    # Uniform ERB-scale grid; `epsc` makes np.arange include the upper bound.
    erb_b_diff = (erb_b[1]-erb_b[0])/(numChan-1)
    erb = np.arange(erb_b[0], erb_b[1]+epsc, erb_b_diff)
    cf = erb2hz(erb)
    # Per-channel bandwidth term computed from the centre frequency.
    b = [1.019 * 24.7 * (4.37 * x / 1000 + 1) for x in cf]
    gt = np.zeros([numChan, gL])
    tmp_t = np.arange(1,gL+1)/fs  # sample times in seconds, starting at 1/fs
    for i in range(numChan):
        gain = 10**((loudness(cf[i])-60)/20)/3*(2 * np.pi * b[i] / fs)**4
        # Gammatone impulse response: t^(order-1) * exp(-2*pi*b*t) * cos(2*pi*cf*t + phase)
        tmp_temp = [gain*(fs**3)*x**(filterOrder - 1)*np.exp(-2 * np.pi * b[i] * x)*np.cos(2 * np.pi * cf[i] * x + phase[i]) for x in tmp_t]
        tmp_temp2 = np.reshape(tmp_temp, [1, gL])

        gt[i, :] = tmp_temp2

    # One copy of the signal per channel, filtered column-wise by fftfilt.
    sig = np.reshape(insig,[sigLength,1])
    gt2 = np.transpose(gt)
    resig = np.matlib.repmat(sig,1,numChan)
    r = np.transpose(fftfilt(gt2,resig,numChan))
    return r

def hz2erb(hz):
    """Convert frequencies in Hz to the ERB-rate scale: ``21.4 * log10(0.00437 * hz + 1)``.

    Accepts a scalar or any array-like and returns the NumPy result of the
    element-wise conversion.
    """
    scaled = np.multiply(0.00437, hz)
    return 21.4 * np.log10(scaled + 1)

def erb2hz(erb):
    """Convert an iterable of ERB-rate values back to Hz (inverse of ``hz2erb``).

    Returns a plain Python list with one frequency per input value.
    """
    hz = []
    for value in erb:
        hz.append((10**(value/21.4)-1)/(0.00437))
    return hz

def loudness(freq): 
    """Return the loudness-model value at ``freq`` for a fixed level of 60 dB.

    The coefficient tables ``af``, ``bf``, ``cf`` and their frequency grid
    ``ff`` are read from ``f_af_bf_cf.mat`` and linearly interpolated at
    ``freq``.

    The tables are loaded once and cached on the function object
    (``loudness._table``): this function is called once per filterbank
    channel, and re-reading the file from Drive on every call is needless I/O.

    Args:
        freq: frequency in Hz. Values above the last grid point raise
            ``IndexError`` (unchanged from the original behaviour).

    Returns:
        The interpolated loudness value as a float.
    """
    dB=60
    table = getattr(loudness, "_table", None)
    if table is None:
        fmat = sio.loadmat('/content/gdrive/MyDrive/f_af_bf_cf.mat')
        table = (fmat['af'][0], fmat['bf'][0], fmat['cf'][0], fmat['ff'][0])
        loudness._table = table
    af, bf, cf, ff = table

    # Find the first grid point that is >= freq.
    i = 0
    while ff[i] < freq:
        i = i + 1
    # With i == 0 the `i - 1` below would wrap around to the last table entry;
    # clamp so the first segment is used instead (identical result at freq == ff[0]).
    i = max(i, 1)

    # Linear interpolation of each coefficient between grid points i-1 and i.
    frac = (freq - ff[i - 1]) / (ff[i] - ff[i - 1])
    afy = af[i - 1] + frac * (af[i] - af[i - 1])
    bfy = bf[i - 1] + frac * (bf[i] - bf[i - 1])
    cfy = cf[i - 1] + frac * (cf[i] - cf[i - 1])
    loud = 4.2 + afy * (dB - cfy) / (1 + bfy * (dB - cfy))
    return loud

def fftfilt(b,x,nfft): 
    """Filter each column of ``x`` with the matching column of ``b`` using block-wise FFTs.

    The signal is processed in blocks of ``L`` rows: each block is transformed
    with an FFT of size ``nfft``, multiplied by the transformed filter, inverse
    transformed, and the real part is added into the output at the block's offset.

    Args:
        b: 2-D array of filter coefficients, one filter per column.
        x: 2-D array of signals, one signal per column.
        nfft: ignored -- it is overwritten below by the FFT size chosen from the cost table.

    Returns:
        Real array with the same shape as ``x``.
    """
    # Per-FFT cost figures; multiplied by the number of blocks below to choose
    # the cheapest power-of-two FFT size.
    fftflops = [18, 59, 138, 303, 660, 1441, 3150, 6875, 14952, 32373, 69762,
                149647, 319644, 680105, 1441974, 3047619, 6422736, 13500637, 28311786,
                59244791, 59244791*2.09]
    nb, _ = np.shape(b)
    nx, mx = np.shape(x)
    # Smallest exponent whose power of two is at least nb-1.
    n_min = 0
    while 2**n_min < nb-1:
        n_min = n_min+1
    # Candidate FFT sizes 2**n_min .. 2**21 and the block length each one allows.
    n_temp = np.arange(n_min, 21 + epsc, 1)
    n = np.power(2,n_temp)
    fftflops = fftflops[n_min-1:21]
    L = np.subtract(n,nb-1)
    lenL= np.size(L)
    # Total cost = number of blocks * cost per FFT; pick the minimum.
    temp_ind0 = np.ceil(np.divide(nx,L))
    temp_ind = np.multiply(temp_ind0,fftflops)
    temp_ind = np.array(temp_ind)
    ind = np.argmin(temp_ind)
    nfft=int(n[ind])
    L=int(L[ind])
    # Transform the filters once; B has one column per filter.
    b_tr = np.transpose(b)
    B_tr = fft(b_tr,nfft)
    B = np.transpose(B_tr)
    y = np.zeros([nx, mx])
    istart = 0
    while istart < nx :
        iend = min(istart+L,nx)
        if (iend - istart) == 1 :
            # NOTE(review): this single-sample branch builds an (nx, mx) constant from
            # x[0][0] rather than from the current block -- confirm this is intended.
            X = x[0][0]*np.ones([nx,mx])
        else :
            xtr = np.transpose(x[istart:iend][:])
            Xtr = fft(xtr,nfft)
            X = np.transpose(Xtr)
        temp_Y = np.transpose(np.multiply(B,X))
        Ytr = ifft(temp_Y,nfft)
        Y = np.transpose(Ytr)
        # Add the block's result into the output, truncating at the end of the signal.
        yend = np.min([nx, istart + nfft])
        y[istart:yend][:] = y[istart:yend][:] + np.real(Y[0:yend-istart][:])

        istart = istart + L
    
    return y

def cochleagram(r, winLength = 320, winShift=160):
    """Sum the squared filterbank output over sliding windows.

    Args:
        r: array of shape (channels, samples).
        winLength: window length in samples.
        winShift: hop between consecutive windows in samples.

    Returns:
        Array of shape (channels, floor(samples / winShift)) holding the
        windowed energy of every channel. The signal is left-padded with
        ``winLength - winShift`` zeros before windowing.
    """
    numChan, sigLength = np.shape(r)
    n_frames = int(np.floor(sigLength / winShift))
    energy = np.zeros([numChan, n_frames])
    padded = np.concatenate((np.zeros([numChan, winLength - winShift]), np.square(r)), 1)
    for frame in range(n_frames):
        start = frame * winShift
        energy[:, frame] = padded[:, start:start + winLength].sum(axis=1)
    return energy

def get_avg( m , v_span, h_span):
    """Smooth a 2-D array with a rectangular mean filter.

    Args:
        m: 2-D array to smooth.
        v_span: number of rows averaged above and below each cell.
        h_span: number of columns averaged left and right of each cell.

    Returns:
        Array of the same shape as ``m``. Cells outside ``m`` count as zero,
        so values near the border are attenuated.
    """
    fil_size = (2 * v_span + 1) * (2 * h_span + 1)
    # The kernel must be (2*v_span+1) x (2*h_span+1) to match `fil_size`; the
    # earlier version used h_span for both axes, which is only correct when
    # v_span == h_span.
    meanfil = np.ones([1 + 2 * v_span, 1 + 2 * h_span])
    meanfil = np.divide(meanfil, fil_size)

    out = signal.convolve2d(m, meanfil, boundary='fill', fillvalue=0, mode='same')
    return out

def deltas(x, w=9) :
    """Compute delta features along the column axis of a 2-D array.

    Each row is padded on both sides by repeating its first/last value
    ``floor(w / 2)`` times and then filtered with the ramp
    ``[h, h-1, ..., -h]`` where ``h = floor(w / 2)``.

    Args:
        x: 2-D array of shape (rows, columns).
        w: window length; the effective length is ``2 * floor(w / 2) + 1``.

    Returns:
        Array with the same shape as ``x``; ``x`` itself when it has no columns.
    """
    _, n_cols = np.shape(x)
    if n_cols == 0:
        return x

    half = int(np.floor(w / 2))
    ramp = np.arange(half, -(half + 1), -1)
    # Edge padding: replicate the first and last column `half` times each.
    left_pad = np.repeat(x[:, :1], half, axis=1)
    right_pad = np.repeat(x[:, n_cols - 1:n_cols], half, axis=1)
    padded = np.concatenate((left_pad, x, right_pad), 1)
    filtered = lfilter(ramp, 1, padded, 1)
    # Drop the filter's warm-up samples so the output lines up with the input.
    return filtered[:, 2 * half:n_cols + 2 * half]

Now that I have the MRCG code, I want to build a dataset that stores the precomputed MRCG features, so that I don't have to recompute them on every run of the model.

In [None]:
def get_mrcg_from_file(file):
  """Load an audio file and return the MRCG of its first channel at a fixed length.

  The first channel is truncated to ``mid`` samples when it is at least that
  long, otherwise it is zero-padded at the end up to ``mid`` samples.
  """
  mid = 1937318 # zero padding so that the length is equal to the length of the longest waveform 
  waveform, sr = torchaudio.load(file)
  channel = waveform[0]
  if len(channel) >= mid:
    return mrcg_extract(channel[:mid])
  padded = torch.zeros(mid)
  padded[:len(channel)] = channel
  return mrcg_extract(padded)

In [None]:
# Chunk boundaries (row offsets into df_train) in steps of 100.
df_list = list(range(0, 8600, 100))

In [None]:
# Replace the last boundary with the real number of training rows so the final
# chunk ends exactly at the end of df_train.
df_list[-1] = len(df_train)
df_list

In [None]:
# Precompute the MRCG features of df_train in chunks and store each chunk on Drive
# as a CSV (paths + labels) and a tensor file (one MRCG matrix per row).
# Resume log from earlier runs (the start index below is where the last run stopped):
#   [2706, 0950, stopped at 22] <-- missed 22 (i=21+1)... need to do it.
#   [2806, 0906, stopped at i=43+1]
for i in range(24,len(df_list)-1):
  print(f"currently on df {i+1}")
  # Use a dedicated name so the metadata frame `df` is not overwritten.
  df_chunk = df_train[df_list[i]:df_list[i+1]]
  df_chunk.to_csv(f'/content/gdrive/MyDrive/df_train{i+1}.csv')
  # One MRCG per file of this chunk. The earlier version indexed the chunk with the
  # outer counter `i` inside the inner loop, so every row after the first was the
  # same file; iterating over the paths directly avoids that.
  x = torch.stack(
      [torch.tensor(get_mrcg_from_file(path)) for path in df_chunk['relative_path']],
      0)
  torch.save(x, f'/content/gdrive/MyDrive/df_train_tensor{i+1}.pt')

  

#df_train['mrcg'] = df_train['relative_path'].apply(get_mrcg_from_file) # 30 seconds per sample --> 70 hours for whole df_train which is 8.5k samples. About 7 hours per df_train{i} if split into 10


currently on df 25


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


currently on df 26
currently on df 27
currently on df 28
currently on df 29
currently on df 30
currently on df 31
currently on df 32
currently on df 33
currently on df 34
currently on df 35
currently on df 36
currently on df 37
currently on df 38
currently on df 39
currently on df 40
currently on df 41
currently on df 42
currently on df 43
currently on df 44


In [None]:
# NOTE(review): the recorded output of this cell is an EOFError -- presumably the
# file on Drive is empty or was only partially written; confirm before relying on it.
d = torch.load("/content/gdrive/MyDrive/df_train_tensor1.pt")

EOFError: ignored

In [None]:
# NOTE(review): df_train_toy is not created by any cell visible in this notebook, so
# this line depends on leftover kernel state and fails on a fresh "Restart & Run All".
df_train_toy1 = df_train_toy.copy()

In [None]:
df_train_toy = df_train_toy1

In [None]:
df_train_toy.iloc[0,-1].shape

(768, 6054)

In [None]:
df_train_toy['mrcg']

1466    [[-1.8068877959212895, -0.275449599027982, 0.9...
1467    [[-2.396268977063757, 0.18771316777299862, 0.8...
Name: mrcg, dtype: object

In [None]:
# Sanity check: concatenating two [1, 2, 5] tensors along dim 0 gives a [2, 2, 5] tensor.
x = torch.tensor([[[0, 1, 2, 3, 4],
                   [2, 3, 4, 5, 6]]])
x1 = torch.cat([x, x], dim=0)
print(x1)
print(x1.shape)

tensor([[[0, 1, 2, 3, 4],
         [2, 3, 4, 5, 6]],

        [[0, 1, 2, 3, 4],
         [2, 3, 4, 5, 6]]])
torch.Size([2, 2, 5])


In [None]:
# Stack the MRCG matrix of every row (last column of df_train_toy) into a single
# tensor with one sample per entry along dim 0.
# The earlier version also concatenated a leftover `x1` from another cell before the
# loop, which made the result depend on the order in which cells had been run.
x = torch.stack(
    [torch.tensor(df_train_toy.iloc[i, -1]) for i in range(len(df_train_toy))],
    0)


In [None]:
torch.save(x, '/content/gdrive/MyDrive/df_train_toy_tensor.pt')

In [None]:
x.shape

torch.Size([5, 768, 6054])

In [None]:
xx = torch.load('/content/gdrive/MyDrive/df_train_toy_tensor.pt')
xx.shape

torch.Size([5, 768, 6054])

In [None]:
# Convert each MRCG matrix to a list of row arrays and write the frame to CSV.
# NOTE(review): the CSV read back later in this notebook contains "..." inside the
# mrcg strings, i.e. the arrays were written as truncated text and cannot be
# recovered from this file; the torch.save() route keeps the full data.
df_train_toy['mrcg'] = df_train_toy['mrcg'].map(list)
df_train_toy.to_csv('/content/gdrive/MyDrive/df_train_toy.csv', encoding='utf8', index=False)  
type(df_train_toy.iloc[0,-1])

list

In [None]:
import ast
def from_np_array(array_string):
    """Parse the whitespace-separated text form of an array (e.g. ``"[1 2 3]"``).

    A space directly after an opening bracket is removed, the remaining
    whitespace-separated tokens are joined with commas, and the result is
    evaluated as a Python literal and wrapped in ``np.array``.
    """
    tokens = array_string.replace('[ ', '[').split()
    literal = ','.join(tokens)
    return np.array(ast.literal_eval(literal))

In [None]:
# Read the toy CSV back in. The 'mrcg' column comes back as plain strings.
df_train_toy_read = pd.read_csv('/content/gdrive/MyDrive/df_train_toy.csv')
#df_train_toy_read['mrcg'] = pd.eval(df_train_toy_read['mrcg'])
#df_train_toy_read['mrcg'] = df_train_toy_read['mrcg'].map(list)
#pd.eval("mrcg = np.array(df_train_toy_read.mrcg)", target = df_train_toy_read)
# NOTE: list() on a string splits it into single characters (see the output of the
# next cell), so this does not rebuild the numeric arrays.
df_train_toy_read['mrcg'] = df_train_toy_read['mrcg'].map(list)
type(df_train_toy_read.iloc[0,-1])

list

In [None]:
df_train_toy_read.iloc[0,-1]

['[',
 'a',
 'r',
 'r',
 'a',
 'y',
 '(',
 '[',
 '-',
 '1',
 '.',
 '8',
 '0',
 '6',
 '8',
 '8',
 '7',
 '8',
 ',',
 ' ',
 '-',
 '0',
 '.',
 '2',
 '7',
 '5',
 '4',
 '4',
 '9',
 '6',
 ',',
 ' ',
 ' ',
 '0',
 '.',
 '9',
 '5',
 '3',
 '7',
 '8',
 '2',
 '1',
 ',',
 ' ',
 '.',
 '.',
 '.',
 ',',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '-',
 'i',
 'n',
 'f',
 ',',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '-',
 'i',
 'n',
 'f',
 ',',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '-',
 'i',
 'n',
 'f',
 ']',
 ')',
 ',',
 ' ',
 'a',
 'r',
 'r',
 'a',
 'y',
 '(',
 '[',
 '-',
 '0',
 '.',
 '7',
 '6',
 '7',
 '2',
 '1',
 '7',
 '6',
 '8',
 ',',
 ' ',
 ' ',
 '0',
 '.',
 '5',
 '1',
 '3',
 '2',
 '7',
 '2',
 '2',
 '8',
 ',',
 ' ',
 ' ',
 '1',
 '.',
 '6',
 '7',
 '9',
 '2',
 '0',
 '4',
 '7',
 '7',
 ',',
 ' ',
 '.',
 '.',
 '.',
 ',',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '-',
 'i',
 'n',
 'f',
 ',',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '

In [None]:
class AudioUtil():
  """
  Collection of helper functions for working with the audio data.
  """
  @staticmethod
  def open(audio_file):
    """
    Load an audio file with torchaudio and return a ``(waveform, sample_rate)`` tuple.

    - waveform (Tensor): the decoded samples of the recording.
    - sample_rate (int): how many samples per second the recording contains
      (e.g. audio CDs use 44,100 samples per second).

    See https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html#loading-audio-data-into-tensor
    """
    samples, rate = torchaudio.load(audio_file)
    return (samples, rate)
  
  # staticmethod
  # def rechannel(aud, new_channel):
  #   sig, sr = aud

  #   if (sig.shape[0] == new_channel):
  #     # Nothing to do
  #     return aud

  #   if (new_channel == 1):
  #     # Convert from stereo to mono by selecting only the first channel
  #     resig = sig[:1, :]
  #   else:
  #     # Convert from mono to stereo by duplicating the first channel
  #     resig = torch.cat([sig, sig])

  #   return ((resig, sr))

  # staticmethod
  # def resample(aud, newsr):
  #   """
  #   This method will make sure that my sampling rate is equal across all audio signals.
  #   I don't think I need this method because I think I have the same sampling rate across all audio signals. 
  #   """
  #   waveform, samplerate = aud # 

  #   if (samplerate == newsr):
  #     # Nothing to do
  #     return aud

  #   num_channels = waveform.shape[0]
  #   # Resample first channel
  #   resig = torchaudio.transforms.Resample(samplerate, newsr)(waveform[:1,:])

  #   ### Don't think I need this because I think I have mono channels across all audio signals
  #   # if (num_channels > 1):
  #   #   # Resample the second channel and merge both channels
  #   #   retwo = torchaudio.transforms.Resample(samplerate, newsr)(waveform[1:,:])
  #   #   resig = torch.cat([resig, retwo])

  #   return ((resig, newsr))

    # @ staticmethod
    # def get_mrcg_from_file(file):
    #   waveform, sr = torchaudio.load(file)
    #   MAX = 75348463 # zero padding so that the length is equal to the length of the longest waveform 
    #   waveform, sr = torchaudio.load(file)
    #   target = torch.zeros(MAX)
    #   source = waveform[0]
    #   target[:len(source)] = source
    #   return target

  # @ staticmethod
  # def pad_trunc(aud, max_ms):
  #   sig, sr = aud
  #   num_rows, sig_len = sig.shape
  #   max_len = sr//1000 * max_ms

  #   if (sig_len > max_len):
  #     # Truncate the signal to the given length
  #     sig = sig[:,:max_len]

  #   elif (sig_len < max_len):
  #     # Length of padding to add at the beginning and end of the signal
  #     pad_begin_len = random.randint(0, max_len - sig_len)
  #     pad_end_len = max_len - sig_len - pad_begin_len

  #     # Pad with 0s
  #     pad_begin = torch.zeros((num_rows, pad_begin_len))
  #     pad_end = torch.zeros((num_rows, pad_end_len))

  #     sig = torch.cat((pad_begin, sig, pad_end), 1)
      
  #   return (sig, sr)



  # @ staticmethod
  # def time_shift(aud, shift_limit):
  #   sig,sr = aud
  #   _, sig_len = sig.shape
  #   shift_amt = int(random.random() * shift_limit * sig_len)
  #   return (sig.roll(shift_amt), sr)
  
  # @ staticmethod
  # def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
  #   sig,sr = aud
  #   top_db = 80

  #   # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
  #   spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

  #   # Convert to decibels
  #   spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
  #   return (spec)

  # @ staticmethod
  # def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
  #   _, n_mels, n_steps = spec.shape
  #   mask_value = spec.mean()
  #   aug_spec = spec

  #   freq_mask_param = max_mask_pct * n_mels
  #   for _ in range(n_freq_masks):
  #     aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

  #   time_mask_param = max_mask_pct * n_steps
  #   for _ in range(n_time_masks):
  #     aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

  #   return aug_spec

In [None]:
len(df_train_toy['primary_label'].value_counts())

12

# Data Loader

In [None]:

class SoundDS(Dataset):
  """Dataset over a DataFrame holding precomputed MRCG features.

  The frame must provide a 'mrcg' column (one NumPy array per row) and a
  'primary_label' column. ``data_path`` is accepted for call compatibility
  but is not used: features are read from the frame, not from audio files.
  """

  def __init__(self, df, data_path):
    self.df = df

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self, idx):
    """Return ``(features, label)`` tensors for the row at position ``idx``."""
    features = torch.from_numpy(self.df['mrcg'].iloc[idx])
    label = torch.tensor(self.df['primary_label'].iloc[idx])
    return (features, label)

In [None]:
myds = SoundDS(df_train_toy, "")

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)  # previously round(num_items): validation got 0 items
num_val = num_items - num_train
# Seeded generator so the split is the same on every run.
train_ds, val_ds = random_split(myds, [num_train, num_val],
                                generator=torch.Generator().manual_seed(0))

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True)
# The validation loader must wrap val_ds (it previously wrapped train_ds).
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False)

# Model

In [None]:


# ----------------------------
# Audio Classification Model
# ----------------------------
class AudioClassifier (nn.Module):
    """Convolutional audio classifier: four conv blocks, global average pooling, linear head.

    Expects input of shape [batch, 1, height, width] (the first convolution
    takes a single input channel) and returns logits of shape
    [batch, num_classes].

    NOTE: a second class with the same name is defined right after this one
    in the notebook and replaces it once the cell has run.

    Args:
        num_classes: number of output classes (default 397, the value that
            was previously hard-coded).
    """

    def __init__(self, num_classes=397):
        super().__init__()

        # Four Conv -> ReLU -> BatchNorm blocks; each one halves the spatial size (stride 2).
        self.conv1, self.relu1, self.bn1 = self._conv_block(1, 8, kernel=5, pad=2)
        self.conv2, self.relu2, self.bn2 = self._conv_block(8, 16, kernel=3, pad=1)
        self.conv3, self.relu3, self.bn3 = self._conv_block(16, 32, kernel=3, pad=1)
        self.conv4, self.relu4, self.bn4 = self._conv_block(32, 64, kernel=3, pad=1)

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=num_classes)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(
            self.conv1, self.relu1, self.bn1,
            self.conv2, self.relu2, self.bn2,
            self.conv3, self.relu3, self.bn3,
            self.conv4, self.relu4, self.bn4,
        )

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel, pad):
        """Build one stride-2 Conv2d/ReLU/BatchNorm2d triple with Kaiming-initialised weights."""
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel, kernel),
                         stride=(2, 2), padding=(pad, pad))
        relu = nn.ReLU()
        bn = nn.BatchNorm2d(out_channels)
        init.kaiming_normal_(conv.weight, a=0.1)
        conv.bias.data.zero_()
        return conv, relu, bn

    def forward(self, x):
        """Run the conv blocks, pool to one value per channel, and apply the linear head."""
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        return self.lin(x)

class AudioClassifier(nn.Module):
    """Fully connected classifier with two hidden layers of 50 units each.

    Args:
        input_dim: number of features per sample after flattening.
        output_dim: number of output classes.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.input_fc = nn.Linear(input_dim, 50)
        self.hidden_fc = nn.Linear(50, 50)
        self.output_fc = nn.Linear(50, output_dim)

    def forward(self, x):
        """Flatten each sample and return logits of shape [batch size, output dim]."""
        # [batch size, ...] -> [batch size, features]
        flat = x.view(x.shape[0], -1)
        # Both hidden activations are [batch size, 50].
        hidden = F.relu(self.input_fc(flat))
        hidden = F.relu(self.hidden_fc(hidden))
        return self.output_fc(hidden)



device(type='cuda', index=0)

# Training

In [None]:
# Create the model and put it on the GPU if available
myModel = AudioClassifier(768*6054, 17)
device = torch.device( "cpu") #"cuda:0" if torch.cuda.is_available() else
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device


# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, num_epochs):
  """Train ``model`` with Adam and a one-cycle learning-rate schedule.

  Every batch is normalised with its own mean and standard deviation before
  the forward pass. The average loss and the accuracy over the training
  batches are printed at the end of each epoch.

  NOTE: non-finite input values (for example ``-inf`` features) make the
  batch mean/std, and therefore the loss, ``nan``.

  Args:
      model: the network to train; batches are moved to the device its
          parameters live on.
      train_dl: sized iterable of batches whose first two items are the
          inputs and the integer class labels.
      num_epochs: number of passes over ``train_dl``.
  """
  # Use the model's own device instead of relying on a notebook-level global.
  device = next(model.parameters()).device

  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Make sure layers such as batch norm run in training mode.
  model.train()

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for data in train_dl:
        # as_tensor passes tensors through unchanged (torch.tensor(tensor) copies and warns)
        inputs = torch.as_tensor(data[0]).to(device)
        labels = torch.as_tensor(data[1]).to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')
  
num_epochs=2   # Just for demo, adjust this higher.
training(myModel, train_dl, num_epochs)





torch.Size([8, 768, 6054])
torch.Size([8, 4649472])
torch.Size([8, 768, 6054])
torch.Size([8, 4649472])
Epoch: 0, Loss: nan, Accuracy: 0.00
torch.Size([8, 768, 6054])
torch.Size([8, 4649472])
torch.Size([8, 768, 6054])
torch.Size([8, 4649472])
Epoch: 1, Loss: nan, Accuracy: 0.00
Finished Training


In [None]:
# Overwrite the labels with 1..len(df_train_toy), i.e. a distinct label per row.
# NOTE(review): the recorded output shows a SettingWithCopyWarning here, so
# df_train_toy is presumably a slice of another frame -- confirm whether it should
# be an explicit copy.
df_train_toy['primary_label'] = range(1,len(df_train_toy)+1)
df_train_toy['primary_label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


42395     1
19597     2
32862     3
17812     4
17967     5
42300     6
35875     7
21558     8
43441     9
36320    10
5684     11
18159    12
46100    13
23432    14
43268    15
58599    16
Name: primary_label, dtype: int64

# Testing

In [None]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, val_dl):
  """Evaluate ``model`` on ``val_dl`` and print the accuracy.

  Batches are normalised and cast to float32 the same way as in the training
  loop. The model is switched to eval mode for the evaluation and restored
  to its previous mode afterwards.

  Args:
      model: the trained network; batches are moved to the device its
          parameters live on.
      val_dl: iterable of batches whose first two items are the input and
          label tensors.
  """
  correct_prediction = 0
  total_prediction = 0

  # Use the model's own device instead of relying on a notebook-level global.
  device = next(model.parameters()).device
  was_training = model.training
  model.eval()

  try:
    # Disable gradient updates
    with torch.no_grad():
      for data in val_dl:
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Cast to float32 like the training loop does; float64 features would
        # otherwise not match the dtype of the model weights.
        outputs = model(inputs.float())

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]
  finally:
    model.train(was_training)

  # An empty loader gives no predictions; report nan instead of dividing by zero.
  acc = correct_prediction/total_prediction if total_prediction else float('nan')
  print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Run inference on trained model with the validation set
inference(myModel, val_dl)

KeyboardInterrupt: ignored