<a href="https://colab.research.google.com/github/yakovsushenok/Thesis/blob/main/PreProcessing1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile
import os
import pandas as pd
import math, random
import torch
import torchaudio
from torchaudio import transforms
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import init
from google.colab import drive
from sklearn.model_selection import StratifiedShuffleSplit
drive.mount('/content/gdrive')
import time
import matplotlib.pyplot as plt
import numpy as np
import numpy.matlib
try:
    from scipy.fftpack import fft, ifft
except ImportError:
    from numpy.fft import fft, ifft
from scipy.signal import lfilter
import scipy.io as sio
from scipy import signal
import gc
import h5py
# Seed every random number generator the notebook draws from. Without the
# torch seed, weight initialisation, DataLoader shuffling and random_split
# differ between runs even though `random` and NumPy are pinned.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

Mounted at /content/gdrive


In [None]:
# Read the metadata table, subtract 1 from every primary label (presumably
# turning 1-based class ids into 0-based ones -- confirm against the CSV) and
# keep only the two columns used downstream.
df_500 = (
    pd.read_csv("/content/gdrive/MyDrive/train_metadata_more_than_500.csv")
    .assign(primary_label=lambda frame: frame['primary_label'] - 1)
    [['relative_path', 'primary_label']]
)

In [2]:
### MRCG CODE
# Tiny positive tolerance (one millionth) available to the MRCG routines below.
epsc = 1e-6

def mrcg_extract(sig, sampFreq = 32000): # Sample frequency is always 32,000 in our case
    """Compute the MRCG (multi-resolution cochleagram) feature matrix of a signal.

    The signal is scaled to an RMS of 1000, passed through a 64-channel
    gammatone filterbank and turned into four log-energy cochleagrams
    (25 ms frames, 200 ms frames, and two smoothed copies of the 25 ms one).
    These are stacked and concatenated with their delta and double-delta
    features, giving 64 * 4 * 3 = 768 rows.

    Args:
        sig: 1-D sequence of samples (NumPy array, CPU torch tensor or list).
        sampFreq: sampling rate of `sig` in Hz.

    Returns:
        2-D float array with 768 rows and one column per 10 ms hop.

    Raises:
        ValueError: if `sig` is empty.
    """
    # Code From: https://github.com/MoongMoong/MRCG_python/blob/master/MRCG_python_master/mrcg/MRCG.py

    # Work in float64 NumPy so the RMS is one vectorised reduction instead of
    # a Python-level loop over every sample.
    samples = np.asarray(sig, dtype=np.float64).reshape(-1)
    if samples.size == 0:
        raise ValueError("mrcg_extract needs a non-empty signal")

    rms = np.sqrt(np.mean(np.square(samples)))
    # A silent clip has rms == 0; dividing by it would turn the whole feature
    # matrix into NaN, so leave such a signal unscaled.
    beta = 1000 / rms if rms > 0 else 1.0
    samples = (samples * beta).reshape(-1, 1)

    g = gammatone(samples, 64, sampFreq)
    hop = int(sampFreq * 0.010)
    cochlea1 = np.log10(cochleagram(g, int(sampFreq * 0.025), hop) + 0.0000005)
    cochlea2 = np.log10(cochleagram(g, int(sampFreq * 0.200), hop) + 0.0000005)
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.concatenate([cochlea1, cochlea2, cochlea3, cochlea4], 0)
    del0 = deltas(all_cochleas)
    ddel = deltas(deltas(all_cochleas, 5), 5)

    output = np.concatenate((all_cochleas, del0, ddel), 0)

    return output

def gammatone(insig, numChan=128, fs = 16000): # 
    """Filter a signal with a bank of 4th-order gammatone filters.

    Centre frequencies are spaced uniformly on the ERB-rate scale between the
    two entries of `fRange`. Each channel's 2048-tap impulse response is
    gain * fs^3 * t^(order-1) * exp(-2*pi*b*t) * cos(2*pi*cf*t + phase),
    and the signal is convolved with every response via `fftfilt`.

    Args:
        insig: signal samples; anything `np.asarray` can reshape to (len, 1).
        numChan: number of filterbank channels.
        fs: sampling rate in Hz.

    Returns:
        Array of shape (numChan, len(insig)) with one filtered signal per row.
    """
    # NOTE(review): the upper bound (20 kHz) is above fs / 2 for both the
    # default fs and the 32 kHz used by mrcg_extract -- confirm this is intended.
    fRange = [1000, 20000] # try from 1000 to 20000 (was [50, 8000])
    filterOrder = 4
    gL = 2048
    sigLength = len(insig)
    phase = np.zeros([numChan, 1])
    erb_b = hz2erb(fRange)

    # linspace returns exactly numChan points including both end points; the
    # float-step arange it replaces divided by zero for numChan == 1.
    erb = np.linspace(erb_b[0], erb_b[1], numChan)
    cf = erb2hz(erb)
    b = [1.019 * 24.7 * (4.37 * x / 1000 + 1) for x in cf]
    gt = np.zeros([numChan, gL])
    tmp_t = np.arange(1, gL + 1) / fs
    for i in range(numChan):
        gain = 10**((loudness(cf[i])-60)/20)/3*(2 * np.pi * b[i] / fs)**4
        # Evaluate the whole impulse response in one vectorised expression.
        gt[i, :] = (gain * (fs**3) * tmp_t**(filterOrder - 1)
                    * np.exp(-2 * np.pi * b[i] * tmp_t)
                    * np.cos(2 * np.pi * cf[i] * tmp_t + phase[i]))

    sig = np.asarray(insig).reshape(sigLength, 1)
    gt2 = np.transpose(gt)
    # One copy of the signal per channel (np.tile replaces the deprecated
    # np.matlib.repmat).
    resig = np.tile(sig, (1, numChan))
    r = np.transpose(fftfilt(gt2, resig, numChan))
    return r

def hz2erb(hz):
    """Convert frequency in Hz to the ERB-rate scale: 21.4 * log10(0.00437 * hz + 1).

    Accepts a scalar or any array-like; the result follows NumPy broadcasting.
    """
    scaled = np.multiply(0.00437, hz)
    return 21.4 * np.log10(scaled + 1)

def erb2hz(erb):
    """Convert an iterable of ERB-rate values to Hz; inverse of `hz2erb`.

    Returns a plain Python list with one frequency per input value.
    """
    frequencies = []
    for value in erb:
        frequencies.append((10 ** (value / 21.4) - 1) / 0.00437)
    return frequencies

# Coefficient tables keyed by file path, so the .mat file is read once
# instead of on every call (gammatone calls loudness once per channel).
_LOUDNESS_TABLES = {}


def _load_loudness_table(path='/content/gdrive/MyDrive/f_af_bf_cf.mat'):
    """Return the (ff, af, bf, cf) rows of the .mat table, loading it on first use."""
    if path not in _LOUDNESS_TABLES:
        fmat = sio.loadmat(path)
        _LOUDNESS_TABLES[path] = tuple(fmat[key][0] for key in ('ff', 'af', 'bf', 'cf'))
    return _LOUDNESS_TABLES[path]


def loudness(freq):
    """Evaluate the tabulated loudness formula at `freq` for a fixed level of 60 dB.

    The af/bf/cf coefficients are linearly interpolated between the two table
    frequencies that bracket `freq`; outside the table the nearest segment is
    extrapolated. The result is
    4.2 + af * (dB - cf) / (1 + bf * (dB - cf)).

    Args:
        freq: frequency in the same unit as the table's `ff` row
            (presumably Hz -- confirm against the .mat file).

    Returns:
        The interpolated value as a float.
    """
    dB = 60
    ff, af, bf, cf = _load_loudness_table()

    # First table index whose frequency is >= freq, capped at the last row.
    i = 0
    while ff[i] < freq and i < len(ff) - 1:
        i = i + 1
    # For freq <= ff[0] the loop leaves i == 0, and `i - 1` would wrap around
    # to the LAST table row; use the first segment instead.
    i = max(i, 1)

    frac = (freq - ff[i - 1]) / (ff[i] - ff[i - 1])
    afy = af[i - 1] + frac * (af[i] - af[i - 1])
    bfy = bf[i - 1] + frac * (bf[i] - bf[i - 1])
    cfy = cf[i - 1] + frac * (cf[i] - cf[i - 1])
    loud = 4.2 + afy * (dB - cfy) / (1 + bfy * (dB - cfy))
    return loud

def fftfilt(b,x,nfft):
    """Column-wise FIR filtering by overlap-add FFT convolution.

    Column c of `x` is convolved with column c of `b`, and the result is
    truncated to the length of `x`.

    Args:
        b: 2-D array (nb, mx) with one filter per column.
        x: 2-D array (nx, mx) with one signal per column.
        nfft: ignored. The FFT length is always chosen internally from the
            cost table below; the parameter is kept for call compatibility.

    Returns:
        Real float array of shape (nx, mx).

    Raises:
        ValueError: if the filter is too long for the largest supported FFT.
    """
    # Estimated cost of an FFT of length 2**k, stored at index k - 1.
    fftflops = [18, 59, 138, 303, 660, 1441, 3150, 6875, 14952, 32373, 69762,
                149647, 319644, 680105, 1441974, 3047619, 6422736, 13500637, 28311786,
                59244791, 59244791*2.09]
    x = np.asarray(x)
    nb, _ = np.shape(b)
    nx, mx = np.shape(x)

    # Smallest usable exponent. Starting at 1 keeps the slice into fftflops
    # aligned with the candidate lengths even for filters of one or two taps.
    n_min = 1
    while 2**n_min < nb-1:
        n_min = n_min+1
    if n_min > 21:
        raise ValueError("filter is too long for the supported FFT sizes")
    n = np.power(2, np.arange(n_min, 22))
    flops = np.asarray(fftflops[n_min-1:21], dtype=float)
    L = n - (nb - 1)  # new input samples consumed per block for each length
    # L == 0 gives an infinite block count; that candidate simply never wins.
    with np.errstate(divide='ignore', invalid='ignore'):
        cost = np.ceil(np.divide(nx, L)) * flops
    ind = np.argmin(cost)
    nfft = int(n[ind])
    L = int(L[ind])

    B = np.transpose(fft(np.transpose(b), nfft))
    y = np.zeros([nx, mx])
    istart = 0
    while istart < nx:
        iend = min(istart + L, nx)
        # fft runs along the last axis, so a block holding a single sample
        # needs no special case: it is zero-padded to nfft like any other.
        X = np.transpose(fft(np.transpose(x[istart:iend]), nfft))
        Y = np.transpose(ifft(np.transpose(np.multiply(B, X)), nfft))
        yend = min(nx, istart + nfft)
        # Overlap-add: the tail of this block spills into the next ones.
        y[istart:yend] += np.real(Y[0:yend - istart])

        istart = istart + L

    return y

def cochleagram(r, winLength = 320, winShift=160):
    """Frame-wise energy of every channel of a filterbank output.

    `r` is (channels, samples). The squared signal is left-padded with
    winLength - winShift zeros, and frame k is the sum of the winLength
    samples starting at k * winShift. There are floor(samples / winShift)
    frames.

    Returns:
        Float array of shape (channels, frames).
    """
    channels, n_samples = np.shape(r)
    n_frames = int(np.floor(n_samples / winShift))
    lead_in = np.zeros([channels, winLength - winShift])
    padded = np.concatenate((lead_in, np.square(r)), 1)

    a = np.zeros([channels, n_frames])
    for frame, start in enumerate(range(0, n_frames * winShift, winShift)):
        a[:, frame] = padded[:, start:start + winLength].sum(axis=1)

    return a

def get_avg( m , v_span, h_span):
    """Smooth a 2-D array with a rectangular mean filter.

    The window covers (2 * v_span + 1) rows by (2 * h_span + 1) columns and
    is zero-padded at the borders, so the output has the same shape as `m`.

    Args:
        m: 2-D array-like.
        v_span: number of rows on each side of the centre.
        h_span: number of columns on each side of the centre.

    Returns:
        2-D array of local means with the shape of `m`.
    """
    rows = 2 * v_span + 1
    cols = 2 * h_span + 1
    # The kernel height must follow v_span. Building it from h_span for both
    # axes disagrees with the normalisation whenever the two spans differ.
    meanfil = np.ones([rows, cols]) / (rows * cols)

    out = signal.convolve2d(m, meanfil, boundary='fill', fillvalue=0, mode='same')
    return out

def deltas(x, w=9) :
    """Row-wise delta features of a 2-D array.

    Each row is padded on both sides with floor(w / 2) copies of its edge
    value and filtered with the ramp [h, h-1, ..., -h] (h = floor(w / 2)).
    The output is cropped back to the input width. An input with zero columns
    is returned unchanged.
    """
    _, n_cols = np.shape(x)
    if n_cols == 0:
        return x

    half = int(np.floor(w / 2))
    ramp = np.arange(half, -(half + 1), -1)
    left_pad = np.repeat(x[:, :1], half, axis=1)
    right_pad = np.repeat(x[:, n_cols - 1:], half, axis=1)
    padded = np.concatenate((left_pad, x, right_pad), 1)
    filtered = lfilter(ramp, 1, padded, 1)
    # Skip the filter's start-up transient plus the left padding.
    return filtered[:, 2 * half:n_cols + 2 * half]

In [5]:
def get_mrcg_from_file(file, max_samples=650000):
  """Load an audio file and return the MRCG features of a fixed-length excerpt.

  Only the first channel is used. It is zero-padded at the end, or truncated,
  to exactly `max_samples` samples so every file yields a feature matrix of
  the same width.

  Args:
    file: path passed to `torchaudio.load`.
    max_samples: excerpt length in samples. The default of 650000 is about
      20 s at 32 kHz; the original note suggests 2000000 for 60 s.

  Returns:
    Whatever `mrcg_extract` returns for the excerpt.
  """
  # NOTE(review): the file's own sample rate is not forwarded, so
  # mrcg_extract runs with its default rate -- confirm all files are 32 kHz.
  waveform, sr = torchaudio.load(file)
  channel = waveform[0]
  if len(channel) < max_samples:
    padded = torch.zeros(max_samples)
    padded[:len(channel)] = channel
    channel = padded
  else:
    channel = channel[:max_samples]
  return mrcg_extract(channel)

In [None]:
# Row boundaries of consecutive 100-row chunks of df_500; the last boundary is
# clamped to the frame length so the final chunk may be shorter.
df_list = [i for i in range(0,len(df_500)+100, 100)]
df_list[-1] = len(df_500)

# The loop starts at chunk index 58, i.e. it resumes a previously interrupted
# run. Set the start to 0 to regenerate every chunk.
for i in range(58,len(df_list)-1):

  print(f"currently on df {i+1}")
  df = df_500[df_list[i]:df_list[i+1]]
  df.to_csv(f'/content/gdrive/MyDrive/df_500{i+1}.csv')
  # Collect the per-file features and stack them once. Growing the tensor
  # with torch.cat inside the loop re-copies everything on each iteration.
  x = torch.stack(
      [torch.tensor(get_mrcg_from_file(audio_path)) for audio_path in df['relative_path']],
      0,
  )
  print(f"saving tensor {i+1}")
  torch.save(x, f'/content/gdrive/MyDrive/df_train_tensor_500{i+1}.pt')

In [None]:
# Merge the per-chunk .pt files into one HDF5 dataset with a row per recording.
N = len(df_500)
df_list = [i for i in range(0,len(df_500)+100, 100)]
df_list[-1] = len(df_500)
# The context manager closes (and flushes) the file even if a chunk fails to
# load part-way through.
with h5py.File('/content/gdrive/MyDrive/tensors_20_500.h5', 'w') as tensors:
    data = tensors.create_dataset('data', shape=(N, 768, 2031), dtype=np.float32, fillvalue=0)
    for i in range(1,len(df_list)):
        print(i)
        x = torch.load(f'/content/gdrive/MyDrive/df_train_tensor_500{i}.pt')
        data[df_list[i-1]:df_list[i]] = x

In [3]:
# Draw 2000 distinct row indices out of the first 6000 and copy those rows
# into a smaller "toy" HDF5 file. The indices are sorted because h5py only
# accepts increasing index lists.
random_sample = random.sample(range(0, 6000), 2000)
random_sample.sort()
N = len(random_sample)
# Both files are closed on exit, so the toy file is fully flushed before the
# Dataset class reopens it for reading.
with h5py.File('/content/gdrive/MyDrive/tensors_20_500.h5', 'r') as tensors, \
     h5py.File('/content/gdrive/MyDrive/tensors_20_500_toy.h5', 'w') as tensors_toy:
    data = tensors_toy.create_dataset('data', shape=(N, 768, 2031), dtype=np.float32, fillvalue=0)
    # Copy 50 rows at a time; reading all 2000 rows in one go would need
    # roughly 12 GB of RAM.
    for start in range(0, N, 50):
        rows = random_sample[start:start + 50]
        data[start:start + len(rows)] = tensors['data'][rows]
df_500_toy = df_500.iloc[random_sample]

In [6]:
class H5DS(Dataset):
  """Dataset pairing row `idx` of an HDF5 'data' array with row `idx` of a label frame.

  Args:
    df: DataFrame with a 'primary_label' column, addressed by position so its
      row order must match the HDF5 rows.
    path: path of an HDF5 file containing a dataset named 'data'.
  """

  def __init__(self, df, path):
    self.path = path
    # Keep the file handle so it can be closed explicitly via close().
    self.file = h5py.File(self.path, 'r')
    self.data = self.file['data']
    self.df = df

  def __len__(self):
    """Number of samples, taken from the label frame."""
    return len(self.df)

  def __getitem__(self, idx):
    """Return (features read from the HDF5 dataset, label as a 0-d tensor)."""
    return (self.data[idx], torch.tensor(self.df['primary_label'].iloc[idx]))

  def close(self):
    """Close the underlying HDF5 file; safe to call more than once."""
    handle = getattr(self, 'file', None)
    if handle is not None:
      handle.close()
      self.file = None

def inference(model, val_dl):
  """Print and return the classification accuracy of `model` over `val_dl`.

  Args:
    model: module mapping a float batch to per-class scores of shape
      (batch, classes).
    val_dl: iterable of (inputs, labels) batches.

  Returns:
    Fraction of correctly predicted samples, or NaN when `val_dl` yields no
    samples.
  """
  # Run on whatever device the model lives on instead of relying on a global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  correct_prediction = 0
  total_prediction = 0

  # Evaluate in eval mode, then restore the previous mode because this
  # function is called in the middle of training.
  was_training = model.training
  model.eval()
  try:
    # Disable gradient updates
    with torch.no_grad():
      for data in val_dl:
        # Get the input features and target labels, and put them on the model's device
        inputs, labels = data[0].to(target_device), data[1].to(target_device)

        # Normalize the inputs
        # inputs_m, inputs_s = inputs.mean(), inputs.std()
        # inputs = (7 + (inputs - inputs_m) / inputs_s)
        # Get predictions
        outputs = model(inputs.float())

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]
  finally:
    model.train(was_training)

  # An empty loader would otherwise divide by zero.
  acc = correct_prediction/total_prediction if total_prediction else float('nan')
  print(f'Val Accuracy: {acc:.2f}')
  return acc

In [7]:
class AudioClassifier(nn.Module): # CNN2
    """Two-layer CNN classifier over single-channel 2-D feature maps.

    Pipeline: conv 3x3 (1 -> 50) -> ReLU -> 2x2 max-pool -> conv 3x3
    (50 -> 20) -> ReLU -> flatten -> linear.

    Args:
        input_dim: unused; kept so existing calls keep working.
        output_dim: number of classes produced by the final linear layer.
        input_shape: (height, width) of one input feature map. The default
            (768, 2031) gives the 7,711,440 flattened features this notebook
            was developed with.
    """

    def __init__(self, input_dim, output_dim, input_shape=(768, 2031)):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 50, 3)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2))
        self.conv2 = nn.Conv2d(50,20,3)
        # Size the classifier head from the arguments rather than from
        # hard-coded numbers, so output_dim is actually honoured.
        self.fc = nn.Linear(self._flattened_size(input_shape), output_dim)
        self.flatten = nn.Flatten(start_dim=1)

    @staticmethod
    def _flattened_size(input_shape):
        """Number of features left after conv1 -> pool -> conv2 for one map."""
        height, width = input_shape
        # Each unpadded 3x3 conv removes 2 rows/cols; the pool halves (floor).
        height = (height - 2) // 2 - 2
        width = (width - 2) // 2 - 2
        if height < 1 or width < 1:
            raise ValueError("input_shape is too small for this network")
        return 20 * height * width

    def forward(self, x):
        """Map a (batch, height, width) tensor to (batch, output_dim) scores."""
        x = x.unsqueeze(1)  # add the single input-channel axis
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x


    
# Choose the first CUDA device when one is available (CPU otherwise), then
# build the classifier and move it there in one step.
device = torch.device("cuda:0" if torch.cuda.is_available() else  "cpu") #
myModel = AudioClassifier(768*2031, 12).to(device)

def training(model, train_dl, num_epochs, val_dl):
  """Train `model` with Adam and a one-cycle LR schedule, validating every epoch.

  Batches whose inputs contain NaN are skipped. After each epoch the number
  of skipped batches, the mean training loss and the training accuracy are
  printed, followed by the validation accuracy from `inference`.

  Args:
    model: network to optimise in place.
    train_dl: sized iterable of (inputs, labels) training batches.
    num_epochs: number of passes over `train_dl`.
    val_dl: iterable of validation batches passed to `inference`.
  """
  # Train on whatever device the model already lives on.
  target_device = next(model.parameters()).device

  # Loss Function, Optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Epoch iterator
  for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0
    processed_batches = 0
    j = 0  # number of batches skipped because of NaN inputs
    # Batch iterator
    for data in train_dl:
        # as_tensor passes tensors through untouched; re-wrapping them with
        # torch.tensor copies the batch and emits a warning.
        inputs, labels = torch.as_tensor(data[0]), torch.as_tensor(data[1])
        if torch.isnan(inputs).any():
          j += 1
          continue
        inputs, labels = inputs.to(target_device), labels.to(target_device)
        # Normalize the inputs - 1
        # inputs_m, inputs_s = inputs.mean(), inputs.std()
        # inputs = (7 + (inputs - inputs_m) / inputs_s)

        optimizer.zero_grad() # Zero the parameter gradients

        # forward + backward + optimize
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item() # Keep stats for Loss and Accuracy
        processed_batches += 1

        _, prediction = torch.max(outputs,1) # Get the predicted class with the highest score
        correct_prediction += (prediction == labels).sum().item() # Count of predictions that matched the target label
        total_prediction += prediction.shape[0]

    # Print stats at the end of the epoch
    print(j)
    # Average over the batches that actually contributed a loss. Skipped
    # batches must not dilute the mean, and an all-skipped epoch must not
    # divide by zero.
    avg_loss = running_loss / processed_batches if processed_batches else float('nan')
    acc = correct_prediction/total_prediction if total_prediction else float('nan')
    print(f'Epoch: {epoch + 1}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    print(f"TESTING:")
    inference(model, val_dl)
    print("\n")

  print('Finished Training')


path = '/content/gdrive/MyDrive/tensors_20_500_toy.h5'
NUM_EPOCHS = 15
# Initializing the dataset
myds = H5DS(df_500_toy, path)
# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items*(0.80))
num_val = num_items - num_train
print(num_train,num_val)
# A seeded generator makes the split identical on every run, so validation
# accuracies from different experiments are measured on the same samples.
train_ds, val_ds = random_split(myds, [num_train, num_val],
                                generator=torch.Generator().manual_seed(0))
# Create training and validation data loaders
train_dl = DataLoader(train_ds, batch_size=10, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=10, shuffle=False)

training(myModel, train_dl, NUM_EPOCHS, val_dl) # Training

1600 400




0
Epoch: 1, Loss: 35.07, Accuracy: 0.18
TESTING:
Val Accuracy: 0.22


0
Epoch: 2, Loss: 0.70, Accuracy: 0.86
TESTING:
Val Accuracy: 0.23


0
Epoch: 3, Loss: 0.16, Accuracy: 0.98
TESTING:
Val Accuracy: 0.27


0
Epoch: 4, Loss: 0.04, Accuracy: 0.99
TESTING:
Val Accuracy: 0.27


0
Epoch: 5, Loss: 0.02, Accuracy: 1.00
TESTING:
Val Accuracy: 0.28




KeyboardInterrupt: ignored

# Results

---
Sample Length (s) : 20  $|$ _  $|$ _ $|$ _

Model: CNN2

```
        self.conv1 = nn.Conv2d(1, 50, 3)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2))
        self.conv2 = nn.Conv2d(50,20,3)
        self.fc = nn.Linear(7711440,12)
        self.flatten = nn.Flatten(start_dim=1)

    def forward(self, x):
        x = x[None, :, :, :]
        x = x.permute(1, 0, 2, 3)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x
```

Normalization: Yes $|$ Yes $|$ Yes $|$ No

Mini-batch size = 10 $|$ _ $|$ 10 

Number of Samples in training: 2000*(0.8) $|$ 6000*(0.9) $|$ 6000*(0.8) $|$ 2000*(0.8)

`fRange = [1000, 20000]` (gammatone filterbank frequency range in Hz)

Best Val Accuracy = 0.29 at epoch 2 $|$ _ $|$ 0.39 epoch 2 $|$ 0.29 epoch 5

Input shape: `(768, 2031)` $|$ _ $|$ _ $|$ _


Trainable params: 92,546,812

---

Sample Length (s) : 20

Model: CNN2

```
        self.conv1 = nn.Conv2d(1, 16, 5)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2))
        self.conv2 = nn.Conv2d(16,10,3)
        self.fc = nn.Linear(3841800,12)
        self.flatten = nn.Flatten(start_dim=1)

    def forward(self, x):
        x = x[None, :, :, :]
        x = x.permute(1, 0, 2, 3)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x
```

Normalization: Yes

Number of classes: 12

Mini-batch size = 5

Number of Samples in training: 1000*(0.8)

`fRange = [1000, 20000]` (gammatone filterbank frequency range in Hz)

Best Val Accuracy = 0.14

Input shape: `(768, 2031)`


Trainable params: 46,103,478

---

Sample Length (s) : 20

Model: CNN2

```
        self.conv1 = nn.Conv2d(1, 100, 3)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2))
        self.conv2 = nn.Conv2d(100,50,3)
        self.conv3 = nn.Conv2d(50,20,3)
        self.fc = nn.Linear(473760,12)
        self.flatten = nn.Flatten(start_dim=1)

    def forward(self, x):
        x = x[None, :, :, :]
        x = x.permute(1, 0, 2, 3)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x
```

Normalization: Yes

Number of classes: 12

Mini-batch size = 5

Number of Samples in training: 1000*(0.8)

`fRange = [1000, 20000]` (gammatone filterbank frequency range in Hz)

Best Val Accuracy = 0.23 epoch 10

Input shape: `(768, 2031)`


Trainable params: 5,740,202



---

Sample Length (s) : 20

Model: CNN2

```
        self.conv1 = nn.Conv2d(1, 50, 3)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2))
        self.conv2 = nn.Conv2d(50,20,3)
        self.fc1 = nn.Linear(7711440,50)
        self.fc2 = nn.Linear(50,12)
       
        self.flatten = nn.Flatten(start_dim=1)

    def forward(self, x):
        x = x[None, :, :, :]
        x = x.permute(1, 0, 2, 3)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x
```

Normalization: Yes

Number of classes: 12

Mini-batch size = 5

Number of Samples in training: 1000*(0.8)

`fRange = [1000, 20000]` (gammatone filterbank frequency range in Hz)

Best Val Accuracy = 0.10 epoch 10

Input shape: `(768, 2031)`


Trainable params: 385,582,182

---












In [6]:
def count_parameters(model):
    """Total number of elements over all parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

model = AudioClassifier(768*2031, 12)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 385,582,182 trainable parameters


In [11]:
# Drop the reference to the trained network so its memory can be reclaimed.
# The network is bound to `myModel`; assigning to `MyModel` only created a new,
# unrelated name and released nothing.
myModel = None
gc.collect()
# Hand cached, now-unused GPU blocks back to the driver.
torch.cuda.empty_cache()