In [None]:
!pip install tensorflow_io



In [None]:
import os
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.chdir('/content/drive/MyDrive/DL')
os.getcwd()

'/content/drive/MyDrive/DL'

In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio

In [None]:
class AudioDataset(Dataset):
  def __init__(self, annotation_file, audio_dir, transformation = None):
    self.annotations = pd.read_csv(annotation_file)
    self.audio_dir = audio_dir
    self.transformation = transformation

  def __len__(self):
    return len(self.annotations)

  def __getitem__(self, index):
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = self._resample_if_necessary(signal, sr)
    signal = self._mix_down_if_necessary(signal)

    signal = self._cut_if_necessary(signal)
    signal = self._right_pad_if_necessary(signal)
    signal = self.transformation(signal)
    return signal, label

  def _resample_if_necessary(self, signal, sr):
    if sr != 16000:
      resampler = torchaudio.transforms.Resample(sr, 16000)
      signal = resampler(signal)
    return signal

  def _mix_down_if_necessary(self, signal):
    if signal.shape[0] > 1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def _cut_if_necessary(self, signal):
    if signal.ndim < 2:
        raise ValueError("Expected signal to have at least two dimensions [C, T], got: {}".format(signal.ndim))
    if signal.shape[1] > 48000:
      signal = signal[:, :48000]
    return signal

  def _right_pad_if_necessary(self, signal):
    length_signal = signal.shape[1]
    if length_signal < 48000:
      num_missing_samples = 48000 - length_signal
      last_dim_padding = (0, num_missing_samples)
      signal = torch.nn.functional.pad(signal, last_dim_padding)
    return signal

  def _get_audio_sample_path(self, index):
    folder = self.annotations.iloc[index, 2]
    path = os.path.join(self.audio_dir, folder, self.annotations.iloc[index,1])
    return path

  def _get_audio_sample_label(self, index):
    return self.annotations.iloc[index, 3]

In [None]:
mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=1024, hop_length=512, n_mels=64)

In [None]:
capuchin_annotations = '/content/drive/MyDrive/DL/Parsed_Capuchinbird_Clips.csv'
not_capuchin_annotations = '/content/drive/MyDrive/DL/Parsed_Not_Capuchinbird_Clips.csv'
capuchin_dir = '/content/drive/MyDrive/DL/data'
capuchin = AudioDataset(capuchin_annotations, capuchin_dir, mel_spectrogram)
not_capuchin = AudioDataset(not_capuchin_annotations, capuchin_dir, mel_spectrogram)

In [None]:
len(not_capuchin)

593

In [None]:
from torch.utils.data import ConcatDataset
data = ConcatDataset([capuchin, not_capuchin])

In [None]:
from torch.utils.data import random_split

train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_data, val_data = random_split(data, [train_size, val_size])

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=True)

In [None]:
import torch
from torchvision import models

# Load pre-trained ResNet50
model = models.resnet50(pretrained=True)
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Modify the fully connected layer to match the number of classes
model.fc = torch.nn.Linear(model.fc.in_features, 2)

# Move model to GPU if available
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
#criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
  size = len(dataloader.dataset)
  losses = []
  for batch, (X, y) in enumerate(dataloader):
    temp = []
    # Compute prediction and loss
    X = X.to(device)
    y = y.to(device)
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    temp.append(loss.item())

    if batch % 20 == 0:
        loss, current = loss.item(), (batch + 1) * len(X)
        losses.append(loss)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

  return losses



def test_loop(dataloader, model, loss_fn):
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  with torch.no_grad():
    for X, y in dataloader:
      X = X.to(device)
      y = y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred.argmax(1) == y).type(torch.float).sum().item()

  test_loss /= num_batches
  correct /= size
  print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
  return test_loss, correct

In [None]:
train_loss = []
val_loss = []
accuracy = []

epochs = 10
training_losses = []
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    model.train()
    loss_tr = train_loop(train_loader, model, criterion, optimizer)
    train_loss_avg = sum(loss_tr) / len(loss_tr)
    train_loss.append(train_loss_avg)
    #train_loop(train_dl, model, criterion, optimizer)
    model.eval()
    loss_vl, acc = test_loop(val_loader, model, criterion)
    val_loss.append(loss_vl)
    accuracy.append(acc)

print('Training done!')

Epoch 1
-------------------------------
loss: 0.647581  [   16/  648]
loss: 0.484001  [  336/  648]
loss: 0.657976  [  328/  648]
Test Error: 
 Accuracy: 94.4%, Avg loss: 0.199659 

Epoch 2
-------------------------------
loss: 0.322062  [   16/  648]
loss: 0.090585  [  336/  648]
loss: 0.385419  [  328/  648]
Test Error: 
 Accuracy: 96.9%, Avg loss: 0.069068 

Epoch 3
-------------------------------
loss: 0.198038  [   16/  648]
loss: 0.187826  [  336/  648]
loss: 0.347235  [  328/  648]
Test Error: 
 Accuracy: 95.1%, Avg loss: 0.158197 

Epoch 4
-------------------------------
loss: 0.543897  [   16/  648]
loss: 0.022252  [  336/  648]
loss: 0.159932  [  328/  648]
Test Error: 
 Accuracy: 96.9%, Avg loss: 0.083216 

Epoch 5
-------------------------------
loss: 0.265703  [   16/  648]
loss: 0.043587  [  336/  648]
loss: 0.027689  [  328/  648]
Test Error: 
 Accuracy: 98.1%, Avg loss: 0.114665 

Epoch 6
-------------------------------
loss: 0.018388  [   16/  648]
loss: 0.073593  [  3

In [None]:
'''
torch.save(model.state_dict(), 'model_weights.pth')
optimizer_state = optimizer.state_dict()  # Assuming 'optimizer' is your optimizer
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer_state,
}, 'model_checkpoint.pth')
'''

In [None]:
'''
# Re-create the model architecture
model = TheModelClass(*args, **kwargs)  # Replace with your model class and parameters

# Load the state dictionary
model.load_state_dict(torch.load('model_weights.pth'))

# If you saved the complete checkpoint including optimizer state:
checkpoint = torch.load('model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])

# If you also need to load the optimizer state:
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
'''


In [None]:
X_test, y_test = next(iter(val_loader))

In [None]:
X_test_numpy = X_test.numpy()
y_test_numpy = y_test.numpy()

In [None]:
X_test_2 = torch.tensor(X_test).float().to(device)

  X_test_2 = torch.tensor(X_test).float().to(device)


In [None]:
model.eval()
with torch.no_grad():
    yhat = model(X_test_2)
yhat_prob = torch.softmax(yhat, dim=1)
_, predicted_classes = torch.max(yhat_prob, 1)

In [None]:
(yhat_prob[:, 1] >= 0.5).long()

tensor([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0], device='cuda:0')

In [None]:
import os
import torch
import torchaudio
from torch.utils.data import DataLoader, TensorDataset

def load_mp3_16k_mono(filename):
    waveform, sample_rate = torchaudio.load(filename)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:
        transformer = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = transformer(waveform)
    return waveform.squeeze()

def preprocess_audio(audio_tensor):
    zero_padding = torch.zeros(48000 - audio_tensor.shape[0])
    audio_tensor = torch.cat((zero_padding, audio_tensor), dim=0)
    spectrogram = torchaudio.transforms.Spectrogram(n_fft=320, hop_length=32)(audio_tensor)
    return torch.abs(spectrogram).unsqueeze(0)


In [None]:
batch_size = 16
model.to('cpu')

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
results = {}
directory = '/content/drive/MyDrive/DL/data/Forest Recordings'
model.to('cpu')

for file in os.listdir(directory):
    FILEPATH = os.path.join(directory, file)
    waveform = load_mp3_16k_mono(FILEPATH)

    # Split waveform into chunks of 48000 samples
    chunks = waveform.unfold(0, 48000, 48000)  # Non-overlapping chunks

    # Preprocess and collect all chunks
    preprocessed_chunks = [preprocess_audio(chunk) for chunk in chunks]

    # Create a dataset and a DataLoader for batching
    dataset = TensorDataset(torch.stack(preprocessed_chunks))
    dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

    # Predict
    predictions = []
    for batch in dataloader:
        inputs = batch[0]  # Already on CPU by default
        outputs = model(inputs)  # Model and inputs both on CPU
        predictions.extend(outputs.detach().numpy())  # No need to call .cpu() on detach

    results[file] = predictions

In [None]:
results

KeyError: 0

In [None]:
class_preds = {}
for file, logits in results.items():
    class_preds[file] = [1 if any(pred > 0.99 for pred in prediction) else 0 for prediction in logits]

In [None]:
class_preds['recording_98.mp3']

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1]

In [None]:
results['recording_00.mp3']

[array([ 1.7783755, -1.743832 ], dtype=float32),
 array([ 1.7598797, -1.7275972], dtype=float32),
 array([ 1.8523972, -1.8177593], dtype=float32),
 array([ 1.9886256, -1.9529644], dtype=float32),
 array([ 0.5798351 , -0.61253184], dtype=float32),
 array([ 1.8498255, -1.8148897], dtype=float32),
 array([ 1.7847242, -1.7497902], dtype=float32),
 array([ 1.7890346, -1.7542467], dtype=float32),
 array([ 1.8223507, -1.7877554], dtype=float32),
 array([ 1.7973251, -1.7624512], dtype=float32),
 array([ 1.7852842, -1.7506639], dtype=float32),
 array([ 1.7709395, -1.7363927], dtype=float32),
 array([ 1.7723894, -1.7383232], dtype=float32),
 array([ 1.9540703, -1.9167547], dtype=float32),
 array([ 0.59402263, -0.6289    ], dtype=float32),
 array([ 1.7831537, -1.7482643], dtype=float32),
 array([ 1.7178625, -1.6915476], dtype=float32),
 array([ 1.7877142, -1.7526518], dtype=float32),
 array([ 1.7766523, -1.7415922], dtype=float32),
 array([ 1.7446396, -1.7094357], dtype=float32),
 array([ 1.78746

In [None]:
from itertools import groupby
postprocessed = {}
for file, scores in class_preds.items():
    postprocessed[file] = tf.math.reduce_sum([key for key, group in groupby(scores)]).numpy()
postprocessed

{'recording_00.mp3': 6,
 'recording_01.mp3': 4,
 'recording_02.mp3': 1,
 'recording_03.mp3': 1,
 'recording_04.mp3': 4,
 'recording_05.mp3': 1,
 'recording_06.mp3': 4,
 'recording_07.mp3': 3,
 'recording_08.mp3': 18,
 'recording_09.mp3': 1,
 'recording_10.mp3': 5,
 'recording_11.mp3': 3,
 'recording_12.mp3': 1,
 'recording_14.mp3': 1,
 'recording_13.mp3': 1,
 'recording_15.mp3': 3,
 'recording_16.mp3': 2,
 'recording_17.mp3': 4,
 'recording_18.mp3': 11,
 'recording_19.mp3': 1,
 'recording_20.mp3': 1,
 'recording_21.mp3': 2,
 'recording_22.mp3': 3,
 'recording_23.mp3': 3,
 'recording_24.mp3': 1,
 'recording_27.mp3': 1,
 'recording_26.mp3': 2,
 'recording_25.mp3': 3,
 'recording_29.mp3': 2,
 'recording_28.mp3': 3,
 'recording_30.mp3': 4,
 'recording_31.mp3': 1,
 'recording_32.mp3': 3,
 'recording_33.mp3': 1,
 'recording_34.mp3': 4,
 'recording_35.mp3': 1,
 'recording_37.mp3': 4,
 'recording_38.mp3': 2,
 'recording_36.mp3': 12,
 'recording_41.mp3': 1,
 'recording_40.mp3': 2,
 'recording_4