In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import os
import h5py
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
import csv

# Directories holding the preprocessed HDF5 files: 'train_all.h5' is read
# from train_path and 'val_all.h5' is read from val_path.
# NOTE(review): these are absolute, machine-specific paths - adjust them
# before running on another machine.
train_path = "/Users/zaherqubein/Desktop/Project/Training_Set"
val_path = "/Users/zaherqubein/Desktop/Project/Validation_Set"

class AudioResNet(nn.Module):
    """ResNet-style CNN that maps single-channel 2-D inputs to sigmoid scores.

    The network is a 7x7 stem convolution (1 -> 128 channels) followed by
    four stages of two ResBlocks each (128, 256, 512, 1024 channels), global
    average pooling, one linear layer and a sigmoid.

    Args:
        num_classes: number of output units of the final linear layer.
    """

    def __init__(self, num_classes=1):
        super(AudioResNet, self).__init__()
        # Running channel count: read and updated by make_resNET_block while
        # the stages are assembled. Starts at the stem's output width.
        self.in_channels = 128
        # Stem convolution on the single input channel.
        self.conv1 = nn.Conv2d(1, 128, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(128)
        self.layer1 = self.make_layer(128, 2, stride=1)
        self.layer2 = self.make_layer(256, 2, stride=2)
        self.layer3 = self.make_layer(512, 2, stride=2)
        self.layer4 = self.make_layer(1024, 2, stride=2)
        # Collapse each feature map to 1x1 so the classifier input size does
        # not depend on the spatial size of the input.
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(1024, num_classes)
        self.sigmoid = nn.Sigmoid()

    def make_layer(self, out_channels, blocks, stride):
        """Build one stage of `blocks` ResBlocks.

        Only the first block uses `stride`; the remaining blocks use
        stride 1.
        """
        layers = [self.make_resNET_block(out_channels, stride)]
        for _ in range(1, blocks):
            layers.append(self.make_resNET_block(out_channels, stride=1))
        return nn.Sequential(*layers)

    def make_resNET_block(self, out_channels, stride):
        """Create one ResBlock and advance `self.in_channels` to its width.

        A 1x1 convolution + batch norm projection is attached to the skip
        path whenever the block changes the spatial stride or the channel
        count, so that the two branches can be added.
        """
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        block = ResBlock(self.in_channels, out_channels, stride, downsample)
        # The next block created consumes this block's output channels.
        self.in_channels = out_channels
        return block

    def forward(self, x):
        """Return sigmoid scores for a batch of inputs.

        Returns:
            A tensor of shape (batch,) when the final layer has one output
            unit, otherwise (batch, num_classes).
        """
        features = self.extract_features(x)
        scores = self.sigmoid(self.fc(features))
        # Drop only the trailing unit axis. A bare squeeze() would also
        # remove the batch axis for a batch of one sample, producing a 0-d
        # tensor whose shape no longer matches the (1,) label tensor.
        return scores.squeeze(-1)

    def extract_features(self, x):
        """Return the pooled backbone output, flattened to (batch, features)."""
        x = torch.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        # Flatten the pooled maps into one feature vector per sample.
        return x.view(x.size(0), -1)

class ResBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with an additive skip connection.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input before it is added
            back; when None the input itself is used as the skip branch.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # First convolution carries the stride and the channel change.
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        # Second convolution keeps both resolution and width.
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return relu(main_branch(x) + skip_branch(x))."""
        main_branch = self.bn1(self.conv1(x)).relu()
        main_branch = self.bn2(self.conv2(main_branch))
        skip_branch = x if self.downsample is None else self.downsample(x)
        main_branch += skip_branch
        return main_branch.relu()

def load_Training_data(file_path):
    """Load all segments and labels from an HDF5 file as float tensors.

    Every top-level group in the file contributes its 'segments' and
    'labels' datasets; top-level entries that are not groups are ignored.
    The per-group arrays are concatenated along axis 0.

    Args:
        file_path: path of the HDF5 file to read.

    Returns:
        A (segments, labels) pair of torch.FloatTensor objects.

    Raises:
        ValueError: if the file contains no groups to load.
    """
    all_segments = []
    all_labels = []
    with h5py.File(file_path, 'r') as hf:
        for file_group in hf.values():
            if not isinstance(file_group, h5py.Group):
                continue
            # [:] reads the whole dataset into memory as a numpy array.
            all_segments.append(file_group['segments'][:])
            all_labels.append(file_group['labels'][:])

    # np.concatenate fails with an opaque message on an empty list, so name
    # the offending file explicitly instead.
    if not all_segments:
        raise ValueError(f"No groups with training data were found in: {file_path}")

    x = np.concatenate(all_segments, axis=0)
    y = np.concatenate(all_labels, axis=0)

    print(f"Proccessed data shape: x: {x.shape}, y: {y.shape}.")
    print(f"x dtype: {x.dtype}, y dtype: {y.dtype}")
    print(f"x range: ({x.min()}, {x.max()}), y range: ({y.min()}, {y.max()})")
    # Convert the numpy arrays into float tensors.
    return torch.FloatTensor(x), torch.FloatTensor(y)

def load_validation_data(val_path):
    """Load the per-file validation records from '<val_path>/val_all.h5'.

    Each top-level group of the HDF5 file becomes one dictionary with the
    keys 'file', 'segments' (torch.FloatTensor), 'labels', 'start_times'
    and 'end_times' (numpy arrays). Top-level entries that are not groups
    are ignored, matching load_Training_data.

    Args:
        val_path: directory expected to contain 'val_all.h5'.

    Returns:
        A list with one dictionary per group; the list is empty when the
        HDF5 file does not exist (a message is printed in that case).
    """
    val_data = []
    print(f"Loading the validation data from: {val_path}")

    h5_file = os.path.join(val_path, 'val_all.h5')
    if not os.path.exists(h5_file):
        print(f"Couldn't find : {h5_file} file")
        return val_data

    with h5py.File(h5_file, 'r') as hf:
        for group in hf.values():
            if not isinstance(group, h5py.Group):
                continue
            file_name = group['file'][()]
            # h5py 3.x returns string datasets as bytes; decode them so the
            # name is printed and written to CSV as text rather than b'...'.
            if isinstance(file_name, bytes):
                file_name = file_name.decode('utf-8', errors='replace')
            val_data.append({
                'file': file_name,
                'segments': torch.FloatTensor(group['segments'][:]),
                'labels': np.array(group['labels'][:]),
                'start_times': np.array(group['start_times'][:]),
                'end_times': np.array(group['end_times'][:]),
            })

    print(f"Loaded data from {h5_file}")
    print(f"Number of validation files: {len(val_data)}")
    return val_data

def train(model, train_loader, criterion, optimizer, scheduler, device, num_epochs=3, patience=5, checkpoint_path='best_model.pth'):
    """Train `model` and checkpoint the weights of the lowest-loss epoch.

    After every epoch the mean batch loss is passed to `scheduler.step`.
    Whenever that loss improves on the best value so far, the model's
    state_dict is saved to `checkpoint_path`. Training stops early once the
    loss has failed to improve for `patience` consecutive epochs.

    Args:
        model: module to optimise; called as model(inputs).
        train_loader: sized iterable yielding (inputs, labels) batches.
        criterion: loss called as criterion(outputs, labels.float()).
        optimizer: optimizer whose zero_grad()/step() drive the updates.
        scheduler: scheduler whose step() receives the epoch loss.
        device: device the batches are moved to.
        num_epochs: maximum number of epochs.
        patience: non-improving epochs tolerated before stopping early.
        checkpoint_path: file the best state_dict is written to.

    Raises:
        ValueError: if `train_loader` yields no batches.
    """
    num_batches = len(train_loader)
    # Without batches the mean epoch loss is undefined (division by zero),
    # so fail up front with a clear message.
    if num_batches == 0:
        raise ValueError("train_loader is empty; there is nothing to train on")

    model.train()
    # Start from +inf so the first epoch always counts as an improvement.
    best_loss = float('inf')
    patience_counter = 0
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / num_batches
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

        scheduler.step(epoch_loss)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping")
                break
# After training, the model is evaluated with few-shot learning.
def validate_few_shot(model, val_data, device, thresholds=np.linspace(0, 1, 100), support_size=5, batch_size=16):
    """Evaluate `model` per file with a prototype built from a few positives.

    For every entry of `val_data` the first `support_size` positive segments
    (label == 1) form the support set; the mean of their feature vectors is
    the prototype. All segments after the last support segment are queries.
    Query-to-prototype distances are min-max normalised to [0, 1] and a
    query is predicted positive when its distance is below a threshold. The
    threshold is the value from `thresholds` with the highest F1 on that
    file's queries.

    NOTE(review): the threshold is chosen using the query labels themselves,
    so the reported metrics are presumably optimistic - confirm this is the
    intended protocol.

    Args:
        model: network exposing eval() and extract_features(batch).
        val_data: iterable of dicts with the keys 'file', 'segments',
            'labels', 'start_times' and 'end_times'.
        device: device the segments are moved to for feature extraction.
        thresholds: candidate thresholds on the normalised distance.
        support_size: number of positive segments used for the prototype.
        batch_size: number of query segments embedded per forward pass.

    Returns:
        (all_results, all_predictions): a list of per-file metric dicts and
        a list of dicts ('file', 'start_time', 'end_time'), one for every
        query segment predicted positive. Files with fewer than
        `support_size` positives, or without any query segment, are skipped.
    """
    model.eval()
    all_results = []
    all_predictions = []

    for file_data in val_data:
        segments = file_data['segments']
        labels = file_data['labels']
        start_times = file_data['start_times']
        end_times = file_data['end_times']

        pos_indices = np.where(labels == 1)[0]
        if len(pos_indices) < support_size:
            print(f"Not enough samples Skipping file: {file_data['file']}.")
            continue

        support_indices = pos_indices[:support_size]
        # Only segments after the last support segment are used as queries.
        query_indices = np.arange(len(segments))
        query_indices = query_indices[query_indices > max(support_indices)]
        if len(query_indices) == 0:
            print(f" No query samples left after the first {support_size} Positive in {file_data['file']} Skipping this file.")
            continue

        support_set = segments[support_indices]
        query_set = segments[query_indices]
        query_labels = labels[query_indices]

        with torch.no_grad():
            support_features = model.extract_features(support_set.to(device))
            prototype = support_features.mean(dim=0)

            # Embed the queries in batches; features are moved to the CPU
            # for the distance computation.
            query_features = []
            for i in range(0, len(query_set), batch_size):
                batch = query_set[i:i+batch_size].to(device)
                query_features.append(model.extract_features(batch).cpu())
            query_features = torch.cat(query_features, dim=0)

            # squeeze(1) removes only the prototype axis, so a file with a
            # single query still yields a 1-d array (a bare squeeze() would
            # give a 0-d array that cannot be iterated below).
            distances = torch.cdist(query_features, prototype.unsqueeze(0).cpu()).squeeze(1).numpy()

        # Min-max normalise to [0, 1]. When every distance is identical the
        # range is zero; map them all to 0 instead of dividing by zero.
        lowest = distances.min()
        span = distances.max() - lowest
        if span == 0:
            distances = np.zeros_like(distances)
        else:
            distances = (distances - lowest) / span

        median_threshold = np.median(distances)

        best_threshold = 0
        best_f1 = 0
        for threshold in thresholds:
            predictions = (distances < threshold).astype(int)
            # zero_division=0 returns the same score (0) as the default but
            # without emitting a warning for every candidate threshold.
            f1 = f1_score(query_labels, predictions, average='binary', zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        predictions = (distances < best_threshold).astype(int)

        accuracy = accuracy_score(query_labels, predictions)
        precision = precision_score(query_labels, predictions, average='binary', zero_division=1)
        recall = recall_score(query_labels, predictions, average='binary', zero_division=1)
        f1 = f1_score(query_labels, predictions, average='binary', zero_division=0)

        all_results.append({
            'file': file_data['file'],
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'median_threshold': median_threshold,
            'best_threshold': best_threshold})

        # Record the time span of every query predicted positive.
        for i, pred in enumerate(predictions):
            if pred == 1:
                all_predictions.append({
                    'file': file_data['file'],
                    'start_time': start_times[query_indices[i]],
                    'end_time': end_times[query_indices[i]]
                })

    return all_results, all_predictions

def generate_csv_output(predictions, output_file):
    """Write the predicted events to `output_file` as CSV.

    The file starts with the header row Audiofilename, Starttime, Endtime,
    followed by one row per entry of `predictions`, taken from its 'file',
    'start_time' and 'end_time' keys. An existing file is overwritten.
    """
    header = ['Audiofilename', 'Starttime', 'Endtime']
    with open(output_file, 'w', newline='') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(header)
        # One CSV row per predicted event, in the order given.
        out.writerows(
            (event['file'], event['start_time'], event['end_time'])
            for event in predictions
        )

if __name__ == "__main__":
    # Run on CUDA when torch reports it as available, otherwise on the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Load the preprocessed training tensors and the validation records.
    train_file = os.path.join(train_path, 'train_all.h5')
    train_x, train_y = load_Training_data(train_file)
    val_data = load_validation_data(val_path)

    print(f"Training data path: {train_path}.")
    print(f"Validation data path: {val_path}.")
    print(f"Training shape: {train_x.shape}.")

    # Build the network and make sure every parameter is trainable.
    model = AudioResNet().to(device)
    model.requires_grad_(True)

    # Binary cross-entropy on the sigmoid outputs, optimised with Adam; the
    # scheduler is created in 'min' mode with a patience of three epochs.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3)

    train_dataset = TensorDataset(train_x, train_y)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    train(
        model,
        train_loader,
        criterion,
        optimizer,
        scheduler,
        device,
        num_epochs=15,
        patience=5,
    )
    # Evaluate with the checkpoint of the lowest-loss epoch.
    best_state = torch.load('best_model.pth')
    model.load_state_dict(best_state)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Few-shot validation on the loaded validation files.
    results, predictions = validate_few_shot(model, val_data, device)

    if not results:
        print("No validation results were found")
    else:
        print("Validation Results:")
        # Thresholds are printed with six decimals, other metrics with four.
        threshold_keys = ('median_threshold', 'best_threshold')
        for entry in results:
            print(f"File: {entry['file']}.")
            for name, score in entry.items():
                if name == 'file':
                    continue
                digits = 6 if name in threshold_keys else 4
                print(f"{name}: {score:.{digits}f}")
            print()

        # Averages over all evaluated files.
        count = len(results)
        averaged = (
            ('Accuracy', 'accuracy', 4),
            ('Precision', 'precision', 4),
            ('Recall', 'recall', 4),
            ('F1 Score', 'f1', 4),
            ('Median Threshold', 'median_threshold', 6),
            ('Best Threshold', 'best_threshold', 6),
        )
        print("Average Results:")
        for title, key, digits in averaged:
            total = sum(entry[key] for entry in results)
            print(f"{title}: {total / count:.{digits}f}")

        generate_csv_output(predictions, 'CNN_ResNet.csv')

Proccessed data shape: x: (18213, 1, 128, 86), y: (18213,).
x dtype: float32, y dtype: int64
x range: (0.0, 1.0), y range: (0, 1)
Loading the validation data from: /Users/zaherqubein/Desktop/Project/Validation_Set
Loaded data from /Users/zaherqubein/Desktop/Project/Validation_Set/val_all.h5
Number of validation files: 41
Training data path: /Users/zaherqubein/Desktop/Project/Training_Set
Validation data path: /Users/zaherqubein/Desktop/Project/Validation_Set
Training data shape: torch.Size([18213, 1, 128, 86])


Epoch 1/15:   0%|                                       | 0/570 [00:03<?, ?it/s]


KeyboardInterrupt: 