In [1]:
import time, os, sys, copy, argparse
import multiprocessing

from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

import numpy as np

In [2]:
import torch
import torchvision
from torchvision import datasets, models, transforms
import torch.utils.data as data
from torch.utils.tensorboard import SummaryWriter

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchsummary import summary


In [3]:
#Set the train and validation directory paths
#dataset_file = "dataset/dataset_8000_1.05_None_64_0_None_256_128.pk"
dataset_file = "dataset/trainset_8000_4.1_None_64_0_None_512_512.pk"
#dataset_file = "dataset/dataset_8000_5_None_64_0_None_512_256.pk"
#dataset_file = "dataset/dataset_8000_5_None_64_0_None_256_128.pk"
#dataset_file = "dataset/dataset_8000_5_None_64_50_None_256_128.pk"

In [4]:
from model import BlazeNet 
from dataset import AudioDataset

In [5]:
# Batch size
bs = 32 
# Number of epochs
num_epochs = 10
# Number of classes
num_classes = 2
# Number of workers
num_cpu = multiprocessing.cpu_count()

# Applying transforms to the data
image_transforms = { 
    'train': transforms.Compose([
        #transforms.Resize(size=128),
        #transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
    'valid': transforms.Compose([
        #transforms.Resize(size=128),
        #transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        #transforms.Resize(size=128),
        #transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

}

**train/** - folder containing the training files, with each top-level folder representing a subject  
**train_labels.csv** - file containing the target MGMT_value for each subject in the training data (e.g. the presence of MGMT promoter methylation)   
**test/** - the test files, which use the same structure as train/; your task is to predict the MGMT_value for each subject in the test data. NOTE: the total size of the rerun test set (Public and Private) is ~5x the size of the Public test set   
**sample_submission.csv** - a sample submission file in the correct format

In [8]:
# Load data from folders
dataset = {
    'train': AudioDataset(dataset_file, subset="train", transform=image_transforms['train']),
    'valid': AudioDataset(dataset_file, subset="valid", transform=image_transforms['valid']),
    'test': AudioDataset(dataset_file, subset="test", transform=image_transforms['test'])
}
 
# Size of train and validation data
dataset_sizes = {
    'train':len(dataset['train']),
    'valid':len(dataset['valid']),
    'test':len(dataset['test'])
}

# Create iterators for data loading
dataloaders = {
    'train':data.DataLoader(dataset['train'], batch_size=bs, shuffle=True,
                            num_workers=4, pin_memory=True, drop_last=True),
    'valid':data.DataLoader(dataset['valid'], batch_size=bs, shuffle=False,
                            num_workers=4, pin_memory=True, drop_last=True),
    'test':data.DataLoader(dataset['test'], batch_size=bs, shuffle=False,
                            num_workers=4, pin_memory=True, drop_last=True)

}

# Class names or target labels
class_names = dataset['train'].classes
class_to_idx = dataset['train'].class_to_idx
print("Classes:", class_names)
print("Class_to_idx:", class_to_idx)
 
# Print the train and validation data sizes
print("Training-set size:",dataset_sizes['train'],
      "\nValidation-set size:", dataset_sizes['valid'],
      "\nTest-set size:", dataset_sizes['test'],
     )



datafile =  dataset/trainset_8000_4.1_None_64_0_None_512_512.pk
class = cry, len = 3156
class = nocry, len = 3941
dataset train len = 6982
datafile =  dataset/trainset_8000_4.1_None_64_0_None_512_512.pk
class = cry, len = 401
class = nocry, len = 490
dataset valid len = 881
datafile =  dataset/trainset_8000_4.1_None_64_0_None_512_512.pk
class = cry, len = 381
class = nocry, len = 441
dataset test len = 814
Classes: ['cry', 'nocry']
Class_to_idx: {'cry': 0, 'nocry': 1}
Training-set size: 6982 
Validation-set size: 881 
Test-set size: 814


In [9]:
# Set default device as gpu, if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
# Instantiate a neural network model 
model_ft = BlazeNet(back_model=2)

# Print model summary
print('Model Summary:-\n')
for num, (name, param) in enumerate(model_ft.named_parameters()):
    print(num, name, param.requires_grad )

model_ft = model_ft.to(device)
summary(model_ft, input_size=(3, 64, 64))
print(model_ft)

Model Summary:-

0 backbone1.0.weight True
1 backbone1.0.bias True
2 backbone1.2.convs.0.weight True
3 backbone1.2.convs.0.bias True
4 backbone1.2.convs.1.weight True
5 backbone1.2.convs.1.bias True
6 backbone1.3.convs.0.weight True
7 backbone1.3.convs.0.bias True
8 backbone1.3.convs.1.weight True
9 backbone1.3.convs.1.bias True
10 backbone1.4.convs.0.weight True
11 backbone1.4.convs.0.bias True
12 backbone1.4.convs.1.weight True
13 backbone1.4.convs.1.bias True
14 backbone1.5.convs.0.weight True
15 backbone1.5.convs.0.bias True
16 backbone1.5.convs.1.weight True
17 backbone1.5.convs.1.bias True
18 backbone1.6.convs.0.weight True
19 backbone1.6.convs.0.bias True
20 backbone1.6.convs.1.weight True
21 backbone1.6.convs.1.bias True
22 backbone1.7.convs.0.weight True
23 backbone1.7.convs.0.bias True
24 backbone1.7.convs.1.weight True
25 backbone1.7.convs.1.bias True
26 backbone1.8.convs.0.weight True
27 backbone1.8.convs.0.bias True
28 backbone1.8.convs.1.weight True
29 backbone1.8.convs.1

In [14]:
# Loss function
weight = torch.tensor([1.0, 1.0]).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)
#criterion = nn.BCELoss(weight=weight)


In [15]:
# Model training routine 
print("\nTraining:-\n")
def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Tensorboard summary
    writer = SummaryWriter()
    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0

            # Iterate over data.
            for inputs, labels, srcs, inds, vocals in dataloaders[phase]:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Record training loss and accuracy for each phase
            if phase == 'train':
                writer.add_scalar('Train/Loss', epoch_loss, epoch)
                writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
                writer.flush()
            else:
                writer.add_scalar('Valid/Loss', epoch_loss, epoch)
                writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
                writer.flush()

            # deep copy the model
            if phase == 'valid' and epoch_acc >= best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model



Training:-



In [16]:
# Optimizer 
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=20, gamma=0.1)
# Train the model
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,num_epochs=60)
# Save the entire model
print("\nSaving the model...")
model_file = "checkpoints/blazenet_{}.pth".format(dataset_file.split('/')[-1]) 
torch.save(model_ft, model_file)

Epoch 0/59
----------
train Loss: 0.4481 Acc: 0.7777
valid Loss: 0.3220 Acc: 0.8445

Epoch 1/59
----------


KeyboardInterrupt: 

In [16]:
# Class label names
class_names=['cry','nocry']

# Initialize the prediction and label lists
predlist=torch.zeros(0,dtype=torch.long, device='cpu')
lbllist=torch.zeros(0,dtype=torch.long, device='cpu')

# Evaluate the model accuracy on the dataset
correct = 0
total = 0
with torch.no_grad():
    for images, labels, srcs, inds, vocals in dataloaders["test"]:
        images, labels = images.to(device), labels.to(device)
        outputs = model_ft(images)
        _, predicted = torch.max(outputs.data, 1)

        total += labels.size(0)
        #print(labels,predicted)        
        correct += (predicted == labels).sum().item()

        predlist=torch.cat([predlist,predicted.view(-1).cpu()])
        lbllist=torch.cat([lbllist,labels.view(-1).cpu()])

# Overall accuracy
overall_accuracy=100 * correct / total
print('Accuracy of the network on the {:d} test images: {:.2f}%'.format(dataset_sizes['test'], 
    overall_accuracy))

# Confusion matrix
conf_mat=confusion_matrix(lbllist.numpy(), predlist.numpy())
print('Confusion Matrix')
print('-'*16)
print(conf_mat,'\n')

# Per-class accuracy
class_accuracy=100*conf_mat.diagonal()/conf_mat.sum(1)
print('Per class accuracy')
print('-'*18)
for label,accuracy in zip(dataset['test'].classes, class_accuracy):
     class_name=label
     print('Accuracy of class %8s : %0.2f %%'%(class_name, accuracy))



Accuracy of the network on the 814 test images: 92.75%
Confusion Matrix
----------------
[[362  19]
 [ 39 380]] 

Per class accuracy
------------------
Accuracy of class      cry : 95.01 %
Accuracy of class    nocry : 90.69 %


In [17]:
testset_file = 'dataset/testset_8000_4.1_None_64_0_None_512_512.pk'
test_dataset2 = AudioDataset(
    testset_file,    
    subset="test",
    mode = "RGB",
    transform = image_transforms["test"]
)
test_loader2 = data.DataLoader(
    test_dataset2,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)
for i, (img,label,src,ind,vocal) in enumerate(test_loader2):
    print(i)
    print(img.shape)
    print(label)
    print(src)
    print(ind)
    print(vocal)
    break  


datafile =  dataset/testset_8000_4.1_None_64_0_None_512_512.pk
class = cry, len = 10
class = noncry, len = 0
dataset test len = 553
0
torch.Size([32, 3, 64, 64])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
('Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10C

In [25]:
predictions = [] 

with torch.no_grad():
    for images, labels, srcs, inds, vocals in test_loader2:
        images, labels = images.to(device), labels.to(device)
        outputs = model_ft(images)
        
        _, predicted = torch.max(outputs.data, 1)
        
        probs = torch.softmax(outputs.data, 1)[:,0]
                
        probs = probs.cpu().detach().numpy()
        inds = inds.detach().numpy()
        vocals = vocals.detach().numpy()


        print(srcs)
        print(probs)
        print(inds)
        print(vocals)        
        #print(predicted)

        total += labels.size(0)        
        pred = predicted.tolist()       
        
        for k in range(len(srcs)):
            predictions.append((srcs[k],probs[k],inds[k],vocals[k])) # 1 is cry 


('Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10Cry/7376bd92-3ccd-4672-bf08-2bb5dd992b30.mp3', 'Self/10C

In [27]:
predictions_by_audio = {} 

cry_thresh = 0.5 

audio_files = [x[0] for x in predictions]
audio_files = list(set(audio_files))

for au in audio_files:
    predictions_by_audio[au] = [] 
    
for p in predictions: 
    predictions_by_audio[p[0]].append((p[1],p[2],p[3],p[1]>cry_thresh))
    
for au in predictions_by_audio:
    print(au)
    print(predictions_by_audio[au])

Self/10Cry/146B9C6FA25A_monitoringOff_1619739616502.mp3
[(0.0006914061, 0, True, False), (0.00084187713, 1, True, False), (0.07199191, 2, True, False), (0.27040952, 3, True, False), (5.2814958e-05, 4, True, False), (0.007960474, 5, True, False), (4.764928e-05, 6, True, False), (0.66214305, 7, True, True), (0.99995315, 8, True, True), (0.99998915, 9, True, True), (0.9989028, 10, True, True), (1.0, 11, True, True), (0.9999999, 12, True, True), (1.0, 13, True, True), (1.0, 14, True, True), (0.99999857, 15, True, True), (1.0, 16, True, True), (1.0, 17, True, True), (1.0, 18, True, True), (1.0, 19, True, True), (1.0, 20, True, True), (0.9998288, 21, True, True), (0.9994912, 22, True, True), (1.0, 23, True, True), (0.99999833, 24, True, True), (1.0, 25, True, True), (0.036855433, 26, True, False), (0.9892537, 27, True, True), (0.9637172, 28, True, True), (0.0002938557, 29, True, False), (0.0047844998, 30, True, False), (9.8744364e-05, 31, True, False), (0.4373048, 32, True, False), (0.000293

In [20]:
print(len(predictions_by_audio))

10
