In [1]:
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
import torchaudio
import time
# to fix relative import problem
import os
import sys
sys.path.append('../')

  '"sox" backend is being deprecated. '


In [2]:
root_dir   = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/"
train_file = root_dir+"listgame_Train_300.npy"
valid_file = root_dir+"listgame_Valid_100.npy"
test_file  = root_dir+"listgame_Test_100.npy"

In [3]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
%%bash
nvidia-smi

Wed Nov  4 00:41:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:59:00.0 Off |                    0 |
| N/A   37C    P0    51W / 350W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

#  Test loading of some samples from train_samples.npy

In [5]:
from torch.utils.data import Dataset, DataLoader

class soccernet_ms_npy_audio_only(Dataset):
    """Soccernet Dataset"""
    
    
    def __init__(self,npy_file,
                 root_dir,
                 transform=None,
                 train=True):
    
        self.samples = np.load(root_dir+npy_file,allow_pickle=True)        # GENERALIZE
        self.root_dir = root_dir
        self.transform = transform
        self.train = train
       
    def __len__(self):
        return len(self.samples)
    

    def __getitem__(self,idx):
        """Returns a sample containing video path, clip and label"""
        
        if torch.is_tensor(idx):
            idx.tolist()
        
        if self.train:
            if idx in [2209,2210,2212,2213,2215,2217,2222]: # ultradirty hack - fix later
                idx = 0
        
        path = str(self.samples[idx]['audiopath'][:-11]+str(idx)+"_ms.npy")
        ms = np.load(path)
        ms = ms-np.min(ms) / (np.max(ms)-np.min(ms))
        label = self.samples[idx]['label']
        info = self.samples[idx]['annotation']
        idx_old = self.samples[idx]['idx']
        
        sample = {'path': path,
                  'ms':ms,'idx': idx_old,
                  'label':label, 'info':info}
        
        return sample
    


In [None]:
# Clip dataset
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import os
import json
import torchvision
import datetime
from subprocess import Popen, PIPE
import re

class SoccerNetDataset(Dataset):
    """Soccernet Dataset"""
    
    def __init__(self,npy_file,
                 npy_file_audio,
                 root_dir,nframes=1,
                 stride_frames=1,
                 frame_center='center',
                 transform=None,
                 train=False,
                 tshift={'active':False,
                         'mu':0,'sigma':(0.4/3), 
                         'interval':[-0.45,0.45],
                         'mode': 'uniform'}):
        self.npy_file = np.load(npy_file)
        self.audio_samples = np.load(root_dir+npy_file_audio,allow_pickle=True)
        self.samples = list() # maybe change structure later depending on efficiency        
        self.tshift = tshift
        self.root_dir = root_dir
        self.transform = transform
        self.frame_center = frame_center
        self.nframes = nframes
        self.stride_frames = stride_frames
        self.train = train
        # For each path in npy_file, get all annotations


        for e in self.npy_file:
            path, annotations = self.get_annotations(e)

            duration1 = self.getVideoLength(self.root_dir + e + "/1.mkv")
            duration2 = self.getVideoLength(self.root_dir + e + "/2.mkv")
            #print(f"duration1 : {duration1}, duration2: {duration2}")
            for annotation in annotations:
                # Check that annotations hold correct labels
                        if ("card" in annotation["label"]) or ("subs" in annotation["label"]) or ("soccer" in annotation["label"]):
                            annotation["duration1"] = duration1
                            annotation["duration2"] = duration2
                            self.samples.append([path,annotation,"soccernet"])


    def __len__(self):
        return len(self.samples)
    
    def getVideoLength(self,video_file):
        res = Popen(['ffmpeg', '-i', video_file, '-hide_banner'],stdout=PIPE,stderr=PIPE)
        none,meta = res.communicate()
        meta_out = meta.decode()
        #---| Take out info
        duration = re.search(r'Duration:.*', meta_out)
        return duration.group()[:21]

    def __getitem__(self,idx):
        """Returns a sample containing video path, clip and label"""
        if torch.is_tensor(idx):
            idx.tolist()
        
        if self.train:
            if idx in [2209,2210,2212,2213,2215,2217,2222]: # ultradirty hack - fix later
                idx = 0
        
        if self.train:
            if idx in [2209,2210,2212,2213,2215,2217,2222]: # ultradirty hack - fix later
                idx = 0
        
        path = str(self.audio_samples[idx]['audiopath'][:-11]+str(idx)+"_ms.npy")
        ms = np.load(path)
        ms = ms-np.min(ms) / (np.max(ms)-np.min(ms))
        label = self.samples[idx]['label']
        info = self.samples[idx]['annotation']
        idx_old = self.samples[idx]['idx']
        
        audio_sample = {'path': path,
                  'ms':ms,'idx': idx_old,
                  'label':label, 'info':info}
        
        
        # get annotations
        time_half = int(self.samples[idx][1]["gameTime"][0])
        time_minute = int(self.samples[idx][1]["gameTime"][-5:-3])
        time_second = int(self.samples[idx][1]["gameTime"][-2:])
        annotation = self.samples[idx][1]
        source = self.samples[idx][2]
        


        # Get label
        if ("card" in annotation["label"]): label = 0
        elif ("subs" in annotation["label"]): label = 1
        elif ("soccer" in annotation["label"]): label = 2
        elif ("background" in annotation["label"]): label = 3
        else: 
            print("Warning, label not compatible with set")
            return
            
        # Get videopath
        if source == 'soccernet':
            vidpath = os.path.join(self.root_dir,
                                str(self.samples[idx][0]),
                                    str(time_half)+".mkv")
        else:
            vidpath = str(self.samples[idx][0])
        
        
        # Get video frames 
        
        # get start in second, use labeled time as center TODO: fix centerframe as keyframe and stride
        fps = 25.0 # assume fps = 25 for now, should be so
        start_sec = time_minute*60 + time_second
        end_sec = start_sec

        if self.nframes == 1:
            end_sec = start_sec
        
        if start_sec == 0:
            end_sec += (1/fps) # possibly unstable solution
            
        
        if self.frame_center == 'center' and self.nframes > 1:
            
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
            # Shift backwards to center around time but check that time > 0
            diff = (end_sec - start_sec) / 2 # TODO : Might result in bad precision
            temp_start_sec = start_sec - diff
            temp_end_sec = end_sec - diff
            
            # Only change as long as the shift operation doesnt shift out of bounds 
            if temp_start_sec >= 0:
                start_sec = temp_start_sec
                end_sec = temp_end_sec
                
            # TODO : Find new samplesize if self.stride_frames > 1
            # if self.stride_frames > 1:
            # For now, this is an operation for another place
                
        elif self.frame_center == 'back' and self.nframes > 1:
            print("This option should NOT be used during inference, please use 'center' instead")
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
        elif self.frame_center == 'front' and self.nframes > 1:
            #print("This option should NOT be used during inference, please use 'center' instead")
            
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
            
            # Shift forward such that the last frame is at annotated time t around time but check that time > 0
            diff = (end_sec - start_sec) # TODO : Might result in bad precision
            temp_start_sec = start_sec - diff
            temp_end_sec = end_sec - diff
            
            # Only change as long as the shift operation doesnt shift out of bounds 
            if temp_start_sec >= 0:
                start_sec = temp_start_sec
                end_sec = temp_end_sec

        
        # Temporal translation transform
        
        if self.tshift['active'] and self.frame_center == 'center' and source == 'soccernet':
            t0 = self.tshift['interval'][0]
            t = self.tshift['interval'][1]
            
            if self.tshift['mode'] == 'uniform':    
                delta = np.floor(np.random.uniform(t0,t) * self.nframes)
            elif self.tshift['mode'] == 'normal':
                mu = self.tshift['mu']
                sigma = self.tshift['sigma']
                temporal_window_size = self.nframes / 25.0
                delta = np.random.normal(mu,sigma)
                if delta < t0:
                    delta = t0
                elif delta > t:
                    delta = t
                delta = delta * temporal_window_size
            else: return "Please choose uniform or normal distribution"
            
            # change delta from frames to seconds with correct stepsize
            shifted_start = start_sec+delta
            shifted_end = end_sec+delta
            # Verify that shifted window stays inside 
            # get duration of video
            if time_half == 1:
                video_length = self.samples[idx][1]["duration1"]
            elif time_half == 2:
                video_length = self.samples[idx][1]["duration2"]

            video_length_min = video_length[-8:-6]
            video_length_sec = video_length[-5:-3]
            total_sec = int(video_length_min)*60 + int(video_length_sec)
            
            if shifted_start < 0 or shifted_end > total_sec:
                shifted_start = start_sec
                shifted_end = end_sec
            
            start_sec = shifted_start
            end_sec = shifted_end

        # Buffer to endsec incase of bad load
        end_sec = end_sec + 0.9 # loads more frames than needed, then reduced later
        clip,_,info = torchvision.io.read_video(vidpath, start_pts=start_sec, end_pts=end_sec, pts_unit='sec')
        
        # TODO : This should be tested
        clip = clip[:self.nframes,:,:,:]
        

        one_hot_labels = np.zeros(3)
        one_hot_labels[label] = 1

        csize = clip.size()
        abnormal_count = 0
        bad_count = 0   
        abnormal_2_count = 0
        # At this point clip is [T x H x W x C]
        if clip.size()[3] != 3 or clip.size()[0] != self.nframes or clip.size()[1] != 224 or clip.size()[2] != 398:
            abnormal_count += 1
            if clip.size()[1] != 224 or clip.size()[2] != 398 or clip.size()[0] / self.nframes < 0.6:
                clip = torch.zeros([self.nframes,224,398,3]).byte()
                bad_count += 1
            elif clip.size()[1] == 224 and clip.size()[2] == 398 and clip.size()[0] / self.nframes > 0.6:
                a = clip.size()[0]
                b = self.nframes - a
                last_f = clip[(a-1):,:,:,:]
                dup = last_f.repeat(b,1,1,1)
                clip = torch.cat((clip,dup)).byte()
                abnormal_2_count += 1

        

        sample = {'vidpath': vidpath,'clip': clip, 'annotation':annotation,'label':one_hot_labels,'idx':idx,
                 'csize':csize, 'source': source, 'audio_sample': audio_sample}
        

        if self.transform:
            sample['clip'] = self.transform(sample['clip'])
        return sample
            
    def get_annotations(self,path):
        """ Reads json files and returns """
        with open(self.root_dir+path+"/Labels.json") as jsonfile:
            json_label = json.load(jsonfile)
        
        labels = [e for e in json_label['annotations']]
        
        return path,labels
    def get_keyframe(self,idx):
        if self.frame_center == 'back': return self.__getitem__(idx)['clip'][0,:,:,:]
        elif self.frame_center == 'center': return self.__getitem__(idx)['clip'][self.nframes//2,:,:,:]
        elif self.frame_center == 'front': return self.__getitem__(idx)['clip'][self.nframes-1,:,:,:]
    def describe(self):
        card = 0
        subs = 0
        goal = 0
        background = 0

        for sample in self.samples:
            annotation = sample[1]
        # Get label
            if ("card" in annotation["label"]): card += 1
            elif ("subs" in annotation["label"]): subs +=1
            elif ("soccer" in annotation["label"]): goal += 1
            elif ("background" in annotation["label"]): background += 1

        print("Description of dataset\n\n")
        print("\n ********* Classes *********")
        print("\n card = 0\n subs = 1\n goals = 2\n background = 3")

        print("\n ********* Distribution and count *********")
        print(f"\n N card: {card} \n N subs: {subs} \n N goal: {goal} \n N background: {background} \n \n Total : {card+subs+goal+background}")
        
        print("\n\n ********* Configuration *********")
        print(f"\n npy_file: {self.npy_file} \n tshift: {self.tshift} \n root_dir: {self.root_dir} \n transform: {self.transform} \n frame_center: {self.frame_center} \n nframes: {self.nframes} \n stride_frames: {self.stride_frames} \n background: {self.background}")
        print("\n\n ********* End of description *********")

# Dataloader

In [6]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Note: Can also see warning once
# warnings.filterwarnings(action='once')

In [7]:
params = {'batch_size': 12,
         'shuffle': True,
         'num_workers':6}

In [119]:
X_train_npy = soccernet_ms_npy_audio_only(root_dir=root_dir,npy_file="train_samples.npy")
#X_valid_npy = soccernet_ms_npy_DS(root_dir=root_dir,npy_file="valid_samples.npy")
X_test_npy = soccernet_ms_npy_audio_only(root_dir=root_dir,npy_file="test_samples.npy")

params = {'batch_size': 12,
         'shuffle': True,
         'num_workers':6}
trainloader = DataLoader(X_train_npy,**params)
testloader = DataLoader(X_test_npy,**params)

In [99]:
trainloader = DataLoader(X_train_npy,**params)
testloader = DataLoader(X_test_npy,**params)

# Network

In [10]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.bn1 = nn.BatchNorm2d(6)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.bn2 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(4, 4)
        self.fc1 = nn.Linear(2208, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = x.view(-1, 2208)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x



In [11]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/simple_net3')

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = Net()

In [13]:
net.to(device)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (bn1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=2208, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=3, bias=True)
)

In [14]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.025, momentum=0.9)

In [15]:
%%bash
nvidia-smi

Wed Nov  4 00:41:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:59:00.0 Off |                    0 |
| N/A   38C    P0    67W / 350W |   1406MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [16]:

# Dataloader or dataset faulty(returns batches of 0
epochs = 20
for epoch in range(epochs):
    running_loss = 0.0
    
    net.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['ms'].to(device),data['label'].to(device)

        #print(inputs.size())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:    # print every 2000 mini-batches
            
            writer.add_scalar('training loss',
                            running_loss / 5,
                            epoch * len(trainloader) + i)
            
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
    
    # calculate accuracy
    with torch.no_grad():
        net.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(testloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['ms'].to(device),data['label'].to(device)

            # forward + backward + optimize
            outputs = net(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_correct / N_total
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")
        
print('Finished Training')

[1,     1] loss: 0.241
[1,     6] loss: 1.153
[1,    11] loss: 1.107
[1,    16] loss: 0.992
[1,    21] loss: 1.034
[1,    26] loss: 0.907
[1,    31] loss: 0.669
[1,    36] loss: 1.124
[1,    41] loss: 0.953
[1,    46] loss: 0.691
[1,    51] loss: 0.742
[1,    56] loss: 0.929
[1,    61] loss: 0.679
[1,    66] loss: 0.686
[1,    71] loss: 0.598
[1,    76] loss: 0.801
[1,    81] loss: 0.734
[1,    86] loss: 0.700
[1,    91] loss: 0.857
[1,    96] loss: 0.695
[1,   101] loss: 0.875
[1,   106] loss: 0.882
[1,   111] loss: 0.905
[1,   116] loss: 0.840
[1,   121] loss: 0.630
[1,   126] loss: 0.844
[1,   131] loss: 0.948
[1,   136] loss: 0.982
[1,   141] loss: 0.891
[1,   146] loss: 0.887
[1,   151] loss: 0.698
[1,   156] loss: 0.683
[1,   161] loss: 0.725
[1,   166] loss: 0.776
[1,   171] loss: 0.837
[1,   176] loss: 0.680
[1,   181] loss: 0.863
[1,   186] loss: 0.732
[1,   191] loss: 0.649
[1,   196] loss: 0.561
[1,   201] loss: 0.987
[1,   206] loss: 0.706
[1,   211] loss: 0.728
[1,   216] 

[6,    76] loss: 0.498
[6,    81] loss: 0.497
[6,    86] loss: 0.471
[6,    91] loss: 0.474
[6,    96] loss: 0.504
[6,   101] loss: 0.554
[6,   106] loss: 0.363
[6,   111] loss: 0.357
[6,   116] loss: 0.375
[6,   121] loss: 0.312
[6,   126] loss: 0.354
[6,   131] loss: 0.396
[6,   136] loss: 0.256
[6,   141] loss: 0.417
[6,   146] loss: 0.420
[6,   151] loss: 0.319
[6,   156] loss: 0.297
[6,   161] loss: 0.299
[6,   166] loss: 0.455
[6,   171] loss: 0.278
[6,   176] loss: 0.424
[6,   181] loss: 0.291
[6,   186] loss: 0.576
[6,   191] loss: 0.466
[6,   196] loss: 0.355
[6,   201] loss: 0.583
[6,   206] loss: 0.418
[6,   211] loss: 0.365
[6,   216] loss: 0.482
[6,   221] loss: 0.276
[6,   226] loss: 0.477
[6,   231] loss: 0.479
[6,   236] loss: 0.328
[6,   241] loss: 0.485
[6,   246] loss: 0.347
[6,   251] loss: 0.424
[6,   256] loss: 0.377
[6,   261] loss: 0.301
[6,   266] loss: 0.499
[6,   271] loss: 0.323
[6,   276] loss: 0.561
[6,   281] loss: 0.337
[6,   286] loss: 0.559
[6,   291] 

[11,   121] loss: 0.223
[11,   126] loss: 0.252
[11,   131] loss: 0.263
[11,   136] loss: 0.299
[11,   141] loss: 0.244
[11,   146] loss: 0.205
[11,   151] loss: 0.291
[11,   156] loss: 0.354
[11,   161] loss: 0.163
[11,   166] loss: 0.261
[11,   171] loss: 0.246
[11,   176] loss: 0.333
[11,   181] loss: 0.272
[11,   186] loss: 0.400
[11,   191] loss: 0.226
[11,   196] loss: 0.275
[11,   201] loss: 0.362
[11,   206] loss: 0.367
[11,   211] loss: 0.268
[11,   216] loss: 0.323
[11,   221] loss: 0.263
[11,   226] loss: 0.290
[11,   231] loss: 0.286
[11,   236] loss: 0.294
[11,   241] loss: 0.262
[11,   246] loss: 0.430
[11,   251] loss: 0.276
[11,   256] loss: 0.168
[11,   261] loss: 0.441
[11,   266] loss: 0.147
[11,   271] loss: 0.355
[11,   276] loss: 0.298
[11,   281] loss: 0.259
[11,   286] loss: 0.354
[11,   291] loss: 0.219
[11,   296] loss: 0.163
[11,   301] loss: 0.595
[11,   306] loss: 0.296
[11,   311] loss: 0.477
[11,   316] loss: 0.413
[11,   321] loss: 0.451
[11,   326] loss

[16,   121] loss: 0.170
[16,   126] loss: 0.171
[16,   131] loss: 0.276
[16,   136] loss: 0.231
[16,   141] loss: 0.278
[16,   146] loss: 0.095
[16,   151] loss: 0.189
[16,   156] loss: 0.249
[16,   161] loss: 0.181
[16,   166] loss: 0.181
[16,   171] loss: 0.265
[16,   176] loss: 0.205
[16,   181] loss: 0.132
[16,   186] loss: 0.318
[16,   191] loss: 0.204
[16,   196] loss: 0.224
[16,   201] loss: 0.298
[16,   206] loss: 0.198
[16,   211] loss: 0.315
[16,   216] loss: 0.351
[16,   221] loss: 0.199
[16,   226] loss: 0.218
[16,   231] loss: 0.272
[16,   236] loss: 0.301
[16,   241] loss: 0.143
[16,   246] loss: 0.133
[16,   251] loss: 0.289
[16,   256] loss: 0.227
[16,   261] loss: 0.232
[16,   266] loss: 0.260
[16,   271] loss: 0.208
[16,   276] loss: 0.254
[16,   281] loss: 0.278
[16,   286] loss: 0.271
[16,   291] loss: 0.483
[16,   296] loss: 0.227
[16,   301] loss: 0.337
[16,   306] loss: 0.238
[16,   311] loss: 0.383
[16,   316] loss: 0.216
[16,   321] loss: 0.189
[16,   326] loss

In [120]:
with torch.no_grad():
        net.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(testloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['ms'].to(device),data['label'].to(device)

            # forward + backward + optimize
            outputs = torch.softmax(net(inputs),dim=1)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_correct / N_total
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")

Epoch : 10, Accuracy : 0.8122238516807556


# densenet


In [None]:
soccernet_ms_npy_audio_only

In [None]:
X_train_npy = soccernet_ms_npy_audio_only(root_dir=root_dir,npy_file="train_samples.npy")
#X_valid_npy = soccernet_ms_npy_DS(root_dir=root_dir,npy_file="valid_samples.npy")
X_test_npy = soccernet_ms_npy_audio_only(root_dir=root_dir,npy_file="test_samples.npy")

params = {'batch_size': 12,
         'shuffle': True,
         'num_workers':3}
trainloader = DataLoader(X_train_npy,**params)
testloader = DataLoader(X_test_npy,**params)

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/dense_net_4')

In [None]:
densenet = torchvision.models.densenet161(pretrained=True)
densenet.features.conv0 = nn.Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
densenet.classifier = nn.Linear(in_features=densenet.classifier.in_features, out_features=3,bias=True)

In [None]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(densenet.parameters(), lr=0.001, momentum=0.9)

In [None]:
densenet.to(device)

In [None]:

# Dataloader or dataset faulty(returns batches of 0
epochs = 15
for epoch in range(epochs):
    running_loss = 0.0
    
    densenet.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['ms'].to(device),data['label'].to(device)

        #print(inputs.size())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = densenet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 20 == 0:    # print every 2000 mini-batches
            
            writer.add_scalar('training loss',
                            running_loss / 20,
                            epoch * len(trainloader) + i)
            
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0
    
    # calculate accuracy
    with torch.no_grad():
        densenet.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(testloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['ms'].to(device),data['label'].to(device)

            # forward + backward + optimize
            outputs = densenet(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_correct / N_total
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")
        
print('Finished Training')

In [None]:
with torch.no_grad():
        densenet.train()
        res = torch.zeros((3,3))
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['ms'].to(device),data['label'].to(device)

            # forward + backward + optimize
            outputs = densenet(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            print(outputs)
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_correct / N_total
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")

In [None]:
res.diag().sum() / res.sum()

# Save validation list for dataset loading

In [None]:
X = soccernet_ms_npy_DS(root_dir=root_dir,npy_file=train_file)

In [None]:
np.save(file=root_dir+"valid_samples.npy",arr=samples,allow_pickle=True)

In [None]:
s = np.load(root_dir+"valid_samples.npy",allow_pickle=True)

In [None]:
len(s)

# Save test list for dataset loading

In [None]:
X = SoccerNetDataset(root_dir=root_dir,npy_file=test_file)

In [None]:
samples = list()
for e in X:
    samples.append(e)
    

In [None]:
np.save(file=root_dir+"test_samples.npy",arr=samples,allow_pickle=True)

In [None]:
s = np.load(root_dir+"test_samples.npy",allow_pickle=True)

In [None]:
len(s)

# Simple network

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.bn1 = nn.BatchNorm2d(6)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.bn2 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(4, 4)
        self.fc1 = nn.Linear(2208, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 4)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = x.view(-1, 2208)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

In [None]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)

In [None]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Note: Can also see warning once
# warnings.filterwarnings(action='once')

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/mel_spec_experiment_4')

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)

In [None]:
outputs

In [None]:
(labels == torch.argmax(outputs, dim=1)).sum() 

In [None]:
labels

In [None]:
torch.argmax(outputs, dim=1)

In [None]:
len(dataloader)

In [None]:
writer = SummaryWriter('runs/mel_spec_experiment_4')

In [None]:
net.to(device)
# Dataloader or dataset faulty(returns batches of 0
epochs = 2
for epoch in range(epochs):
    running_loss = 0.0
    
    net.train()
    for i, data in enumerate(dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['mel_spectogram'].to(device),data['label'].to(device)

        #print(inputs.size())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:    # print every 2000 mini-batches
            
            writer.add_scalar('training loss',
                            running_loss / 5,
                            epoch * len(dataloader) + i)
            
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
    
    # calculate accuracy
    with torch.no_grad():
        net.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['mel_spectogram'].to(device),data['label'].to(device)

            # forward + backward + optimize
            outputs = net(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_total / N_correct
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")
        
print('Finished Training')

In [None]:
writer.close()

# Evaluate samples to look for cases where y, sr from librosa is inconsistent

In [None]:
outputs = net(inputs)

In [None]:
inputs.size()

In [None]:
net.fc1.weight

# With GPU

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
net.to(device)

running_loss = 0.0
for i, data in enumerate(dataloader, 0):
    # get the inputs; data is a list of [inputs, labels]
    inputs, labels = data['mel_spectogram'].unsqueeze(0).to(device),torch.argmax(data['one_hot_label']).to(device)

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    if i % 2000 == 1999:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0

print('Finished Training')

In [None]:
%%bash
nvidia-smi

# Video dataset

In [17]:
# Clip dataset
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import os
import json
import torchvision
import datetime
from subprocess import Popen, PIPE
import re

class SoccerNetDataset(Dataset):
    """Soccernet Dataset"""
    
    def __init__(self,npy_file,
                 npy_file_audio,
                 root_dir,nframes=1,
                 stride_frames=1,
                 frame_center='center',
                 transform=None,
                 train=False,
                 tshift={'active':False,
                         'mu':0,'sigma':(0.4/3), 
                         'interval':[-0.45,0.45],
                         'mode': 'uniform'}):
        self.npy_file = np.load(npy_file)
        self.audio_samples = np.load(root_dir+npy_file_audio,allow_pickle=True)
        self.samples = list() # maybe change structure later depending on efficiency        
        self.tshift = tshift
        self.root_dir = root_dir
        self.transform = transform
        self.frame_center = frame_center
        self.nframes = nframes
        self.stride_frames = stride_frames
        self.train = train
        # For each path in npy_file, get all annotations


        for e in self.npy_file:
            path, annotations = self.get_annotations(e)

            duration1 = self.getVideoLength(self.root_dir + e + "/1.mkv")
            duration2 = self.getVideoLength(self.root_dir + e + "/2.mkv")
            #print(f"duration1 : {duration1}, duration2: {duration2}")
            for annotation in annotations:
                # Check that annotations hold correct labels
                        if ("card" in annotation["label"]) or ("subs" in annotation["label"]) or ("soccer" in annotation["label"]):
                            annotation["duration1"] = duration1
                            annotation["duration2"] = duration2
                            self.samples.append([path,annotation,"soccernet"])


    def __len__(self):
        return len(self.samples)
    
    def getVideoLength(self,video_file):
        res = Popen(['ffmpeg', '-i', video_file, '-hide_banner'],stdout=PIPE,stderr=PIPE)
        none,meta = res.communicate()
        meta_out = meta.decode()
        #---| Take out info
        duration = re.search(r'Duration:.*', meta_out)
        return duration.group()[:21]

    def __getitem__(self,idx):
        """Returns a sample containing video path, clip and label"""
        if torch.is_tensor(idx):
            idx.tolist()
        
        if self.train:
            if idx in [2209,2210,2212,2213,2215,2217,2222]: # ultradirty hack - fix later
                idx = 0
        
        if self.train:
            if idx in [2209,2210,2212,2213,2215,2217,2222]: # ultradirty hack - fix later
                idx = 0
        
        path = str(self.audio_samples[idx]['audiopath'][:-11]+str(idx)+"_ms.npy")
        ms = np.load(path)
        ms = ms-np.min(ms) / (np.max(ms)-np.min(ms))
        label = self.audio_samples[idx]['label']
        info = self.audio_samples[idx]['annotation']
        #print(info)
        idx_old = self.audio_samples[idx]['idx']
        
        audio_sample = {'path': path,
                  'ms':ms,'idx': idx_old,
                  'label':label, 'info':info}
        
        
        # get annotations
        time_half = int(self.samples[idx][1]["gameTime"][0])
        time_minute = int(self.samples[idx][1]["gameTime"][-5:-3])
        time_second = int(self.samples[idx][1]["gameTime"][-2:])
        annotation = self.samples[idx][1]
        source = self.samples[idx][2]
        


        # Get label
        if ("card" in annotation["label"]): label = 0
        elif ("subs" in annotation["label"]): label = 1
        elif ("soccer" in annotation["label"]): label = 2
        elif ("background" in annotation["label"]): label = 3
        else: 
            print("Warning, label not compatible with set")
            return
            
        # Get videopath
        if source == 'soccernet':
            vidpath = os.path.join(self.root_dir,
                                str(self.samples[idx][0]),
                                    str(time_half)+".mkv")
        else:
            vidpath = str(self.samples[idx][0])
        
        
        # Get video frames 
        
        # get start in second, use labeled time as center TODO: fix centerframe as keyframe and stride
        fps = 25.0 # assume fps = 25 for now, should be so
        start_sec = time_minute*60 + time_second
        end_sec = start_sec

        if self.nframes == 1:
            end_sec = start_sec
        
        if start_sec == 0:
            end_sec += (1/fps) # possibly unstable solution
            
        
        if self.frame_center == 'center' and self.nframes > 1:
            
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
            # Shift backwards to center around time but check that time > 0
            diff = (end_sec - start_sec) / 2 # TODO : Might result in bad precision
            temp_start_sec = start_sec - diff
            temp_end_sec = end_sec - diff
            
            # Only change as long as the shift operation doesnt shift out of bounds 
            if temp_start_sec >= 0:
                start_sec = temp_start_sec
                end_sec = temp_end_sec
                
            # TODO : Find new samplesize if self.stride_frames > 1
            # if self.stride_frames > 1:
            # For now, this is an operation for another place
                
        elif self.frame_center == 'back' and self.nframes > 1:
            print("This option should NOT be used during inference, please use 'center' instead")
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
        elif self.frame_center == 'front' and self.nframes > 1:
            #print("This option should NOT be used during inference, please use 'center' instead")
            
            end_sec = end_sec + (self.nframes/fps) # might need to subtract 1/fps
            
            # Shift forward such that the last frame is at annotated time t around time but check that time > 0
            diff = (end_sec - start_sec) # TODO : Might result in bad precision
            temp_start_sec = start_sec - diff
            temp_end_sec = end_sec - diff
            
            # Only change as long as the shift operation doesnt shift out of bounds 
            if temp_start_sec >= 0:
                start_sec = temp_start_sec
                end_sec = temp_end_sec

        
        # Temporal translation transform
        
        if self.tshift['active'] and self.frame_center == 'center' and source == 'soccernet':
            t0 = self.tshift['interval'][0]
            t = self.tshift['interval'][1]
            
            if self.tshift['mode'] == 'uniform':    
                delta = np.floor(np.random.uniform(t0,t) * self.nframes)
            elif self.tshift['mode'] == 'normal':
                mu = self.tshift['mu']
                sigma = self.tshift['sigma']
                temporal_window_size = self.nframes / 25.0
                delta = np.random.normal(mu,sigma)
                if delta < t0:
                    delta = t0
                elif delta > t:
                    delta = t
                delta = delta * temporal_window_size
            else: return "Please choose uniform or normal distribution"
            
            # change delta from frames to seconds with correct stepsize
            shifted_start = start_sec+delta
            shifted_end = end_sec+delta
            # Verify that shifted window stays inside 
            # get duration of video
            if time_half == 1:
                video_length = self.samples[idx][1]["duration1"]
            elif time_half == 2:
                video_length = self.samples[idx][1]["duration2"]

            video_length_min = video_length[-8:-6]
            video_length_sec = video_length[-5:-3]
            total_sec = int(video_length_min)*60 + int(video_length_sec)
            
            if shifted_start < 0 or shifted_end > total_sec:
                shifted_start = start_sec
                shifted_end = end_sec
            
            start_sec = shifted_start
            end_sec = shifted_end

        # Buffer to endsec incase of bad load
        end_sec = end_sec + 0.9 # loads more frames than needed, then reduced later
        clip,_,info = torchvision.io.read_video(vidpath, start_pts=start_sec, end_pts=end_sec, pts_unit='sec')
        
        # TODO : This should be tested
        clip = clip[:self.nframes,:,:,:]
        

        one_hot_labels = np.zeros(3)
        one_hot_labels[label] = 1

        csize = clip.size()
        abnormal_count = 0
        bad_count = 0   
        abnormal_2_count = 0
        # At this point clip is [T x H x W x C]
        if clip.size()[3] != 3 or clip.size()[0] != self.nframes or clip.size()[1] != 224 or clip.size()[2] != 398:
            abnormal_count += 1
            if clip.size()[1] != 224 or clip.size()[2] != 398 or clip.size()[0] / self.nframes < 0.6:
                clip = torch.zeros([self.nframes,224,398,3]).byte()
                bad_count += 1
            elif clip.size()[1] == 224 and clip.size()[2] == 398 and clip.size()[0] / self.nframes > 0.6:
                a = clip.size()[0]
                b = self.nframes - a
                last_f = clip[(a-1):,:,:,:]
                dup = last_f.repeat(b,1,1,1)
                clip = torch.cat((clip,dup)).byte()
                abnormal_2_count += 1

        

        sample = {'vidpath': vidpath,'clip': clip, 'annotation':annotation,'label':one_hot_labels,'idx':idx,
                 'csize':csize, 'source': source, 'audio_sample': audio_sample}
        

        if self.transform:
            sample['clip'] = self.transform(sample['clip'])
        return sample
            
    def get_annotations(self,path):
        """ Reads json files and returns """
        with open(self.root_dir+path+"/Labels.json") as jsonfile:
            json_label = json.load(jsonfile)
        
        labels = [e for e in json_label['annotations']]
        
        return path,labels
    def get_keyframe(self,idx):
        if self.frame_center == 'back': return self.__getitem__(idx)['clip'][0,:,:,:]
        elif self.frame_center == 'center': return self.__getitem__(idx)['clip'][self.nframes//2,:,:,:]
        elif self.frame_center == 'front': return self.__getitem__(idx)['clip'][self.nframes-1,:,:,:]
    def describe(self):
        card = 0
        subs = 0
        goal = 0
        background = 0

        for sample in self.samples:
            annotation = sample[1]
        # Get label
            if ("card" in annotation["label"]): card += 1
            elif ("subs" in annotation["label"]): subs +=1
            elif ("soccer" in annotation["label"]): goal += 1
            elif ("background" in annotation["label"]): background += 1

        print("Description of dataset\n\n")
        print("\n ********* Classes *********")
        print("\n card = 0\n subs = 1\n goals = 2\n background = 3")

        print("\n ********* Distribution and count *********")
        print(f"\n N card: {card} \n N subs: {subs} \n N goal: {goal} \n N background: {background} \n \n Total : {card+subs+goal+background}")
        
        print("\n\n ********* Configuration *********")
        print(f"\n npy_file: {self.npy_file} \n tshift: {self.tshift} \n root_dir: {self.root_dir} \n transform: {self.transform} \n frame_center: {self.frame_center} \n nframes: {self.nframes} \n stride_frames: {self.stride_frames} \n background: {self.background}")
        print("\n\n ********* End of description *********")

In [122]:
len(videods_test)

1358

In [18]:
npy_file_train = "/work/oarongve/data/listgame_Train_300.npy"
npy_file_test = "/work/oarongve/data/listgame_Test_100.npy"
root_dir = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/"

In [19]:
import torchvision

In [20]:
import torchvision.models.video as video_models

In [21]:
model = video_models.resnet.r3d_18(pretrained=True)

In [22]:
# change last layer..

model.fc = torch.nn.Linear(model.fc.in_features,3)

# Change first input as well

model.stem[0] = torch.nn.Conv3d(3,model.stem[0].out_channels,kernel_size=(1,7,7),stride=(10,2,2),padding=model.stem[0].padding,bias=model.stem[0].bias)

In [23]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

In [26]:
params = {'batch_size': 12
         'shuffle': True,
         'num_workers':6}

SyntaxError: invalid syntax (<ipython-input-26-fa410f5b77da>, line 2)

In [None]:
# Video model

In [27]:
## Transforms
class ReSize(object):
    def __init__(self,output_size,interpolation='bilinear'):
        self.output_size = output_size
        self.interpolation = interpolation
    
    def __call__(self,clip):
        c,t,h,w = clip.size()
        return video_transform.F.resize(clip,self.output_size,interpolation_mode=self.interpolation)

In [28]:
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as video_transform
std = (0.22803, 0.22145, 0.216989)
mean = (0.43216, 0.394666, 0.37645)

transform = transforms.Compose(
        [video_transform.ToTensorVideo(),
         ReSize((112,112)),
        video_transform.RandomHorizontalFlipVideo(0.5),
        video_transform.NormalizeVideo(std,mean)])

In [29]:
videods = SoccerNetDataset(root_dir=root_dir,
                           npy_file=npy_file_train,
                           npy_file_audio='train_samples.npy',
                           train=True,
                            nframes=4*25, transform=transform)

videods_test = SoccerNetDataset(root_dir=root_dir,
                           npy_file=npy_file_train,
                           npy_file_audio='test_samples.npy',
                           train=True,
                            nframes=4*25, transform=transform)

In [30]:
trainloader = DataLoader(videods,**params)

In [118]:
params

{'batch_size': 12, 'shuffle': True, 'num_workers': 6}

In [103]:
len(testloader)

114

In [31]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [123]:
%%bash
nvidia-smi

Wed Nov  4 15:06:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:59:00.0 Off |                    0 |
| N/A   42C    P0    69W / 350W |   4964MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [33]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/video_only_prototypev4')

In [35]:
model.to(device)
# Dataloader or dataset faulty(returns batches of 0
epochs = 15
for epoch in range(epochs):
    running_loss = 0.0
    
    model.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['clip'].to(device),data['label'].to(device)
        target = torch.argmax(labels,dim=1)
        #print(inputs.size())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:    # print every 2000 mini-batches
            
            writer.add_scalar('training loss',
                            running_loss / 5,
                            epoch * len(trainloader) + i)
            
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
    
    # calculate accuracy
    with torch.no_grad():
        model.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['clip'].to(device),data['label'].to(device)
            target = torch.argmax(labels,dim=1)

            # forward + backward + optimize
            outputs = model(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,target):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_total / N_correct
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")
        
print('Finished Training')

[1,     1] loss: 0.136
[1,     6] loss: 0.608
[1,    11] loss: 0.640
[1,    16] loss: 0.668
[1,    21] loss: 0.536
[1,    26] loss: 0.563
[1,    31] loss: 0.590
[1,    36] loss: 0.516
[1,    41] loss: 0.639
[1,    46] loss: 0.620
[1,    51] loss: 0.584
[1,    56] loss: 0.566
[1,    61] loss: 0.670
[1,    66] loss: 0.661
[1,    71] loss: 0.578
[1,    76] loss: 0.544
[1,    81] loss: 0.575
[1,    86] loss: 0.480
[1,    91] loss: 0.579
[1,    96] loss: 0.601
[1,   101] loss: 0.465
[1,   106] loss: 0.576
[1,   111] loss: 0.527
[1,   116] loss: 0.620
[1,   121] loss: 0.590
[1,   126] loss: 0.547
[1,   131] loss: 0.560
[1,   136] loss: 0.486
[1,   141] loss: 0.678
[1,   146] loss: 0.553
[1,   151] loss: 0.510
[1,   156] loss: 0.553
[1,   161] loss: 0.585
[1,   166] loss: 0.563
[1,   171] loss: 0.543
[1,   176] loss: 0.587
[1,   181] loss: 0.432
[1,   186] loss: 0.490
[1,   191] loss: 0.396
[1,   196] loss: 0.493
[1,   201] loss: 0.459
[1,   206] loss: 0.618
[1,   211] loss: 0.608
[1,   216] 

[8,   146] loss: 0.226
[8,   151] loss: 0.250
[8,   156] loss: 0.316
[8,   161] loss: 0.189
[8,   166] loss: 0.303
[8,   171] loss: 0.309
[8,   176] loss: 0.370
[8,   181] loss: 0.359
[8,   186] loss: 0.271
[8,   191] loss: 0.293
[8,   196] loss: 0.217
[8,   201] loss: 0.262
[8,   206] loss: 0.270
[8,   211] loss: 0.226
[8,   216] loss: 0.181
[8,   221] loss: 0.307
Epoch : 7, Accuracy : 1.077152967453003
[9,     1] loss: 0.136
[9,     6] loss: 0.230
[9,    11] loss: 0.260
[9,    16] loss: 0.223
[9,    21] loss: 0.323
[9,    26] loss: 0.300
[9,    31] loss: 0.331
[9,    36] loss: 0.254
[9,    41] loss: 0.274
[9,    46] loss: 0.276
[9,    51] loss: 0.338
[9,    56] loss: 0.277
[9,    61] loss: 0.259
[9,    66] loss: 0.277
[9,    71] loss: 0.210
[9,    76] loss: 0.181
[9,    81] loss: 0.350
[9,    86] loss: 0.222
[9,    91] loss: 0.248
[9,    96] loss: 0.263
[9,   101] loss: 0.198
[9,   106] loss: 0.233
[9,   111] loss: 0.269
[9,   116] loss: 0.288
[9,   121] loss: 0.251
[9,   126] loss: 

KeyboardInterrupt: 

# Audiovisual train

In [None]:
model.to(device)
# Dataloader or dataset faulty(returns batches of 0
epochs = 15
for epoch in range(epochs):
    running_loss = 0.0
    
    model.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['clip'].to(device),data['label'].to(device)
        target = torch.argmax(labels,dim=1)
        #print(inputs.size())
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:    # print every 2000 mini-batches
            
            writer.add_scalar('training loss',
                            running_loss / 5,
                            epoch * len(trainloader) + i)
            
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
    
    # calculate accuracy
    with torch.no_grad():
        net.eval()
        res = torch.zeros((3,3))
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data['clip'].to(device),data['label'].to(device)
            target = torch.argmax(labels,dim=1)

            # forward + backward + optimize
            outputs = model(inputs)
            
            preds = torch.argmax(outputs,dim=1)
            
            for p,gt in zip(preds,labels):
                res[int(p),int(gt)] += 1
            

        

        N_total = res.sum()
        N_correct = res.diag().sum()
        
        acc = N_total / N_correct
        
        writer.add_scalar('training acc',
                    acc,
                    epoch)
        print(f"Epoch : {epoch}, Accuracy : {acc}")
        
print('Finished Training')

# Audiovisual eval


In [64]:
res = torch.zeros((3,3))
res_audio = torch.zeros((3,3))
res_video = torch.zeros((3,3))

In [67]:
net.to(device)
model.to(device)
# Dataloader or dataset faulty(returns batches of 0

# calculate accuracy
with torch.no_grad():
    print("eval starting...")
    net.eval()
    res = torch.zeros((3,3))
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['clip'].to(device),data['label'].to(device)
        inputs_audio = data['audio_sample']['ms'].to(device)
        
        target = torch.argmax(labels,dim=1)

        # forward + backward + optimize
        outputs = model(inputs)
        outputs_audio = net(inputs_audio)
        
        avg_pred = (outputs + outputs_audio) / 2
        preds = torch.argmax(avg_pred,dim=1)
        preds_audio = torch.argmax(outputs,dim=1)
        preds_video = torch.argmax(outputs_audio,dim=1)

        for p,gt in zip(preds,target):
            res[int(p),int(gt)] += 1

        
        # audio
        for p,gt in zip(preds_audio, target):
            res_audio[int(p),int(gt)] += 1
        # video
        for p,gt in zip(preds_video, target):
            res_video[int(p),int(gt)] += 1
        



    N_total = res.sum()
    N_correct = res.diag().sum()

    acc = N_total / N_correct

    writer.add_scalar('training acc',
                acc,
                epoch)
    print(f"Epoch : {epoch}, Accuracy : {acc}")

print('Finished Training')

eval starting...
Epoch : 10, Accuracy : 1.0200668573379517
Finished Training


In [None]:
videods_test = SoccerNetDataset(root_dir=root_dir,
                           npy_file=npy_file_test,
                           npy_file_audio='test_samples.npy',
                           train=False,
                            nframes=4*25, transform=transform)

testloader = DataLoader(videods_test,**params)

In [131]:
net.to(device)
model.to(device)
# Dataloader or dataset faulty(returns batches of 0

# calculate accuracy
with torch.no_grad():
    print("eval starting...")
    net.eval()
    res = torch.zeros((3,3))
    res_audio = torch.zeros((3,3))
    res_video = torch.zeros((3,3))
    for i, data in enumerate(testloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['clip'].to(device),data['label'].to(device)
        inputs_audio = data['audio_sample']['ms'].to(device)
        label_audio = data['audio_sample']['label'].to(device)
        #label_audio = 
        
        target = torch.argmax(labels,dim=1)
        target_audio = label_audio

        # forward + backward + optimize
        outputs = torch.softmax(model(inputs),dim=1)
        outputs_audio = torch.softmax(net(inputs_audio),dim=1)
        
        
        
        avg_pred = (outputs + outputs_audio) / 2
        preds = torch.argmax(avg_pred,dim=1)
        preds_audio = torch.argmax(outputs_audio,dim=1)
        preds_video = torch.argmax(outputs,dim=1)

        for p,gt in zip(preds,target):
            res[int(p),int(gt)] += 1

        
        # audio
        for p,gt in zip(preds_audio, target_audio):
            res_audio[int(p),int(gt)] += 1
        # video
        for p,gt in zip(preds_video, target):
            res_video[int(p),int(gt)] += 1
        



    N_total = res.sum()
    N_correct = res.diag().sum()

    acc = N_correct / N_total

    print(f"Epoch : {epoch}, Accuracy : {acc}")

print('Finished Training')

eval starting...


Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x15543d265208>>
Traceback (most recent call last):
  File "/home/oarongve/.local/share/virtualenvs/project-daredevil-8eBKzQn6/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/home/oarongve/.local/share/virtualenvs/project-daredevil-8eBKzQn6/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 122, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x15543d265208>>
Traceback (most recent call last

Epoch : 10, Accuracy : 1.1279069185256958
Finished Training


In [132]:
len(testloader)

114

In [133]:
N_total = res.sum()
N_correct = res.diag().sum()

acc = N_correct / N_total


In [134]:
N_total_audio = res_audio.sum()
N_correct_audio = res_audio.diag().sum()

acc_audio = N_correct_audio / N_total_audio


In [135]:
N_total_video = res_video.sum()
N_correct_video = res_video.diag().sum()

acc_video = N_correct_video / N_total_video

In [136]:
print(acc)
print(acc_audio)
print(acc_video)

tensor(0.8866)
tensor(0.8122)
tensor(0.8513)


In [50]:
a = torch.rand((12,3))
b = torch.rand((12,3))

In [58]:
c = (a+b) / 2

c

tensor([[0.5143, 0.4434, 0.4136],
        [0.2918, 0.7081, 0.5464],
        [0.6591, 0.4193, 0.6603],
        [0.5454, 0.0187, 0.4433],
        [0.6008, 0.6664, 0.5428],
        [0.0740, 0.5168, 0.4054],
        [0.6531, 0.5964, 0.5219],
        [0.6565, 0.3273, 0.8152],
        [0.2550, 0.7734, 0.5334],
        [0.6792, 0.6497, 0.4694],
        [0.5513, 0.5988, 0.6295],
        [0.4152, 0.4855, 0.4012]])

In [87]:
torch.softmax(c,dim=1)

tensor([[0.3526, 0.3285, 0.3189],
        [0.2627, 0.3984, 0.3389],
        [0.3587, 0.2822, 0.3591],
        [0.4010, 0.2368, 0.3621],
        [0.3321, 0.3546, 0.3133],
        [0.2532, 0.3942, 0.3526],
        [0.3544, 0.3348, 0.3108],
        [0.3458, 0.2488, 0.4053],
        [0.2500, 0.4198, 0.3302],
        [0.3595, 0.3490, 0.2915],
        [0.3195, 0.3350, 0.3455],
        [0.3269, 0.3507, 0.3224]])

In [60]:
    N_total = res.sum()
    N_correct = res.diag().sum()

    acc = N_correct / N_total

In [61]:
N_total

tensor(3965.)

In [62]:
N_correct

tensor(3884.)

In [63]:
acc

tensor(0.9796)

In [None]:
with torch.no_grad():
    net.eval()
    res = torch.zeros((3,3))
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data['audio_samples']['ms'].to(device),data['audio_samples']['label'].to(device)
        target = torch.argmax(labels,dim=1)

        # forward + backward + optimize
        outputs = model(inputs)
        preds = torch.argmax(outputs,dim=1)

        for p,gt in zip(preds,labels):
            res[int(p),int(gt)] += 1




    N_total = res.sum()
    N_correct = res.diag().sum()

    acc = N_total / N_correct

    writer.add_scalar('training acc',
                acc,
                epoch)
    print(f"Epoch : {epoch}, Accuracy : {acc}")

print('Finished Training')