In [1]:
import numpy as np
import os, sys
sys.path.append('../')
from src.datasets.soccernet_generic_wcombined import soccernet_dataset_generic
from src.utils.helper import samples_by_language
import src.utils.training_helper as training_helper
import torch
from torch.utils.data import Dataset, DataLoader

  '"sox" backend is being deprecated. '


In [2]:
root_dir = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/"
train_list = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/listgame_Train_300.npy"
valid_list = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/listgame_Valid_100.npy"
test_list = "/work/oarongve/data/sound_dataset/SoccerNet-code/data/listgame_Test_100.npy"


In [3]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
%%bash
nvidia-smi

Mon Dec 21 15:46:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:E2:00.0 Off |                    0 |
| N/A   27C    P0    48W / 350W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [5]:
langpath_train = '/work/oarongve/project-daredevil/project-daredevil/language-annotations/annotations/train_lang_dict.json'
langpath_valid = '/work/oarongve/project-daredevil/project-daredevil/language-annotations/annotations/valid_lang_dict.json'
langpath_test = '/work/oarongve/project-daredevil/project-daredevil/language-annotations/annotations/test_lang_dict.json'

samples_train_all = samples_by_language(langpath_train,train_list,'all')
samples_valid_all = samples_by_language(langpath_valid,valid_list,'all')
samples_test_all = samples_by_language(langpath_test,test_list,'all')


In [6]:
test_set_all = soccernet_dataset_generic(npy_file=test_list,root_dir=root_dir,lang='all',lang_dict=langpath_test)
test_set_all.load_waves()
test_set_all.generate_mel_spectrograms(load_features=True)
test_set_all.load_resnet_features()
test_set_all.load_combined()

100%|██████████| 100/100 [00:13<00:00,  7.18it/s]
100%|██████████| 100/100 [00:04<00:00, 23.86it/s]
100%|██████████| 100/100 [00:00<00:00, 107.35it/s]
100%|██████████| 100/100 [00:01<00:00, 56.09it/s]


In [7]:
valid_set_all = soccernet_dataset_generic(npy_file=valid_list,root_dir=root_dir,lang='all',lang_dict=langpath_valid)


In [8]:
params = {'batch_size': 32,
         'shuffle': True,
         'num_workers':4}

# window size

w = 8

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
valid_set_all = soccernet_dataset_generic(npy_file=valid_list,root_dir=root_dir,lang='all',lang_dict=langpath_valid)

valid_set_all.load_waves()
valid_set_all.generate_mel_spectrograms(load_features=True)
valid_set_all.load_resnet_features()
valid_set_all.load_combined()

100%|██████████| 100/100 [00:12<00:00,  8.16it/s]
100%|██████████| 100/100 [00:03<00:00, 25.34it/s]
100%|██████████| 100/100 [00:00<00:00, 108.44it/s]
100%|██████████| 100/100 [00:01<00:00, 59.19it/s]


In [10]:
validloader_all = DataLoader(valid_set_all,**params)
testloader_all = DataLoader(valid_set_all,**params)


# Case 1 : Train on combined features

## Window size

In [11]:
import torchvision
import torch.optim as optim
import torch.nn as nn
def generate_model():
    model = torchvision.models.resnet18(pretrained=True)
    model.conv1 = nn.Conv2d(1,64,kernel_size=(7,7),stride=(2,2),padding=(3,3),bias=False)
    model.fc = nn.Linear(512,4,bias=True)
    #resnet.features.conv0 = nn.Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    #resnet.classifier = nn.Linear(in_features=densenet.classifier.in_features, out_features=3,bias=True)


    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    return model, criterion,optimizer

In [12]:
windows = [2,4,8,16,32]

# Case 2: Visual

In [13]:
import torchvision
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self,feature_size,window_size_seconds,T_field_seconds):
        super(Net, self).__init__()
        
        T = window_size_seconds*2
        T_field = T_field_seconds*2
        fc1_size = (32)*(T-(T_field-1))
        
        self.conv1 = nn.Conv2d(1,out_channels=32,kernel_size=(1,feature_size))
        self.bn1 = nn.BatchNorm2d(self.conv1.out_channels)
        self.conv2 = nn.Conv2d(1,out_channels=32,kernel_size=(T_field,32))
        self.bn2 = nn.BatchNorm2d(self.conv2.out_channels)
        self.fc1 = nn.Linear(fc1_size, 240)
        self.fc2 = nn.Linear(self.fc1.out_features, 240)
        self.fc3 = nn.Linear(240, 4)

    def forward(self, x):
        
                
        x = F.relu(self.bn1(self.conv1(x)))
        x = x.permute(0,3,2,1)
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.reshape(x.size(0),-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

## Window size

In [14]:
def evaluate_model(model,dataloader,feature_name,device):
    #if not feature_name == 'combined_spot' or not feature_name == 'resnet_spot':
    #    return "Bad feature name!"
    running_loss=0.0
    model.to(device)
    best_acc = 0
    with torch.no_grad():
        model.eval()
        res = torch.zeros((4,4))
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs = data[feature_name].unsqueeze(1).to(device)
            label = data['label'].to(device)

            # forward + backward + optimize
            outputs = model(inputs)

            preds = torch.argmax(outputs,dim=1)

            for p,gt in zip(preds,label):
                res[int(p),int(gt)] += 1




        N_total = res.sum()
        N_correct = res.diag().sum()

        acc = N_correct / N_total
        print(f"Accuracy : {acc}")
    
    return acc

In [15]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self,feature_size,window_size_seconds,T_field_seconds):
        super(Net, self).__init__()
        
        T = window_size_seconds*2
        T_field = T_field_seconds*2
        fc1_size = (32)*(T-(T_field-1))
        
        self.conv1 = nn.Conv2d(1,out_channels=32,kernel_size=(1,feature_size))
        self.bn1 = nn.BatchNorm2d(self.conv1.out_channels)
        self.conv2 = nn.Conv2d(1,out_channels=32,kernel_size=(T_field,32))
        self.bn2 = nn.BatchNorm2d(self.conv2.out_channels)
        self.fc1 = nn.Linear(fc1_size, 240)
        self.fc2 = nn.Linear(self.fc1.out_features, 240)
        self.fc3 = nn.Linear(240, 4)

    def forward(self, x):
        
                
        x = F.relu(self.bn1(self.conv1(x)))
        x = x.permute(0,3,2,1)
        x = F.relu(self.bn2(self.conv2(x)))
        x = x.reshape(x.size(0),-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Case 3 B) Softmax average

In [16]:
def evaluate_softmax_fusion(dataloader,visual_model,audio_model):
    visual_model.to(device)
    audio_model.to(device)
    visual_model.eval()
    audio_model.eval()
    
    with torch.no_grad():
        res_audio = torch.zeros((4,4))
        res_visual = torch.zeros((4,4))
        res_average = torch.zeros((4,4))
        res_smax = torch.zeros((4,4))
        res_lmax = torch.zeros((4,4))
        
        all_labels = list()
        all_preds = list()
        all_preds_raw = list()
        
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs_audio = data['ms_spot'].unsqueeze(1).to(device)
            inputs_visual = data['resnet_spot'].unsqueeze(1).to(device)
            label = data['label'].to(device)

            # forward + backward + optimize
            outputs_audio = audio_model(inputs_audio)
            outputs_visual = visual_model(inputs_visual)
            
            softmax_audio = torch.softmax(outputs_audio,dim=1) # MAKE SURE DIMS ARE CORRECT, TEST
            softmax_visual = torch.softmax(outputs_visual,dim=1)
            softmax_average = (softmax_audio + softmax_visual) / 2

            logit_max = torch.max(outputs_audio,outputs_visual)
            softmax_max = torch.max(softmax_audio,softmax_visual)
            
            
            preds_audio = torch.argmax(outputs_audio,dim=1)
            preds_visual = torch.argmax(outputs_visual,dim=1)
            preds_average = torch.argmax(softmax_average,dim=1)
            preds_smax = torch.argmax(softmax_max,dim=1)
            preds_lmax = torch.argmax(logit_max,dim=1)
            
            all_preds_raw.append((softmax_audio,softmax_visual,softmax_average))
            all_preds.append((preds_audio,preds_visual,preds_average,preds_smax,preds_lmax))
            all_labels.append(label)

            for p,gt in zip(preds_audio,label):
                res_audio[int(p),int(gt)] += 1
            
            for p,gt in zip(preds_visual,label):
                res_visual[int(p),int(gt)] += 1

            for p,gt in zip(preds_average,label):
                res_average[int(p),int(gt)] += 1
                
            for p,gt in zip(preds_smax,label):
                res_smax[int(p),int(gt)] += 1
                
            for p,gt in zip(preds_lmax,label):
                res_lmax[int(p),int(gt)] += 1
                
                
        # Audio
        N_total_audio = res_audio.sum()
        N_correct_audio = res_audio.diag().sum()
        acc_audio = N_correct_audio / N_total_audio
        
        N_total_visual = res_visual.sum()
        N_correct_visual = res_visual.diag().sum()
        acc_visual = N_correct_visual / N_total_visual
        
        N_total_average = res_average.sum()
        N_correct_average = res_average.diag().sum()
        acc_average = N_correct_average / N_total_average
        
        N_total_smax = res_smax.sum()
        N_correct_smax = res_smax.diag().sum()
        acc_smax = N_correct_smax / N_total_smax
        
        N_total_lmax = res_lmax.sum()
        N_correct_lmax = res_lmax.diag().sum()
        acc_lmax = N_correct_lmax / N_total_lmax
        
        print(f"Audio Accuracy : {acc_audio},\
              Visual Accuracy : {acc_visual},\
              Average Accuracy : {acc_average},\
              smax Accuracy : {acc_smax},\
              lmax Accuracy : {acc_lmax}")
        
        return acc_audio,acc_visual,acc_average,acc_smax,acc_lmax,(res_audio,res_visual,res_average,res_smax),(all_preds,all_labels,all_preds_raw)
    

In [17]:
avg_w_accs_valid = list()
avg_w_accs_test = list()

for w in windows:
    print(w)
    valid_set_all.set_window_size(w)
    test_set_all.set_window_size(w)
    
    validloader = DataLoader(valid_set_all,**params)
    testloader = DataLoader(test_set_all,**params)
    
    # generate models
    visual_model = Net(512,w,w//2)
    audio_model, _,_ = generate_model()
    
    # load models
    visual_model.load_state_dict(torch.load("./weights/best_model_visual_"+str(w)+".pth"))
    audio_model.load_state_dict(torch.load("./weights/best_model_audio_"+str(w)+".pth"))
    
    avg_w_accs_valid.append(evaluate_softmax_fusion(validloader,visual_model,audio_model))
    avg_w_accs_test.append(evaluate_softmax_fusion(testloader,visual_model,audio_model))
    

2
Audio Accuracy : 0.6353846192359924,              Visual Accuracy : 0.810769259929657,              Average Accuracy : 0.7871794700622559,              smax Accuracy : 0.7846153974533081,              lmax Accuracy : 0.7543589472770691
Audio Accuracy : 0.6111387610435486,              Visual Accuracy : 0.7981103658676147,              Average Accuracy : 0.7692690491676331,              smax Accuracy : 0.7608155012130737,              lmax Accuracy : 0.7284932732582092
4
Audio Accuracy : 0.6825640797615051,              Visual Accuracy : 0.8425641059875488,              Average Accuracy : 0.8230769038200378,              smax Accuracy : 0.8230769038200378,              lmax Accuracy : 0.8112820386886597
Audio Accuracy : 0.6727995872497559,              Visual Accuracy : 0.8438587784767151,              Average Accuracy : 0.8239681720733643,              smax Accuracy : 0.8184982538223267,              lmax Accuracy : 0.8204873204231262
8
Audio Accuracy : 0.7215384840965271,           

In [25]:
import sklearn.metrics

In [26]:
a = 3
p_type = 2

In [61]:
def generate_c_report(l,w):
    labels = torch.cat(l[w][6][1]).cpu()
    
    reports = list()
    
    for pred_type in range(len(l[w][6][0][0])):
        current_preds = list()

        for p in l[w][6][0]:

            current_preds.append(p[pred_type].cpu())

        preds = torch.cat(current_preds)
        r = sklearn.metrics.classification_report(y_pred = preds,y_true=labels,output_dict=True)
        reports.append(r)
        
    return reports

In [28]:
windows = {w:i for i,w in enumerate([2,4,8,16,32])}

In [29]:
avg_w_accs_valid[2][6][1]

[tensor([3, 1, 3, 2, 3, 1, 3, 0, 3, 2, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 0, 2, 3, 0,
         1, 0, 3, 2, 0, 3, 0, 1], device='cuda:0'),
 tensor([2, 1, 3, 1, 0, 0, 3, 3, 1, 3, 2, 0, 3, 2, 1, 1, 0, 0, 3, 2, 2, 0, 3, 0,
         1, 0, 0, 3, 3, 1, 0, 2], device='cuda:0'),
 tensor([0, 3, 0, 3, 1, 0, 2, 3, 1, 1, 0, 0, 3, 1, 3, 0, 3, 0, 3, 0, 1, 1, 3, 3,
         1, 2, 3, 3, 0, 1, 3, 3], device='cuda:0'),
 tensor([1, 1, 3, 3, 1, 1, 2, 3, 3, 0, 3, 1, 0, 3, 1, 3, 3, 0, 2, 2, 3, 3, 1, 3,
         0, 2, 1, 3, 1, 1, 0, 1], device='cuda:0'),
 tensor([0, 0, 3, 1, 3, 2, 0, 3, 3, 3, 1, 3, 0, 3, 1, 3, 1, 0, 0, 3, 2, 0, 2, 3,
         0, 3, 1, 3, 1, 3, 2, 3], device='cuda:0'),
 tensor([1, 1, 1, 0, 2, 1, 1, 3, 1, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 3,
         1, 1, 3, 2, 0, 3, 0, 0], device='cuda:0'),
 tensor([0, 2, 1, 1, 1, 3, 1, 1, 1, 3, 0, 2, 1, 2, 3, 0, 2, 0, 1, 0, 0, 2, 3, 2,
         0, 1, 3, 1, 0, 1, 3, 3], device='cuda:0'),
 tensor([3, 0, 3, 3, 3, 1, 0, 3, 2, 3, 2, 0, 0, 3, 1, 3, 3, 0, 0, 0, 

In [62]:
reports = dict()
for w in [2,4,8,16,32]:
    reports[w] = generate_c_report(avg_w_accs_test,windows[w])


In [67]:
import pandas as pd


In [80]:
print(pd.DataFrame(reports[8][1]).T.round(3).to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &   support \\
\midrule
0            &      0.869 &   0.848 &     0.858 &   453.000 \\
1            &      0.909 &   0.883 &     0.896 &   579.000 \\
2            &      0.898 &   0.942 &     0.919 &   326.000 \\
3            &      0.830 &   0.845 &     0.838 &   653.000 \\
accuracy     &      0.872 &   0.872 &     0.872 &     0.872 \\
macro avg    &      0.876 &   0.879 &     0.878 &  2011.000 \\
weighted avg &      0.873 &   0.872 &     0.872 &  2011.000 \\
\bottomrule
\end{tabular}



In [65]:
len(avg_w_accs_valid[0][6][0][0])

5

In [187]:
y_p = list()
for e in avg_w_accs_valid[a][6][0]:
    y_p.append(e[p_type])
y_p = torch.cat(y_p)
y_p = y_p.cpu()

In [188]:
l = torch.cat(avg_w_accs_valid[a][6][1]).cpu()

In [189]:
print(sklearn.metrics.classification_report(target_names=['Card','Substitutuion','Goal','Background'],y_true=l,y_pred=y_p))

               precision    recall  f1-score   support

         Card       0.84      0.87      0.85       396
Substitutuion       0.91      0.92      0.91       562
         Goal       0.95      0.97      0.96       356
   Background       0.88      0.85      0.86       636

     accuracy                           0.89      1950
    macro avg       0.90      0.90      0.90      1950
 weighted avg       0.89      0.89      0.89      1950



In [81]:
avg_w_accs_valid[a][5][1]

tensor([[326.,  22.,   2.,  46.],
        [ 28., 495.,   0.,  52.],
        [  0.,   2., 326.,  42.],
        [ 42.,  43.,  28., 496.]])

In [82]:
avg_w_accs_valid[a][5][2]

tensor([[319.,  17.,   5.,  64.],
        [ 22., 473.,   1.,  83.],
        [  4.,   4., 336.,  12.],
        [ 51.,  68.,  14., 477.]])

In [112]:
avg_w_accs_test[a][5][3]

tensor([[357.,  30.,   4.,  61.],
        [ 32., 484.,   4.,  78.],
        [  1.,   3., 310.,  19.],
        [ 63.,  62.,   8., 495.]])

In [60]:
X = torch.rand(3,3)

In [63]:
X[1]

tensor([0.5400, 0.7260, 0.0283])

In [47]:
for e in avg_w_accs_valid:
    print(e)

(tensor(0.6354), tensor(0.8108), tensor(0.7872), tensor(0.7846), tensor(0.7544), (tensor([[210.,  61.,  25.,  96.],
        [ 56., 336.,   8., 120.],
        [ 28.,  10., 294.,  21.],
        [102., 155.,  29., 399.]]), tensor([[308.,  29.,   0.,  47.],
        [ 28., 472.,   0.,  58.],
        [  1.,   6., 331.,  61.],
        [ 59.,  55.,  25., 470.]]), tensor([[305.,  30.,   5.,  61.],
        [ 28., 439.,   2.,  80.],
        [  9.,   3., 327.,  31.],
        [ 54.,  90.,  22., 464.]]), tensor([[303.,  28.,   5.,  64.],
        [ 30., 444.,   3.,  87.],
        [ 11.,   3., 331.,  33.],
        [ 52.,  87.,  17., 452.]])))
(tensor(0.6826), tensor(0.8426), tensor(0.8231), tensor(0.8231), tensor(0.8113), (tensor([[245.,  45.,  15., 111.],
        [ 49., 361.,   3., 114.],
        [  8.,   6., 320.,   6.],
        [ 94., 150.,  18., 405.]]), tensor([[326.,  22.,   2.,  46.],
        [ 28., 495.,   0.,  52.],
        [  0.,   2., 326.,  42.],
        [ 42.,  43.,  28., 496.]]), tensor(

In [None]:
for wsize,tacc in zip(windows,avg_w_accs_test):
    print(wsize,tacc)

## Window size

## Window slide

In [None]:
avg_pos_accs_valid = list()
avg_pos_accs_test = list()

for pos in window_pos:
    print(w)
    valid_set_all.set_window_pos(pos)
    test_set_all.set_window_pos(pos)
    
    train_set_all.set_window_size(best_w)
    valid_set_all.set_window_size(best_w)
    test_set_all.set_window_size(best_w)
    
    validloader = DataLoader(valid_set_all,**params)
    testloader = DataLoader(test_set_all,**params)
    
    # generate models
    visual_model = Net(512,w,w//2)
    audio_model = model, _,_ = generate_model()
    
    # load models
    visual_model.load_state_dict(torch.load("./weights/best_model_visual_slide_"+str(pos)+".pth"))
    audio_model.load_state_dict(torch.load("./weights/best_model_audio_slide_"+str(pos)+".pth"))
    
    avg_pos_accs_valid.append(evaluate_softmax_fusion(validloader,visual_model,audio_model))
    avg_pos_accs_test.append(evaluate_softmax_fusion(testloader,visual_model,audio_model))
    

## Language

# Video

In [9]:
import torchvision.transforms._transforms_video as video_transform
import torchvision.transforms as transforms
import src.datasets.utils.custom_transforms as custom_transforms

mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

newsize=(112,112)

transform_train_list = [video_transform.ToTensorVideo()]
transform_train_list.append(custom_transforms.ReSize(newsize))
transform_train_list.append(video_transform.RandomHorizontalFlipVideo(0.5))
transform_train_list.append(video_transform.NormalizeVideo(std,mean,inplace=True))

transform_valid_list = [video_transform.ToTensorVideo()]
transform_valid_list.append(custom_transforms.ReSize(newsize))
transform_valid_list.append(video_transform.NormalizeVideo(std,mean,inplace=True))



In [10]:


transform_train = transforms.Compose(transform_train_list)
transform_valid = transforms.Compose(transform_valid_list)

In [11]:
from src.datasets.soccernet_generic_wvideo import soccernet_dataset_generic as vid_set

In [12]:
vid_train_set = vid_set(npy_file=train_list,root_dir=root_dir,lang='all',transform=transform_train,lang_dict=langpath_train)
vid_train_set.load_waves()
vid_train_set.generate_mel_spectrograms(load_features=True)
vid_train_set.load_resnet_features()

100%|██████████| 300/300 [00:49<00:00,  6.00it/s]
100%|██████████| 300/300 [00:15<00:00, 18.88it/s]
100%|██████████| 300/300 [00:02<00:00, 117.55it/s]


In [15]:
vid_test_set = vid_set(npy_file=test_list,root_dir=root_dir,lang='all',transform=transform_valid,lang_dict=langpath_train)
vid_test_set.load_waves()
vid_test_set.generate_mel_spectrograms(load_features=True)
vid_test_set.load_resnet_features()

100%|██████████| 100/100 [00:11<00:00,  8.62it/s]
100%|██████████| 100/100 [00:03<00:00, 26.62it/s]
100%|██████████| 100/100 [00:00<00:00, 124.21it/s]


In [16]:
vid_valid_set = vid_set(npy_file=valid_list,root_dir=root_dir,lang='all',transform=transform_valid,lang_dict=langpath_train)
vid_valid_set.load_waves()
vid_valid_set.generate_mel_spectrograms(load_features=True)
vid_valid_set.load_resnet_features()

100%|██████████| 100/100 [00:13<00:00,  7.25it/s]
100%|██████████| 100/100 [00:05<00:00, 17.75it/s]
100%|██████████| 100/100 [00:01<00:00, 80.96it/s]


In [17]:
params = {'batch_size': 32,
         'shuffle': True,
         'num_workers':16}

# window size

w = 16

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
# window size
vid_train_set.set_window_size(w)
vid_valid_set.set_window_size(w)
vid_test_set.set_window_size(w)


In [19]:
vid_train_set.set_window_size(w)
vid_valid_set.set_window_size(w)
vid_test_set.set_window_size(w)
trainloader_all_v = DataLoader(vid_train_set,**params)
validloader_all_v= DataLoader(vid_valid_set,**params)
testloader_all_v= DataLoader(vid_test_set,**params)

In [20]:
import torch.nn as nn
import torch
import torchvision
def generate_vid_model():
    model = torchvision.models.video.r3d_18(pretrained=True)
    model.fc = nn.Linear(512,4,bias=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.1)
    model = nn.DataParallel(model)
    
    return model, criterion, optimizer, scheduler
    

In [21]:
import copy
def train_vmod(model, criterion,optimizer,scheduler,epochs,trainloader,validloader,device):
    model.to(device)
    running_loss=0.0
    best_acc = 0

    for epoch in range(epochs):
        scheduler.step()
        for i, data in enumerate(trainloader_all_v,0):

            model.train()
            inputs = data['clip'][:, :, ::5, :, :].to(device)
            target = data['label'].to(device)
            optimizer.zero_grad()
            
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            #break
            if i % 10:
                print(i)
            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
                running_loss += loss.item()

        # calculate accuracy
        with torch.no_grad():
            model.eval()
            res = torch.zeros((4,4))
            for i, data in enumerate(validloader_all_v, 0):
                # get the inputs; data is a list of [inputs, labels]
                inputs = data['clip'][:, :, ::5, :, :].to(device)
                label = data['label'].to(device)

                # forward + backward + optimize
                outputs = model(inputs)

                preds = torch.argmax(outputs,dim=1)

                for p,gt in zip(preds,label):
                    res[int(p),int(gt)] += 1


            N_total = res.sum()
            N_correct = res.diag().sum()

            acc = N_correct / N_total
            if acc > best_acc:
                print("new best acc")
                best_acc = acc
                best_model = copy.deepcopy(model) 

            print(f" Accuracy : {acc}")

    print('Finished Training')
    return best_acc,best_model


In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

accs = list()
for w in [16]:
    a = list()
    vid_train_set.set_window_size(w)
    vid_valid_set.set_window_size(w)
    vid_test_set.set_window_size(w)
    
    trainloader_all_v = DataLoader(vid_train_set,**params)
    validloader_all_v= DataLoader(vid_valid_set,**params)
    testloader_all_v= DataLoader(vid_test_set,**params)
    
    model, criterion, optimizer, scheduler = generate_vid_model()
    a,best_model = train_vmod(model, criterion,optimizer,scheduler,12,trainloader_all_v,validloader_all_v,device)
    torch.save(best_model.state_dict(), "./weights/best_model_vid_"+str(w)+".pth")

1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
21
22
23
24
25
26
27
28
29
31
32
33
34
35
36
37
38
39
41
42
43
44
45
46
47
48
49
51
52
53
54
55
56
57
58
59
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
81
82
83
84
85
86
87
88
89
91
92
93
94
95
96
97
98
99
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
121
122
123
124
125
126
127
128
129
131
132
133
134
135
136
137
138
139
141
142
143
144
145
146
147
148
149
151
152
153
154
155
156
157
158
159
161
162
163
164
165
166
167
168
169
171
172
173
174
175
176
177
178
179
181
new best acc
 Accuracy : 0.7984615564346313
1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
21
22
23
24
25
26
27
28
29
31
32
33
34
35
36
37
38
39
41
42
43
44
45
46
47
48
49
51
52
53
54
55
56
57
58
59
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
81
82
83
84
85
86
87
88
89
91
92
93
94
95
96
97
98
99
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
121
122
123
124
125
126
127
128
129
131
132
133
134
135
136
137
138
13

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

accs = list()
for w in [8]:
    a = list()
    vid_train_set.set_window_size(w)
    vid_valid_set.set_window_size(w)
    vid_test_set.set_window_size(w)
    
    trainloader_all_v = DataLoader(vid_train_set,**params)
    validloader_all_v= DataLoader(vid_valid_set,**params)
    testloader_all_v= DataLoader(vid_test_set,**params)
    
    model, criterion, optimizer, scheduler = generate_vid_model()
    a,best_model = train_vmod(model, criterion,optimizer,scheduler,12,trainloader_all_v,validloader_all_v,device)
    torch.save(best_model.state_dict(), "./weights/best_model_vid_"+str(w)+".pth")

1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
21
22
23
24
25
26
27
28
29
31
32
33
34
35
36
37
38
39
41
42
43
44
45
46
47
48
49
51
52
53
54
55
56
57
58
59
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
81
82
83
84
85
86
87
88
89
91
92
93
94
95
96
97
98
99
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
121
122
123
124
125
126
127
128
129
131
132
133
134
135
136
137
138
139
141
142
143
144
145
146
147
148
149
151
152
153
154
155
156
157
158
159
161
162
163
164
165
166
167
168
169
171
172
173
174
175
176
177
178
179
181
new best acc
 Accuracy : 0.8015384674072266
1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
21
22
23
24
25
26
27
28
29
31
32
33
34
35
36
37
38
39
41
42
43
44
45
46
47
48
49
51
52
53
54
55
56
57
58
59
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
81
82
83
84
85
86
87
88
89
91
92
93
94
95
96
97
98
99
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
121
122
123
124
125
126
127
128
129
131
132
133
134
135
136
137
138
13

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

accs = list()
for w in [16]:
    a = list()
    vid_train_set.set_window_size(w)
    vid_valid_set.set_window_size(w)
    vid_test_set.set_window_size(w)
    
    trainloader_all_v = DataLoader(vid_train_set,**params)
    validloader_all_v= DataLoader(vid_valid_set,**params)
    testloader_all_v= DataLoader(vid_test_set,**params)
    
    model, criterion, optimizer, scheduler = generate_vid_model()
    a,best_model = train_vmod(model, criterion,optimizer,scheduler,12,trainloader_all_v,validloader_all_v,device)
    torch.save(best_model.state_dict(), "./weights/best_model_vid_"+str(w)+".pth")

In [20]:
best_model

DataParallel(
  (module): VideoResNet(
    (stem): BasicStem(
      (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
      (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (conv2): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
          (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (relu): ReLU(inplace=True)
      )
      (1): BasicBlock(
        (conv1): Sequential(
          (0): Conv3DSimple(64, 64, kernel_size=(3, 

In [23]:
import copy

In [190]:
def evaluate_softmax_fusion_video(dataloader,visual_model,audio_model):
    visual_model.to(device)
    audio_model.to(device)
    visual_model.eval()
    audio_model.eval()
    
    with torch.no_grad():
        model.eval()
        res_audio = torch.zeros((4,4))
        res_visual = torch.zeros((4,4))
        res_average = torch.zeros((4,4))
        res_smax = torch.zeros((4,4))
        res_lmax = torch.zeros((4,4))
        
        all_preds = list()
        all_labels = list()
        
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs_visual = data['clip'][:, :, ::5, :, :].to(device)
            inputs_audio = data['ms_spot'].unsqueeze(1).to(device)
            label = data['label'].to(device)

            # forward + backward + optimize
            outputs_audio = audio_model(inputs_audio)
            outputs_visual = visual_model(inputs_visual)
            
            softmax_audio = torch.softmax(outputs_audio,dim=1) # MAKE SURE DIMS ARE CORRECT, TEST
            softmax_visual = torch.softmax(outputs_visual,dim=1)
            softmax_average = (softmax_audio + softmax_visual) / 2
            
            
            logit_max = torch.max(outputs_audio,outputs_visual)
            softmax_max = torch.max(softmax_audio,softmax_visual)
            
            preds_audio = torch.argmax(outputs_audio,dim=1)
            preds_visual = torch.argmax(outputs_visual,dim=1)
            preds_average = torch.argmax(softmax_average,dim=1)
            preds_smax = torch.argmax(softmax_max,dim=1)
            preds_lmax = torch.argmax(logit_max,dim=1)
            
            all_preds.append((preds_audio,preds_visual,preds_average,preds_smax,preds_lmax))
            all_labels.append(label)

            for p,gt in zip(preds_audio,label):
                res_audio[int(p),int(gt)] += 1
            
            for p,gt in zip(preds_visual,label):
                res_visual[int(p),int(gt)] += 1

            for p,gt in zip(preds_average,label):
                res_average[int(p),int(gt)] += 1
                
            for p,gt in zip(preds_smax,label):
                res_smax[int(p),int(gt)] += 1
                
            for p,gt in zip(preds_lmax,label):
                res_lmax[int(p),int(gt)] += 1
                
                
        # Audio
        N_total_audio = res_audio.sum()
        N_correct_audio = res_audio.diag().sum()
        acc_audio = N_correct_audio / N_total_audio
        
        N_total_visual = res_visual.sum()
        N_correct_visual = res_visual.diag().sum()
        acc_visual = N_correct_visual / N_total_visual
        
        N_total_average = res_average.sum()
        N_correct_average = res_average.diag().sum()
        acc_average = N_correct_average / N_total_average
        
        N_total_smax = res_smax.sum()
        N_correct_smax = res_smax.diag().sum()
        acc_smax = N_correct_smax / N_total_smax
        
        N_total_lmax = res_lmax.sum()
        N_correct_lmax = res_lmax.diag().sum()
        acc_lmax = N_correct_lmax / N_total_lmax
        
        print(f"Audio Accuracy : {acc_audio},\
              Visual Accuracy : {acc_visual},\
              Average Accuracy : {acc_average},\
              smax Accuracy : {acc_smax},\
              lmax Accuracy : {acc_lmax}")
        
        return acc_audio,acc_visual,acc_average,acc_smax,acc_lmax,(all_labels,all_preds)
    

In [191]:
avg_wv_accs_valid_16 = list()
avg_wv_accs_test_16 = list()

for w in [16]:
    vid_valid_set.set_window_size(w)
    vid_test_set.set_window_size(w)
    
    validloader = DataLoader(vid_valid_set,**params)
    testloader = DataLoader(vid_test_set,**params)
    
    # generate models
    visual_model,_,_,_ = generate_vid_model()
    audio_model, _,_ = generate_model()
    
    # load models
    visual_model.load_state_dict(torch.load("./weights/best_model_vid_"+str(w)+".pth"))
    audio_model.load_state_dict(torch.load("./weights/best_model_audio_"+str(w)+".pth"))
    
    avg_wv_accs_valid_16.append(evaluate_softmax_fusion_video(validloader,visual_model,audio_model))
    avg_wv_accs_test_16.append(evaluate_softmax_fusion_video(testloader,visual_model,audio_model))

Audio Accuracy : 0.7389743328094482,              Visual Accuracy : 0.908717930316925,              Average Accuracy : 0.8907692432403564,              smax Accuracy : 0.8892307877540588,              lmax Accuracy : 0.8943589925765991
Audio Accuracy : 0.7255097031593323,              Visual Accuracy : 0.8990551829338074,              Average Accuracy : 0.8906016945838928,              smax Accuracy : 0.8881153464317322,              lmax Accuracy : 0.8896071314811707


In [210]:
avg_wv_accs_valid_16[0][5][1]

[(tensor([1, 2, 1, 2, 0, 0, 2, 3, 3, 0, 0, 0, 3, 0, 0, 1, 3, 1, 3, 3, 2, 3, 1, 2,
          1, 1, 2, 1, 3, 3, 3, 3], device='cuda:0'),
  tensor([1, 2, 1, 1, 2, 0, 0, 3, 0, 0, 3, 1, 3, 0, 0, 1, 0, 1, 3, 0, 2, 3, 3, 2,
          1, 1, 2, 1, 1, 0, 3, 3], device='cuda:0'),
  tensor([1, 2, 1, 1, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 1, 3, 1, 3, 0, 2, 3, 1, 2,
          1, 1, 2, 1, 3, 3, 3, 3], device='cuda:0'),
  tensor([1, 2, 1, 1, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 1, 3, 1, 3, 0, 2, 3, 1, 2,
          1, 1, 2, 1, 3, 3, 3, 3], device='cuda:0'),
  tensor([1, 2, 1, 1, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 1, 3, 1, 3, 0, 2, 3, 1, 2,
          1, 1, 2, 1, 3, 0, 3, 3], device='cuda:0')),
 (tensor([0, 1, 3, 0, 3, 3, 3, 2, 3, 3, 3, 3, 2, 0, 2, 2, 0, 2, 0, 2, 3, 1, 2, 0,
          2, 3, 0, 1, 3, 1, 0, 1], device='cuda:0'),
  tensor([0, 1, 3, 0, 3, 3, 3, 2, 1, 3, 0, 1, 2, 1, 2, 2, 1, 3, 0, 2, 3, 0, 2, 0,
          3, 3, 0, 1, 1, 3, 0, 1], device='cuda:0'),
  tensor([0, 1, 3, 0, 3, 3, 3, 2, 1, 3, 0, 1, 2, 0, 2,

In [251]:

a = 0
p_type = 1
y_p = list()
for e in avg_wv_accs_valid_16[a][5][1]:
    y_p.append(e[p_type])
y_p = torch.cat(y_p)
y_p = y_p.cpu()
l = torch.cat(avg_wv_accs_valid_16[a][5][0]).cpu()

In [252]:
print(sklearn.metrics.classification_report(target_names=['Card','Substitutuion','Goal','Background'],y_true=l,y_pred=y_p))

               precision    recall  f1-score   support

         Card       0.87      0.90      0.88       396
Substitutuion       0.96      0.93      0.94       562
         Goal       0.92      0.96      0.94       356
   Background       0.88      0.87      0.87       636

     accuracy                           0.91      1950
    macro avg       0.91      0.91      0.91      1950
 weighted avg       0.91      0.91      0.91      1950



In [249]:

a = 0
p_type = 4
y_p = list()
for e in avg_wv_accs_valid_16[a][5][1]:
    y_p.append(e[p_type])
y_p = torch.cat(y_p)
y_p = y_p.cpu()
l = torch.cat(avg_wv_accs_valid_16[a][5][0]).cpu()

In [250]:
print(sklearn.metrics.classification_report(target_names=['Card','Substitutuion','Goal','Background'],y_true=l,y_pred=y_p))

               precision    recall  f1-score   support

         Card       0.85      0.88      0.86       396
Substitutuion       0.93      0.92      0.92       562
         Goal       0.94      0.96      0.95       356
   Background       0.87      0.85      0.86       636

     accuracy                           0.89      1950
    macro avg       0.90      0.90      0.90      1950
 weighted avg       0.89      0.89      0.89      1950



In [192]:
avg_wv_accs_valid_16

[(tensor(0.7390),
  tensor(0.9087),
  tensor(0.8908),
  tensor(0.8892),
  tensor(0.8944),
  ([tensor([1, 2, 1, 1, 0, 0, 0, 3, 0, 0, 3, 3, 3, 0, 0, 1, 0, 1, 3, 0, 2, 3, 3, 2,
            1, 1, 2, 1, 3, 0, 3, 3], device='cuda:0'),
    tensor([0, 1, 3, 0, 3, 3, 3, 2, 1, 3, 0, 1, 2, 3, 2, 2, 1, 1, 0, 2, 2, 1, 2, 0,
            3, 0, 0, 1, 1, 3, 3, 1], device='cuda:0'),
    tensor([2, 3, 2, 1, 0, 3, 1, 3, 0, 1, 0, 1, 0, 2, 1, 3, 3, 2, 3, 0, 3, 2, 3, 3,
            3, 1, 3, 3, 0, 2, 1, 0], device='cuda:0'),
    tensor([3, 3, 2, 2, 1, 2, 2, 3, 3, 0, 3, 1, 2, 2, 0, 0, 1, 3, 0, 0, 2, 2, 2, 2,
            0, 3, 3, 1, 1, 0, 1, 0], device='cuda:0'),
    tensor([1, 3, 2, 2, 0, 1, 0, 0, 3, 1, 3, 3, 0, 3, 3, 1, 3, 3, 3, 0, 3, 0, 3, 2,
            3, 1, 0, 3, 2, 2, 2, 2], device='cuda:0'),
    tensor([3, 1, 2, 2, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 1, 2, 0, 1, 2, 0, 1, 1, 3, 3,
            3, 0, 2, 1, 3, 3, 0, 2], device='cuda:0'),
    tensor([3, 3, 3, 1, 0, 3, 3, 0, 1, 2, 3, 0, 3, 3, 3, 3, 1, 3, 3, 0, 0, 3

In [193]:
avg_wv_accs_test_16

[(tensor(0.7255),
  tensor(0.8991),
  tensor(0.8906),
  tensor(0.8881),
  tensor(0.8896),
  ([tensor([3, 3, 3, 1, 3, 2, 3, 2, 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 3, 3, 0, 0, 1,
            0, 1, 3, 0, 3, 1, 3, 0], device='cuda:0'),
    tensor([2, 2, 3, 3, 1, 2, 2, 0, 1, 2, 0, 3, 3, 1, 0, 3, 1, 0, 1, 1, 1, 1, 3, 3,
            2, 0, 0, 0, 2, 3, 1, 2], device='cuda:0'),
    tensor([1, 3, 2, 1, 3, 3, 0, 3, 0, 2, 0, 3, 3, 2, 3, 3, 0, 1, 0, 2, 2, 3, 0, 2,
            0, 3, 3, 3, 3, 3, 3, 0], device='cuda:0'),
    tensor([2, 0, 3, 3, 0, 1, 1, 1, 1, 3, 2, 0, 1, 3, 2, 3, 3, 1, 3, 0, 3, 2, 0, 0,
            1, 3, 1, 2, 1, 1, 2, 2], device='cuda:0'),
    tensor([2, 0, 0, 0, 1, 0, 3, 1, 0, 2, 0, 3, 3, 1, 0, 2, 1, 3, 1, 3, 3, 1, 3, 2,
            1, 1, 3, 3, 0, 3, 0, 0], device='cuda:0'),
    tensor([1, 3, 1, 3, 0, 3, 0, 2, 0, 1, 1, 0, 3, 1, 2, 1, 0, 3, 3, 1, 3, 3, 0, 1,
            0, 3, 0, 2, 3, 3, 0, 3], device='cuda:0'),
    tensor([3, 3, 0, 0, 3, 1, 1, 0, 2, 3, 1, 0, 1, 1, 1, 3, 2, 2, 0, 1, 1, 3

In [29]:
avg_wv_accs_valid = list()
avg_wv_accs_test = list()

for w in [8]:
    vid_valid_set.set_window_size(w)
    vid_test_set.set_window_size(w)
    
    validloader = DataLoader(vid_valid_set,**params)
    testloader = DataLoader(vid_test_set,**params)
    
    # generate models
    visual_model,_,_,_ = generate_vid_model()
    audio_model, _,_ = generate_model()
    
    # load models
    visual_model.load_state_dict(torch.load("./weights/best_model_vid_"+str(w)+".pth"))
    audio_model.load_state_dict(torch.load("./weights/best_model_audio_"+str(w)+".pth"))
    
    avg_wv_accs_valid.append(evaluate_softmax_fusion_video(validloader,visual_model,audio_model))
    avg_wv_accs_test.append(evaluate_softmax_fusion_video(testloader,visual_model,audio_model))

Audio Accuracy : 0.7215384840965271,              Visual Accuracy : 0.8969230651855469,              Average Accuracy : 0.8835897445678711,              smax Accuracy : 0.8789743781089783,              lmax Accuracy : 0.8589743375778198
Audio Accuracy : 0.711089015007019,              Visual Accuracy : 0.8831427097320557,              Average Accuracy : 0.877175509929657,              smax Accuracy : 0.8717055916786194,              lmax Accuracy : 0.8577821850776672


In [31]:
avg_wv_accs_valid

[(tensor(0.7215), tensor(0.8969), tensor(0.8836))]

In [32]:
avg_wv_accs_test

[(tensor(0.7111), tensor(0.8831), tensor(0.8772))]

In [17]:
%%bash
nvidia-smi

Wed Dec 16 01:50:53 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   28C    P0    49W / 350W |     13MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:BE:00.0 Off |                    0 |
| N/A   33C    P0    49W / 350W |     13MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM3...  On   | 00000000:E2:00.0 Off |                    0 |
| N/A   

# Video - audio fusion

In [24]:
def evaluate_softmax_fusion_video(dataloader,visual_model,audio_model):
    visual_model.to(device)
    audio_model.to(device)
    visual_model.eval()
    audio_model.eval()
    
    with torch.no_grad():
        model.eval()
        res_audio = torch.zeros((4,4))
        res_visual = torch.zeros((4,4))
        res_average = torch.zeros((4,4))
        
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs_audio = data['clip'][:, :, ::3, :, :].to(device)
            inputs_visual = data['resnet_spot'].unsqueeze(1).to(device)
            label = data['label'].to(device)

            # forward + backward + optimize
            outputs_audio = audio_model(inputs_audio)
            outputs_visual = visual_model(inputs_visual)
            
            softmax_audio = torch.softmax(outputs_audio,dim=1) # MAKE SURE DIMS ARE CORRECT, TEST
            softmax_visual = torch.softmax(outputs_visual,dim=1)
            softmax_average = (softmax_audio + softmax_visual) / 2
            
            
            
            preds_audio = torch.argmax(outputs_audio,dim=1)
            preds_visual = torch.argmax(outputs_visual,dim=1)
            preds_average = torch.argmax(softmax_average,dim=1)

            for p,gt in zip(preds_audio,label):
                res_audio[int(p),int(gt)] += 1
            
            for p,gt in zip(preds_visual,label):
                res_visual[int(p),int(gt)] += 1

            for p,gt in zip(preds_average,label):
                res_average[int(p),int(gt)] += 1
                
                
        # Audio
        N_total_audio = res_audio.sum()
        N_correct_audio = res_audio.diag().sum()
        acc_audio = N_correct_audio / N_total_audio
        
        N_total_visual = res_visual.sum()
        N_correct_visual = res_visual.diag().sum()
        acc_visual = N_correct_visual / N_total_visual
        
        N_total_average = res_average.sum()
        N_correct_average = res_average.diag().sum()
        acc_average = N_correct_average / N_total_average
        
        N_total_smax = res_smax.sum()
        N_correct_smax = res_smax.diag().sum()
        acc_smax = N_correct_smax / N_total_smax
        
        N_total_lmax = res_lmax.sum()
        N_correct_lmax = res_lmax.diag().sum()
        acc_lmax = N_correct_lmax / N_total_lmax
        
        print(f"Audio Accuracy : {acc_audio},\
              Visual Accuracy : {acc_visual},\
              Average Accuracy : {acc_average},\
              smax Accuracy : {acc_smax},\
              lmax Accuracy : {acc_lmax}")
        
        return acc_audio,acc_visual,acc_average,acc_smax,acc_lmax
    

In [None]:
avg_w_accs_vidvalid = list()
avg_w_accs_vidtest = list()

for w in windows:
    print(w)
    valid_set_all.set_window_size(w)
    test_set_all.set_window_size(w)
    
    validloader = DataLoader(valid_set_all,**params)
    testloader = DataLoader(test_set_all,**params)
    
    # generate models
    visual_model = Net(512,w,w//2)
    audio_model, _,_ = generate_model()
    
    # load models
    visual_model.load_state_dict(torch.load("./weights/best_model_visual_"+str(w)+".pth"))
    audio_model.load_state_dict(torch.load("./weights/best_model_audio_"+str(w)+".pth"))
    
    avg_w_accs_vidvalid.append(evaluate_softmax_fusion(validloader,visual_model,audio_model))
    avg_w_accs_vidtest.append(evaluate_softmax_fusion(testloader,visual_model,audio_model))

In [None]:
def train_model_ws_visual(model,epochs,trainloader,validloader,criterion,optimizer,scheduler,device,tensorboard_name):
    running_loss=0.0
    model.to(device)
    best_acc = 0

    writer = SummaryWriter('runs/'+tensorboard_name)

    for epoch in range(epochs):
        scheduler.step()
        for i, data in enumerate(trainloader,0):

            model.train()
            inputs = data['resnet_spot'].unsqueeze(1).to(device)
            target = data['label'].to(device)
            inputs[inputs.isnan()] = 0.0
            optimizer.zero_grad()


            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
                running_loss += loss.item()
                writer.add_scalar('training loss',
                running_loss / 2000,
                epoch * len(trainloader) + i)
                # print statistics


        # calculate accuracy
        with torch.no_grad():
            model.eval()
            res = torch.zeros((4,4))
            for i, data in enumerate(validloader, 0):
                # get the inputs; data is a list of [inputs, labels]
                inputs = data['resnet_spot'].unsqueeze(1).to(device)
                label = data['label'].to(device)

                # forward + backward + optimize
                outputs = model(inputs)

                preds = torch.argmax(outputs,dim=1)

                for p,gt in zip(preds,label):
                    res[int(p),int(gt)] += 1




            N_total = res.sum()
            N_correct = res.diag().sum()

            acc = N_correct / N_total
            if acc > best_acc:
                print("new best acc")
                best_acc = acc
                best_model = copy.deepcopy(model) 

            writer.add_scalar('accuracy validation',
                acc,
                epoch * len(validloader) + i)
            print(f" Accuracy : {acc}")
        
    print('Finished Training')
    return best_model,best_acc