In [1]:
import argparse
from easydict import EasyDict
from dataloader import AudiosetDataset
from torch.utils.data import Dataset
import torchaudio
import json
import numpy as np
import torch
import os
from torchvision import transforms

To do list:

I need to change `self.get_image` in AudiosetDataset.

Note that `fbank = self._wav2fbank(datum['wav'], None, 0)` is correct as long as `datum['wav']` points to the correct audio file.


# 1. Audio dataloader

In [2]:
# Notebook-wide options. Wrapping the mapping in an EasyDict lets later cells
# read entries as attributes (e.g. ``args.dataset``).
args = EasyDict({
    'data_train': './sample_datafiles/sample_json_subset.json',
    'label_csv': './sample_datafiles/class_labels_indices_subset.csv',
    'roll_mag_aug': False,  # use roll_mag_aug

    # for audio_conf
    'freqm': 0,  # frequency mask max length, pretraining 0
    'timem': 0,  # time mask max length, pretraining 0
    'mixup': 0,  # how many (0-1) samples need to be mixup during training
    'dataset': "audioset",  # choices=["audioset", "esc50", "speechcommands"]
    'load_video': False,
})

In [3]:
# Per-dataset lookup tables, all keyed by the dataset name stored in ``args.dataset``.
# ``target_length`` is the number of fbank rows each clip is padded/cropped to.
target_length = dict(audioset=1024, esc50=512, speechcommands=128)
# Each entry is ``[mean, std]``; the two values are split out below.
norm_stats = dict(
    audioset=[-4.2677393, 4.5689974],
    esc50=[-6.6268077, 5.358466],
    speechcommands=[-6.845978, 5.5654526],
)
multilabel_dataset = dict(audioset=True, esc50=False, k400=False, speechcommands=True)

# Configuration handed to the dataset classes defined further down.
audio_conf = dict(
    num_mel_bins=128,
    target_length=target_length[args.dataset],  # needed
    freqm=args.freqm,
    timem=args.timem,
    mixup=args.mixup,
    dataset=args.dataset,
    mode='train',
    mean=norm_stats[args.dataset][0],
    std=norm_stats[args.dataset][1],
    multilabel=multilabel_dataset[args.dataset],
    noise=False,
)

Note that dataset_json_file must have the format like this:

```
{
 "data": [
  {
   "video_id": "--00W1lcxW-WU_40.000",
   "wav": "./sample_audio/00W1lcxW-WU_40.000.wav",
   "video_path": "./sample_frames/00W1lcxW-WU_40.000/",
   "labels": "/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
  },
  {
    "video_id": "--KlsG1EnBEjc_000361",
    "wav": "./sample_audio/KlsG1EnBEjc_000361.wav",
    "video_path": "./sample_frames/KlsG1EnBEjc_000361/",
    "labels": "/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
   }
  
 ]
}
```

In [4]:
class AudioDataset(Dataset):
    """Map-style dataset that yields one filterbank (fbank) tensor per audio clip.

    The clips are listed in a JSON manifest whose top-level ``"data"`` key holds
    a list of entries with the keys ``wav``, ``labels``, ``video_id`` and
    ``video_path``. Only the ``wav`` path is used to build a sample; labels are
    carried along in ``self.data`` but are not returned.

    Every sample has shape ``[target_length, num_mel_bins]`` in both training
    and evaluation mode.
    """

    def __init__(self, dataset_json_file, audio_conf, label_csv=None, train_model = True):
        """Read the manifest and cache the audio settings.

        Args:
            dataset_json_file: path to the JSON manifest described on the class.
            audio_conf: dict-like with ``num_mel_bins``, ``target_length``,
                ``mean`` and ``std``; optional ``freqm``, ``timem``, ``noise``
                and ``skip_norm``.
            label_csv: accepted for interface compatibility; not used here.
            train_model: True enables SpecAugment masking and noise
                augmentation; False returns the normalised fbank only.
        """
        # load dataset
        self.datapath = dataset_json_file
        with open(dataset_json_file, 'r') as fp:
            data_json = json.load(fp)
        self.data = self.pro_data(data_json['data'])
        self.num_samples = self.data.shape[0]
        print('Dataset has {:d} samples'.format(self.num_samples))

        # some parameters
        self.audio_conf = audio_conf
        self.melbins = self.audio_conf.get('num_mel_bins')
        self.target_length = self.audio_conf.get('target_length')
        self.train_model = train_model
        self.norm_mean = self.audio_conf.get('mean')
        self.norm_std = self.audio_conf.get('std')
        self.freqm = self.audio_conf.get('freqm', 0)
        self.timem = self.audio_conf.get('timem', 0)
        # Format the defaulted attributes: a config without 'freqm'/'timem'
        # would otherwise hand None to '{:d}' and raise.
        print('now using following mask: {:d} freq, {:d} time'.format(self.freqm, self.timem))

        # if add noise for data augmentation
        self.noise = self.audio_conf.get('noise', False)
        if self.noise:
            print('now use noise augmentation')
        else:
            print('not use noise augmentation')

        # skip_norm disables input normalisation so that the normalisation
        # statistics themselves can be computed (src/get_norm_stats.py).
        # Set it to True ONLY when you are collecting those statistics.
        self.skip_norm = bool(self.audio_conf.get('skip_norm', False))
        if self.skip_norm:
            print('now skip normalization (use it ONLY when you are computing the normalization stats).')
        else:
            print('use dataset mean {:.3f} and std {:.3f} to normalize the input.'.format(self.norm_mean, self.norm_std))

    def __len__(self):
        """Number of manifest entries; lets a DataLoader size its sampler."""
        return self.num_samples

    # change python list to numpy array to avoid memory leak.
    def pro_data(self, data_json):
        """Pack manifest entries into a 2-D string array.

        Columns are ``wav``, ``labels``, ``video_id``, ``video_path`` in that
        order. The input list is left untouched.
        """
        rows = [[entry['wav'], entry['labels'], entry['video_id'], entry['video_path']]
                for entry in data_json]
        return np.array(rows, dtype=str)

    def decode_data(self, np_data):
        """Turn one row produced by ``pro_data`` back into a dict."""
        return {
            'wav': np_data[0],
            'labels': np_data[1],
            'video_id': np_data[2],
            'video_path': np_data[3],
        }

    def _wav2fbank(self, filename):
        """Load ``filename`` and return its fbank as ``[target_length, melbins]``.

        The waveform is mean-centred before feature extraction. Shorter clips
        are zero-padded at the end, longer ones are truncated. Errors raised by
        ``torchaudio.load`` propagate to the caller.
        """
        waveform, sr = torchaudio.load(filename)
        waveform = waveform - waveform.mean()
        try:
            fbank = torchaudio.compliance.kaldi.fbank(
                waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
                window_type='hanning', num_mel_bins=self.melbins, dither=0.0, frame_shift=10)
        except Exception:
            # Best-effort placeholder; width follows the configured mel-bin
            # count so the padding below yields the expected shape.
            fbank = torch.zeros([512, self.melbins]) + 0.01
            print('there is a loading error')

        n_frames = fbank.shape[0]
        p = self.target_length - n_frames
        # cut and pad
        if p > 0:
            fbank = torch.nn.ZeroPad2d((0, 0, 0, p))(fbank)
        elif p < 0:
            fbank = fbank[0:self.target_length, :]
        return fbank

    def __getitem__(self, index):
        """Return the (optionally augmented) normalised fbank for ``index``."""
        datum = self.decode_data(self.data[index])

        try:
            fbank = self._wav2fbank(datum['wav'])
        except Exception:
            # Keep the batch going with a constant placeholder of the right shape.
            fbank = torch.zeros([self.target_length, self.melbins]) + 0.01
            print('there is an error in loading audio')

        # SpecAug, not do for eval set
        if self.train_model and (self.freqm != 0 or self.timem != 0):
            # The masking transforms expect (..., freq, time), hence the
            # transpose + leading dim, undone again right after.
            fbank = torch.transpose(fbank, 0, 1).unsqueeze(0)
            if self.freqm != 0:
                fbank = torchaudio.transforms.FrequencyMasking(self.freqm)(fbank)
            if self.timem != 0:
                fbank = torchaudio.transforms.TimeMasking(self.timem)(fbank)
            fbank = torch.transpose(fbank.squeeze(0), 0, 1)

        # normalize the input for both training and test; skipped ONLY when
        # collecting the normalisation statistics.
        if not self.skip_norm:
            fbank = (fbank - self.norm_mean) / (self.norm_std)

        # Noise + random roll along axis 0 are training-only augmentations.
        if self.train_model and self.noise:
            fbank = fbank + torch.rand(fbank.shape[0], fbank.shape[1]) * np.random.rand() / 10
            fbank = torch.roll(fbank, np.random.randint(-self.target_length, self.target_length), 0)

        # Evaluation mode applies no transpose, so both modes return
        # [target_length, melbins].
        return fbank

In [5]:
# Build the audio-only dataset from the sample manifest. ``train_model`` is left
# at its default (True). The two commented-out keyword arguments below are not
# parameters of AudioDataset.__init__ as defined in this notebook.
dataset_train = AudioDataset(args.data_train, label_csv=args.label_csv, audio_conf=audio_conf, 
                                #roll_mag_aug=args.roll_mag_aug,
                                #load_video=args.load_video
                               )

Dataset has 2 samples
now using following mask: 0 freq, 0 time
not use noise augmentation
use dataset mean -4.268 and std 4.569 to normalize the input.


In [6]:
# Wrap the dataset in an iterator so samples can be pulled one at a time with next().
temp = iter(dataset_train)

In [7]:
# Pull the next sample from the iterator and display the shape of its fbank.
next(temp).shape

torch.Size([1024, 128])

### 1.2 one simple test to load audio data

In [8]:
def wav2fbank(filename, target_length=1024, num_mel_bins=128):
    """Load an audio file and return a fixed-length fbank tensor.

    Args:
        filename: path passed to ``torchaudio.load``.
        target_length: number of rows in the result. Shorter clips are
            zero-padded at the end, longer ones are truncated.
        num_mel_bins: number of mel bins requested from the fbank computation.

    Returns:
        Tensor whose first dimension is ``target_length``; the remaining
        dimension comes from ``torchaudio.compliance.kaldi.fbank``.
    """
    waveform, sr = torchaudio.load(filename)
    # Remove the DC offset before feature extraction.
    waveform = waveform - waveform.mean()
    fbank = torchaudio.compliance.kaldi.fbank(
        waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
        window_type='hanning', num_mel_bins=num_mel_bins, dither=0.0, frame_shift=10)

    n_frames = fbank.shape[0]
    p = target_length - n_frames
    # cut and pad
    if p > 0:
        # (left, right, top, bottom): only append rows at the bottom.
        fbank = torch.nn.ZeroPad2d((0, 0, 0, p))(fbank)
    elif p < 0:
        fbank = fbank[0:target_length, :]
    return fbank
    
# Run the standalone helper on one of the sample clips and display the tensor.
# The trailing all-zero rows in the recorded output are the padding added by wav2fbank.
path = "./sample_audio/00W1lcxW-WU_40.000.wav"
fbank = wav2fbank(path)
fbank 

tensor([[-10.6136,  -9.0683,  -5.6249,  ...,  -5.7625,  -6.3989,  -7.7529],
        [ -8.2543,  -8.9011,  -5.4577,  ...,  -6.8562,  -6.2021,  -7.6635],
        [ -7.8870,  -9.3728,  -5.9294,  ...,  -2.8646,  -2.8981,  -4.4085],
        ...,
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]])

# 2. Video dataloader

Do we need this part?
```
## apply different color jittering for each frame in the video clip
        trans_clip_cj = []
        for frame in trans_clip:
            frame = self.toPIL(frame)  # PIL image
            frame = self.color_jitter_(frame)  # tensor [C x H x W]
            frame = np.array(frame)
            trans_clip_cj.append(frame)
```

In [9]:
import random
import numpy as np
import cv2


class RandomHorizontalFlip(object):
    """Reverse a clip along axis 2 with probability ``p``.

    A single random draw decides for the whole clip, so either every frame is
    flipped or none is. The original note describes the layout as t x h x w.
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, video_clip):
        keep_as_is = not (random.random() < self.p)
        if keep_as_is:
            # Hand back the very same object when no flip happens.
            return video_clip
        # np.flip yields a view; copy() materialises the reversed data.
        mirrored = np.flip(video_clip, axis=2)
        return mirrored.copy()


class RandomCrop(object):
    """Cut the same randomly placed window out of every frame of a clip.

    ``output_size`` is either an int (square window) or a ``(height, width)``
    tuple. The clip is indexed as ``[:, rows, cols, :]``.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if not isinstance(output_size, int):
            assert len(output_size) == 2
            self.output_size = output_size
        else:
            self.output_size = (output_size, output_size)

    def __call__(self, video_clip):
        frame_h, frame_w = video_clip.shape[1:3]
        crop_h, crop_w = self.output_size
        # randint is inclusive on both ends; row offset is drawn before the column offset.
        top = random.randint(0, frame_h - crop_h)
        left = random.randint(0, frame_w - crop_w)
        rows = slice(top, top + crop_h)
        cols = slice(left, left + crop_w)
        return video_clip[:, rows, cols, :]


class CenterCrop(object):
    """Cut the central window out of every frame of a clip.

    ``output_size`` is either an int (square window) or a ``(height, width)``
    tuple. The clip is indexed as ``[:, rows, cols, :]``.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if not isinstance(output_size, int):
            assert len(output_size) == 2
            self.output_size = output_size
        else:
            self.output_size = (output_size, output_size)

    def __call__(self, video_clip):
        frame_h, frame_w = video_clip.shape[1:3]
        crop_h, crop_w = self.output_size
        # Half of the spare margin, truncated towards zero.
        top = int((frame_h - crop_h) / 2)
        left = int((frame_w - crop_w) / 2)
        rows = slice(top, top + crop_h)
        cols = slice(left, left + crop_w)
        return video_clip[:, rows, cols, :]

class ClipResize(object):
    """Resize every frame of a clip with ``cv2.resize`` and restack the result.

    ``output_size`` is either an int (square target) or a ``(height, width)``
    tuple.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if not isinstance(output_size, int):
            assert len(output_size) == 2
            self.output_size = output_size
        else:
            self.output_size = (output_size, output_size)

    def __call__(self, video_clip):
        target_h, target_w = self.output_size
        # The size tuple is handed to cv2.resize as (width, height).
        resized_frames = [cv2.resize(frame, (target_w, target_h)) for frame in video_clip]
        return np.array(resized_frames)

class ToTensor(object):
    """
    Move the channel axis to the front:
    D x H x W x C ---> C x D x H x W

    Only the axis order changes; the value comes straight from ``np.transpose``.
    """
    def __init__(self):
        super().__init__()

    def __call__(self, sample):
        channel_first_order = (3, 0, 1, 2)
        return np.transpose(sample, channel_first_order)

Note that dataset_json_file must have the format like this:

```
{
 "data": [
  {
   "video_id": "--00W1lcxW-WU_40.000",
   "wav": "./sample_audio/00W1lcxW-WU_40.000.wav",
   "video_path": "./sample_frames/00W1lcxW-WU_40.000/",
   "labels": "/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
  },
  {
    "video_id": "--KlsG1EnBEjc_000361",
    "wav": "./sample_audio/KlsG1EnBEjc_000361.wav",
    "video_path": "./sample_frames/KlsG1EnBEjc_000361/",
    "labels": "/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
   }
  
 ]
}
```

In [10]:
class VideoDataset(Dataset):
    """Map-style dataset that yields one frame clip per manifest entry.

    The manifest is a JSON file whose top-level ``"data"`` key holds a list of
    entries with the keys ``wav``, ``labels``, ``video_id`` and ``video_path``.
    Only ``video_path`` is used to build a sample: it must be a directory
    containing ``frame_00.jpg``, ``frame_01.jpg``, ...

    Each sample is a numpy array laid out as ``(C, T, H, W)``.
    """

    def __init__(self, dataset_json_file, clip_len = 16, transforms_=None, color_jitter_=None):
        """Read the manifest and store the clip settings.

        Args:
            dataset_json_file: path to the JSON manifest described on the class.
            clip_len: number of consecutive frames loaded per sample.
            transforms_: optional callable applied to the ``(T, H, W, C)`` clip.
            color_jitter_: stored but currently not applied to the frames.
        """
        # load dataset
        self.datapath = dataset_json_file
        with open(dataset_json_file, 'r') as fp:
            data_json = json.load(fp)
        self.data = self.pro_data(data_json['data'])
        self.num_samples = self.data.shape[0]
        print('Dataset has {:d} samples'.format(self.num_samples))

        # some parameters
        self.clip_len = clip_len
        self.toPIL = transforms.ToPILImage()
        self.transforms_ = transforms_
        self.color_jitter_ = color_jitter_

    def __len__(self):
        """Number of manifest entries; lets a DataLoader size its sampler."""
        return self.num_samples

    # change python list to numpy array to avoid memory leak.
    def pro_data(self, data_json):
        """Pack manifest entries into a 2-D string array.

        Columns are ``wav``, ``labels``, ``video_id``, ``video_path`` in that
        order. The input list is left untouched.
        """
        rows = [[entry['wav'], entry['labels'], entry['video_id'], entry['video_path']]
                for entry in data_json]
        return np.array(rows, dtype=str)

    def decode_data(self, np_data):
        """Turn one row produced by ``pro_data`` back into a dict."""
        return {
            'wav': np_data[0],
            'labels': np_data[1],
            'video_id': np_data[2],
            'video_path': np_data[3],
        }

    def _loop_load_rgb(self, video_path, clip_len):
        """Read ``clip_len`` frames from ``video_path`` and stack them.

        Raises:
            FileNotFoundError: if a frame cannot be read. ``cv2.imread``
                signals that by returning None, which would otherwise be
                stacked silently and fail later with an unrelated error.
        """
        # NOTE(review): cv2.imread is called without a colour conversion, so the
        # channel order is whatever OpenCV delivers -- confirm it matches what
        # the downstream model expects despite the "rgb" in the method name.
        video_clip = []
        for i in range(clip_len):
            cur_img_path = os.path.join(video_path, "frame_" + "{:02}.jpg".format(i))
            img = cv2.imread(cur_img_path)
            if img is None:
                raise FileNotFoundError('could not read video frame: {}'.format(cur_img_path))
            video_clip.append(img)
        return np.array(video_clip)

    def __getitem__(self, index):
        """Return the clip for ``index`` as a ``(C, T, H, W)`` numpy array."""
        datum = self.decode_data(self.data[index])
        rgb_clip = self._loop_load_rgb(datum['video_path'], self.clip_len)

        if self.transforms_ is not None:
            trans_clip = self.transforms_(rgb_clip)
        else:
            trans_clip = rgb_clip

        # Per-frame colour jitter (self.toPIL / self.color_jitter_) is
        # intentionally not applied at the moment.
        # (T, H, W, C) -> (C, T, H, W)
        return np.array(trans_clip).transpose(3, 0, 1, 2)
        

In [11]:
# Smoke test for the video dataset: no transforms, 16 frames per clip.
# NOTE: this rebinds ``dataset_train`` and ``temp``, which the audio cells above
# also use, so the order in which cells are run matters.
dataset_train = VideoDataset(args.data_train, clip_len = 16)
temp = iter(dataset_train)
next(temp).shape

Dataset has 2 samples


(3, 16, 224, 224)

### 2.2 one simple test to load video data

In [12]:
# Inputs for the manual frame-loading check below.
# ``video_id`` is not read by the other cells visible in this notebook.
video_id = "--00W1lcxW-WU_40.000"
video_path = "./sample_frames/00W1lcxW-WU_40.000/"
clip_len = 16

In [13]:
def loop_load_rgb(video_path, clip_len):
    """Read ``clip_len`` consecutive frames from a directory and stack them.

    Frames are expected to be named ``frame_00.jpg``, ``frame_01.jpg``, ...
    inside ``video_path``.

    Args:
        video_path: directory holding the extracted frames.
        clip_len: number of frames to read, starting at index 0.

    Returns:
        numpy array with the frames stacked along a new first axis.

    Raises:
        FileNotFoundError: if a frame cannot be read. ``cv2.imread`` signals
            that by returning None, which would otherwise be stacked silently.
    """
    video_clip = []
    for i in range(clip_len):
        cur_img_path = os.path.join(video_path, "frame_" + "{:02}.jpg".format(i))
        img = cv2.imread(cur_img_path)
        if img is None:
            raise FileNotFoundError('could not read video frame: {}'.format(cur_img_path))
        video_clip.append(img)
    return np.array(video_clip)

In [14]:
# Load the sample clip with the standalone helper and show its shape
# (the recorded output below is (16, 224, 224, 3)).
video_clip = loop_load_rgb(video_path, clip_len)
video_clip.shape

(16, 224, 224, 3)

# 3. Audio-Video dataloader

In [15]:
class AudioVideoDataset(Dataset):
    """Map-style dataset that yields ``(fbank, clip)`` for every manifest entry.

    The manifest is a JSON file whose top-level ``"data"`` key holds a list of
    entries with the keys ``wav``, ``labels``, ``video_id`` and ``video_path``.

    * ``fbank`` is a tensor of shape ``[target_length, num_mel_bins]`` in both
      training and evaluation mode.
    * ``clip`` is a numpy array laid out as ``(C, T, H, W)``, built from
      ``frame_00.jpg``, ``frame_01.jpg``, ... inside ``video_path``.
    """

    def __init__(self, dataset_json_file, audio_conf, label_csv=None, train_model = True,
                 clip_len = 16, transforms_=None, color_jitter_=None):
        """Read the manifest and cache the audio and video settings.

        Args:
            dataset_json_file: path to the JSON manifest described on the class.
            audio_conf: dict-like with ``num_mel_bins``, ``target_length``,
                ``mean`` and ``std``; optional ``freqm``, ``timem``, ``noise``
                and ``skip_norm``.
            label_csv: accepted for interface compatibility; not used here.
            train_model: True enables SpecAugment masking and noise
                augmentation on the audio; False returns the normalised fbank.
            clip_len: number of consecutive frames loaded per sample.
            transforms_: optional callable applied to the ``(T, H, W, C)`` clip.
            color_jitter_: stored but currently not applied to the frames.
        """
        # load dataset
        self.datapath = dataset_json_file
        with open(dataset_json_file, 'r') as fp:
            data_json = json.load(fp)
        self.data = self.pro_data(data_json['data'])
        self.num_samples = self.data.shape[0]
        print('Dataset has {:d} samples'.format(self.num_samples))

        # some parameters for audio
        self.audio_conf = audio_conf
        self.melbins = self.audio_conf.get('num_mel_bins')
        self.target_length = self.audio_conf.get('target_length')
        self.train_model = train_model
        self.norm_mean = self.audio_conf.get('mean')
        self.norm_std = self.audio_conf.get('std')
        self.freqm = self.audio_conf.get('freqm', 0)
        self.timem = self.audio_conf.get('timem', 0)
        # Format the defaulted attributes: a config without 'freqm'/'timem'
        # would otherwise hand None to '{:d}' and raise.
        print('now using following mask: {:d} freq, {:d} time'.format(self.freqm, self.timem))

        # some parameters for video
        self.clip_len = clip_len
        self.toPIL = transforms.ToPILImage()
        self.transforms_ = transforms_
        self.color_jitter_ = color_jitter_

        # if add noise for data augmentation
        self.noise = self.audio_conf.get('noise', False)
        if self.noise:
            print('now use noise augmentation')
        else:
            print('not use noise augmentation')

        # skip_norm disables input normalisation so that the normalisation
        # statistics themselves can be computed (src/get_norm_stats.py).
        # Set it to True ONLY when you are collecting those statistics.
        self.skip_norm = bool(self.audio_conf.get('skip_norm', False))
        if self.skip_norm:
            print('now skip normalization (use it ONLY when you are computing the normalization stats).')
        else:
            print('use dataset mean {:.3f} and std {:.3f} to normalize the input.'.format(self.norm_mean, self.norm_std))

    def __len__(self):
        """Number of manifest entries; lets a DataLoader size its sampler."""
        return self.num_samples

    # change python list to numpy array to avoid memory leak.
    def pro_data(self, data_json):
        """Pack manifest entries into a 2-D string array.

        Columns are ``wav``, ``labels``, ``video_id``, ``video_path`` in that
        order. The input list is left untouched.
        """
        rows = [[entry['wav'], entry['labels'], entry['video_id'], entry['video_path']]
                for entry in data_json]
        return np.array(rows, dtype=str)

    def decode_data(self, np_data):
        """Turn one row produced by ``pro_data`` back into a dict."""
        return {
            'wav': np_data[0],
            'labels': np_data[1],
            'video_id': np_data[2],
            'video_path': np_data[3],
        }

    def _wav2fbank(self, filename):
        """Load ``filename`` and return its fbank as ``[target_length, melbins]``.

        The waveform is mean-centred before feature extraction. Shorter clips
        are zero-padded at the end, longer ones are truncated. Errors raised by
        ``torchaudio.load`` propagate to the caller.
        """
        waveform, sr = torchaudio.load(filename)
        waveform = waveform - waveform.mean()
        try:
            fbank = torchaudio.compliance.kaldi.fbank(
                waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
                window_type='hanning', num_mel_bins=self.melbins, dither=0.0, frame_shift=10)
        except Exception:
            # Best-effort placeholder; width follows the configured mel-bin
            # count so the padding below yields the expected shape.
            fbank = torch.zeros([512, self.melbins]) + 0.01
            print('there is a loading error')

        n_frames = fbank.shape[0]
        p = self.target_length - n_frames
        # cut and pad
        if p > 0:
            fbank = torch.nn.ZeroPad2d((0, 0, 0, p))(fbank)
        elif p < 0:
            fbank = fbank[0:self.target_length, :]
        return fbank

    def _loop_load_rgb(self, video_path, clip_len):
        """Read ``clip_len`` frames from ``video_path`` and stack them.

        Raises:
            FileNotFoundError: if a frame cannot be read. ``cv2.imread``
                signals that by returning None, which would otherwise be
                stacked silently and fail later with an unrelated error.
        """
        # NOTE(review): cv2.imread is called without a colour conversion, so the
        # channel order is whatever OpenCV delivers -- confirm it matches what
        # the downstream model expects despite the "rgb" in the method name.
        video_clip = []
        for i in range(clip_len):
            cur_img_path = os.path.join(video_path, "frame_" + "{:02}.jpg".format(i))
            img = cv2.imread(cur_img_path)
            if img is None:
                raise FileNotFoundError('could not read video frame: {}'.format(cur_img_path))
            video_clip.append(img)
        return np.array(video_clip)

    def _load_audio(self, datum):
        """Build the (optionally augmented) normalised fbank for one entry."""
        try:
            fbank = self._wav2fbank(datum['wav'])
        except Exception:
            # Keep the batch going with a constant placeholder of the right shape.
            fbank = torch.zeros([self.target_length, self.melbins]) + 0.01
            print('there is an error in loading audio')

        # SpecAug, not do for eval set
        if self.train_model and (self.freqm != 0 or self.timem != 0):
            # The masking transforms expect (..., freq, time), hence the
            # transpose + leading dim, undone again right after.
            fbank = torch.transpose(fbank, 0, 1).unsqueeze(0)
            if self.freqm != 0:
                fbank = torchaudio.transforms.FrequencyMasking(self.freqm)(fbank)
            if self.timem != 0:
                fbank = torchaudio.transforms.TimeMasking(self.timem)(fbank)
            fbank = torch.transpose(fbank.squeeze(0), 0, 1)

        # normalize the input for both training and test; skipped ONLY when
        # collecting the normalisation statistics.
        if not self.skip_norm:
            fbank = (fbank - self.norm_mean) / (self.norm_std)

        # Noise + random roll along axis 0 are training-only augmentations.
        if self.train_model and self.noise:
            fbank = fbank + torch.rand(fbank.shape[0], fbank.shape[1]) * np.random.rand() / 10
            fbank = torch.roll(fbank, np.random.randint(-self.target_length, self.target_length), 0)

        # Evaluation mode applies no transpose, so both modes return
        # [target_length, melbins].
        return fbank

    def _load_video(self, datum):
        """Build the ``(C, T, H, W)`` clip for one entry."""
        rgb_clip = self._loop_load_rgb(datum['video_path'], self.clip_len)

        if self.transforms_ is not None:
            trans_clip = self.transforms_(rgb_clip)
        else:
            trans_clip = rgb_clip

        # Per-frame colour jitter (self.toPIL / self.color_jitter_) is
        # intentionally not applied at the moment.
        # (T, H, W, C) -> (C, T, H, W)
        return np.array(trans_clip).transpose(3, 0, 1, 2)

    def __getitem__(self, index):
        """Return ``(fbank, clip)`` for ``index``; audio is loaded first."""
        datum = self.decode_data(self.data[index])
        # part 1: get audio
        fbank = self._load_audio(datum)
        # part 2: get video
        clip = self._load_video(datum)
        return fbank, clip

In [147]:
# Build the joint audio-video dataset (train_model and transforms_ keep their defaults).
# NOTE: rebinds ``dataset_train`` once more, replacing the VideoDataset from the earlier cell.
dataset_train = AudioVideoDataset(args.data_train, clip_len = 16,audio_conf=audio_conf)

Dataset has 2 samples
now using following mask: 0 freq, 0 time
not use noise augmentation
use dataset mean -4.268 and std 4.569 to normalize the input.


In [149]:
# Pull one (fbank, clip) pair from the joint dataset and display both shapes.
temp = iter(dataset_train)
fbank, trans_clip_cj = next(temp)
fbank.shape, trans_clip_cj.shape

(torch.Size([1024, 128]), (3, 16, 224, 224))