# Data Exploration

In [1]:
import numpy as np
import pandas as pd

import bom1.bom1 as bom1
import bom1.wakeword as wf

import torch
import torchaudio
import torchaudio.transforms as T
import os

import IPython.display as ipd

In [None]:
# Request only the 'train' split with the 'every25' balance setting; the other
# two returned values are discarded.
train, _, _ = wf.get_balanced_splits(balance='every25', splits=['train'], cliplength=1)

In [None]:
# One '<ID>.wav' path inside the lectures folder per entry of train['ID'].
# Note that `path` is a list here; later cells rebind it to a single string.
path = [os.path.join('/work3/s164419/01005WakeWordData/lectures', f'{x}.wav') for x in train['ID'].tolist()]

In [2]:
# Build all three splits with the '1:1' balance setting.
train, val, test = wf.get_balanced_splits(balance='1:1', splits=['train', 'val', 'test'], cliplength=1)

Generating negative classes for train: 100%|███████████████████████| 290/290 [00:06<00:00, 44.60it/s]
Generating negative classes for val: 100%|███████████████████████████| 47/47 [00:01<00:00, 31.50it/s]
Generating negative classes for test: 100%|██████████████████████████| 73/73 [00:01<00:00, 47.90it/s]


In [6]:
# 1.1 divided by the total number of rows across the three splits.
1.1 / sum(split.shape[0] for split in (train, val, test))

9.040105193951349e-05

In [10]:
# Rebuild train and val with the 'every50' balance setting.
# NOTE(review): only 'train' and 'val' are requested but three names are unpacked,
# so `test` is rebound to whatever the third return value is — confirm nothing
# below expects the earlier '1:1' test split.
train, val, test = wf.get_balanced_splits(balance='every50', splits=['train', 'val'], cliplength=1)

Generating negative classes for train: 100%|███████████████████████| 290/290 [00:09<00:00, 29.10it/s]
Generating negative classes for val: 100%|███████████████████████████| 47/47 [00:01<00:00, 28.27it/s]


In [11]:
# The literal is the value printed by the earlier `1.1 / (...)` cell (computed on
# the '1:1' splits); here it is scaled by the row count of the current train + val.
(train.shape[0] + val.shape[0]) * 9.040105193951349e-05

154.56798980933598

In [None]:
path = '/work3/s164419/01005WakeWordData/wup/train/'
path += os.listdir('/work3/s164419/01005WakeWordData/wup/train/')[0]

transforms = [wf.TransformMono(), wf.Padder(22050)]

%timeit wf.load_data(path, f = T.Spectrogram(), transforms = transforms, normalize = True, t1 = 0, t2 = 1)

In [None]:
# Folder with the lecture recordings; a later cell joins it with '<ID>.wav'.
lectures = '/work3/s164419/01005WakeWordData/lectures/'
# Disabled: would build the path of every file in the folder.
#full_paths = [os.path.join(lectures, x) for x in os.listdir(lectures)]

In [None]:
# Take ID, t1 and t2 from the first row of `train`, then load that [t1, t2]
# range from the matching lecture file. Relies on `lectures` and `transforms`
# from earlier cells.
ID = train['ID'].iloc[0]
t1 = train['t1'].iloc[0]
t2 = train['t2'].iloc[0]

path = os.path.join(lectures, f'{ID}.wav')
audio, sr, x = wf.load_data(
    path,
    f = T.Spectrogram(),
    transforms = transforms,
    normalize = True,
    t1 = t1,
    t2 = t2,
)

In [None]:
# Display an audio widget for `audio` at sample rate `sr` (both come from the wf.load_data call above).
ipd.Audio(audio, rate=sr)

In [None]:
a = None
b = None

# With both values set to None the guard below always fires. Catch the error and
# print its message so the check stays visible without aborting a
# Restart & Run All of the notebook.
try:
    assert not ((a is None) and (b is None)), 'Both a and b cannot be none.'
except AssertionError as err:
    print(f'AssertionError: {err}')

# Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class WakewordDataset(Dataset):
    '''
    Construct a dataset with sound files.

    Every entry of ``folder`` is one sample. Per-file metadata (ID, t1, t2, target)
    is taken from ``wf.info_from_path`` applied to the file name; the audio itself
    is only read in ``__getitem__`` through ``wf.load_data``.

    Parameters
    ----------
    folder : str
        Directory whose entries are the samples.
    f : callable
        The function that takes audio and returns the spectrogram. Forwarded to
        ``wf.load_data``.
    transforms : list or None
        Forwarded unchanged to ``wf.load_data``.
    sr : int
        Nominal sample rate, stored as ``self.sr``. It is not passed to
        ``wf.load_data``; the ``sr`` returned by ``__getitem__`` is the one that
        ``wf.load_data`` reports.
    normalize : bool
        Forwarded to ``wf.load_data``.
    '''

    def __init__(self, folder, f, transforms=None, sr = 22050, normalize = False):
        # List the folder exactly once so that paths and parsed metadata always
        # describe the same files in the same order.
        filenames   = os.listdir(folder) #Already here, it's shuffled (OS listing order).

        self.paths  = [os.path.join(folder, x) for x in filenames]

        # info_from_path receives the bare file name, not the joined path.
        folderinfo  = [wf.info_from_path(x) for x in filenames]
        self.ID     = [x[0] for x in folderinfo]
        self.t1     = [x[1] for x in folderinfo]
        self.t2     = [x[2] for x in folderinfo]
        self.target = [x[3] for x in folderinfo]

        self.transforms = transforms
        self.f          = f
        self.sr         = sr
        self.normalize  = normalize

    def __len__(self):
        '''Number of files found in the folder.'''
        return len(self.paths)

    def __getitem__(self, idx):
        '''
        Load sample ``idx``.

        Returns the tuple ``(audio, sr, x, target, path)`` where the first three
        values are what ``wf.load_data`` returns for the file.
        '''
        path            = self.paths[idx]
        audio, sr, x    = wf.load_data(path, f = self.f, transforms=self.transforms, normalize=self.normalize)
        target          = self.target[idx]
        return audio, sr, x, target, path

In [None]:
# Transforms are handed to wf.load_data in this list order:
# AudioAugment, then TransformMono, then Padder.
train_transforms = [
    wf.AudioAugment(reverb = 100, snr = 15, pitch = 150, p = [0.5, 0.5, 0.5]),
    wf.TransformMono(),
    wf.Padder(44000),
]

train_dataset = WakewordDataset(
    folder='./1_to_1/train/',
    f = T.Spectrogram(),
    normalize=True, #normalize the audio when reading it with torchaudio.
    transforms = train_transforms,
)

In [None]:
# Fetch the first sample through the normal indexing protocol.
train_dataset[0]

In [None]:
# Links table; the following cells read its 'link' and 'ID' columns.
df = pd.read_csv('./csv/links.csv')

In [None]:
# All values of the 'link' column as a plain list.
url = df['link'].tolist()

In [None]:
# Output file name per row: the row's ID with '.wav' appended (no folder prefix here).
outpath = (df['ID'] + '.wav').tolist()

In [None]:
# Preview the first two entries of each list, one list per line.
print(url[:2], outpath[:2], sep='\n')

In [None]:
import bom1.bom1 as bom1
import bom1.wakeword as wf

In [None]:
# '1:1' balance with a fixed seed; only the 'train' split is requested.
# NOTE(review): `val` and `test` are still unpacked and therefore rebound to
# whatever the call returns in those positions — confirm that is intended.
train, val, test = wf.get_balanced_splits(balance='1:1', seed=42, splits = ['train'])

In [None]:
# Single-item call to bom1.download: one ID and one target path.
# Presumably this fetches lecture '0_017ci9cj' into 'bund.wav' — confirm against bom1.download.
bom1.download(['0_017ci9cj'], ['/work3/s164419/01005WakeWordData/lectures/bund.wav'], desc='Downloading lectures')

In [None]:
import pandas as pd

# Reload the links table (same file as read earlier in the notebook).
df = pd.read_csv('./csv/links.csv')

In [None]:
# Pull both columns out of the links table as plain lists.
link, ID = (df[column].tolist() for column in ('link', 'ID'))

# One '<ID>.wav' target inside the lectures folder per ID.
outpath = [
    f'/work3/s164419/01005WakeWordData/lectures/{x}.wav'
    for x in ID
]

In [None]:
import bom1.bom1 as bom1

In [None]:
# Pass a fixed ID to bom1.stream_link; the cell's output is whatever that call returns.
bom1.stream_link('0_0awr6vwy')

In [None]:
import bom1.wakeword as wf
# Keep only the third value returned by wf.get_splits() and bind it to `train`.
# NOTE(review): if the return order is (train, val, test), this is the test
# split under the name `train` — confirm.
_, _, train = wf.get_splits()

In [None]:
# Pass the sixth distinct ID (index 5 of train['ID'].unique()) to bom1.stream_link.
bom1.stream_link(train['ID'].unique()[5])

# Problem with torchaudio.load and durations

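For some reason, the durations we get from torchaudio (`num_frames / sample_rate`, as reported by `torchaudio.info`) do not match the durations of the corresponding videos. The cells below measure the difference for each lecture.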

In [None]:
import numpy as np
import pandas as pd

import bom1.bom1 as bom1
import bom1.wakeword as wf

import torch
import torchaudio
import torchaudio.transforms as T
import os

import IPython.display as ipd

In [None]:
# Reference durations; the next cell iterates over .keys() and indexes it by
# lecture ID, so it is a mapping from ID to duration.
durations = wf.lecture_durations()

In [None]:
# For every lecture, the absolute gap between the reference duration and the
# length implied by the WAV metadata (num_frames / sample_rate).
diffs = {}
for ID, expected in durations.items():
    meta     = torchaudio.info(f'/work3/s164419/01005WakeWordData/lectures/{ID}.wav')
    measured = meta.num_frames / meta.sample_rate
    diffs[ID] = np.abs(measured - expected)

# One row per lecture: transpose the single-row frame, then move the IDs out of
# the index into a regular column.
diffs = pd.DataFrame(diffs, index=[0]).T.reset_index()

In [None]:
# Name the two columns left by reset_index(): the former index holds the
# lecture IDs, the other column holds the absolute differences.
diffs.columns = ['ID', 'difference']

In [None]:
# Largest mismatch first.
diffs = diffs.sort_values(by='difference', ascending=False)

In [None]:

# Number of lectures whose difference exceeds 0.05.
(diffs['difference'] > 0.05).sum()

In [None]:
halfclip = 1

links = pd.read_csv('./csv/links.csv')

# Replace the single timestamp `t` by the window [t - halfclip, t + halfclip].
data = (
    pd.read_csv('./csv/data.csv')
    .assign(
        t1=lambda frame: frame['t'] - halfclip,
        t2=lambda frame: frame['t'] + halfclip,
    )
    .drop(columns='t')
)

# Number of `data` rows per (semester, ID), inner-joined onto the links table,
# plus the running share of the total in the row order the merge produces.
counts = pd.merge(
    links[['semester', 'title', 'ID']],
    data.groupby(['semester', 'ID']).size().reset_index(name='n'),
    how='inner',
    on=['semester', 'ID'],
).assign(percentage=lambda merged: merged['n'].cumsum() / merged['n'].sum())

In [None]:
# Reference durations, indexed by lecture ID in the cell that fills counts['d1'].
durations = wf.lecture_durations()

In [None]:
def d(ID):
    '''
    Length of the lecture recording ``ID`` as implied by its file metadata.

    Calls ``torchaudio.info`` on ``{ID}.wav`` in the lectures folder and returns
    ``num_frames / sample_rate``.
    '''
    wav_path     = f'/work3/s164419/01005WakeWordData/lectures/{ID}.wav'
    meta         = torchaudio.info(wav_path)
    frames, rate = meta.num_frames, meta.sample_rate
    return frames / rate

In [None]:
# d1: reference duration looked up by ID; d2: duration derived from the WAV metadata via d().
counts['d1'] = counts['ID'].apply(lambda lecture_id: durations[lecture_id])
counts['d2'] = counts['ID'].apply(d)

# Show the lectures where the two durations differ by less than 1.
counts.loc[(counts['d2'] - counts['d1']).abs() < 1]

In [1]:
import os

In [4]:
# Count the entries of the every50_1s train folder without building the full name list.
sum(1 for entry in os.scandir('/work3/s164419/01005WakeWordData/every50_1s/train/'))

1442276