In [None]:
# Standard libraries
import json
import math
import os
import random
from typing import (Any, Dict, List, Tuple)

# Third party libraries
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.io.wavfile
import scipy.signal
import sklearn
import sklearn.tree

# Report library versions so a run can be tied to a specific environment.
print('IPython.__version__ = %s' % IPython.__version__)
print('matplotlib.__version__ = %s' % matplotlib.__version__)
print('numpy.__version__ = %s' % np.__version__)
print('pandas.__version__ = %s' % pd.__version__)
print('scipy.__version__ = %s' % scipy.__version__)
print('sklearn.__version__ = %s' % sklearn.__version__)

# Parse the seed once so the printed value and the value actually used cannot
# drift apart. Only the stdlib `random` module is seeded here.
SEED = int("9fe3ddf4da76a6", 16)
print('\nseeding random with %d\n' % SEED)
random.seed(SEED)

# /proc/<pid>/status is only available on Linux; skip the process dump on
# other platforms instead of aborting the whole cell with FileNotFoundError.
status_path = '/proc/%d/status' % os.getpid()
if os.path.exists(status_path):
    with open(status_path, 'rb') as f:
        print(f.read().decode('utf-8'))
else:
    print('%s not available on this platform' % status_path)

In [None]:
# Raw audio: ../audios/*.wav
# Labels: ../outputs/*.json --> ../labels/*.tsv (NOTE: readData below opens ../tsvs/*.tsv -- confirm which directory is current)
# Each entry of the audio directory contributes the part of its name that
# precedes the first dot.
youtube_ids = {name.split('.')[0] for name in os.listdir('../audios/')}
training_files = {
    'wr2sVPTacTE', # kelly, wife or dog
}
eval_files = {
    'TjUXr560Gu0', # dhoom taana
}

# sampling? first-N-seconds? last-N-seconds? specific time-window?
# lazy loading? mono-channel?

def sampleData(rate: int, data: np.ndarray, seconds: int = 10) -> np.ndarray:
    """Return the leading `seconds` seconds of the first channel of `data`.

    rate: sampling rate in samples per second.
    data: audio samples, either 1-D (mono, shape [n_samples]) or 2-D
        (shape [n_samples, n_channels]).
    seconds: length of the leading window to keep. Defaults to 10, the value
        that used to be hard-coded.
    return: 1-D np.ndarray holding at most `rate * seconds` samples.
    """
    limit = int(rate * seconds)  # rate (samples/second) * seconds -> num samples
    if data.ndim == 1:
        # Mono input has no channel axis; `data[:limit, 0]` would raise IndexError.
        return data[:limit]
    return data[:limit, 0]


def labelsFromUtterances(utterances: List[Dict[str, Any]], windows_per_second: int, n_rows: int) -> Tuple[int, np.ndarray]:
    """Build a per-window 0/1 label vector from utterance time spans.

    utterances: dicts with numeric 'start' and 'end' entries. Both are
        multiplied by `windows_per_second` to obtain window indices.
    windows_per_second: number of label rows per unit of 'start' / 'end'.
    n_rows: length of the returned label vector.
    return: (positive_examples, labels) where labels is
        np.ndarray[dtype=float64, shape=[n_rows]] holding 0.0 or 1.0, and
        positive_examples is the number of rows labelled 1. A row covered by
        several overlapping utterances is counted once.
    """
    labels = np.zeros(n_rows)
    for item in utterances:
        # math.ceil rounds up to the next window boundary for labeling
        start_i = int(math.ceil(item['start'] * windows_per_second))
        end_i = int(math.ceil(item['end'] * windows_per_second))
        # Clamp to [0, n_rows]: a negative index would otherwise wrap around
        # and label rows at the end of the array.
        start_i = max(start_i, 0)
        end_i = min(end_i, n_rows)
        if start_i < end_i:
            labels[start_i:end_i] = 1
    # Count from the final vector so overlapping utterances are not counted twice.
    positive_examples = int(np.count_nonzero(labels))
    return positive_examples, labels


# TODO use a dataclass instead of a Dict
def _readUtterances(path: str) -> List[Dict[str, Any]]:
    """Parse a tab-separated utterance file into a list of utterance dicts.

    Each non-blank line must hold at least four tab-separated columns:
    start, end, duration (parsed as floats) and content (kept as text).
    """
    utterances = []
    with open(path, 'rb') as f:
        for line in f:
            # Strip CR as well as LF so the last column does not keep a stray
            # carriage return, and skip blank lines instead of failing in float().
            line = line.rstrip(b'\r\n')
            if not line.strip():
                continue
            cols = [s.decode('utf-8') for s in line.split(b'\t')]
            utterances.append({
                'start': float(cols[0]),
                'end': float(cols[1]),
                'duration': float(cols[2]),
                'content': cols[3],
            })
    return utterances


def readData(video_id: str) -> Dict[str, Any]:
    """Load one recording, its STFT and its per-frame utterance labels.

    video_id: base name shared by '../audios/<video_id>.wav' and
        '../tsvs/<video_id>.tsv'.
    return: dict with keys
        'file_name'   -- video_id
        'signal_rate' -- sample rate reported by the WAV file
        'window_size' -- STFT segment length in samples (rate // 100)
        'step_size'   -- hop between segments in samples (window_size // 2)
        'data'        -- the sampled raw signal returned by sampleData
        'freqs_vec'   -- transposed STFT output (frames x bins) with the last
                         frame and the last frequency bin dropped
        'labels'      -- label vector from labelsFromUtterances with the last
                         entry dropped
    raises: whatever the underlying file reads raise (missing file, bad WAV);
        ValueError / IndexError on malformed TSV rows.
    """
    # dtype should be np.dtype('int16')
    rate, all_data = scipy.io.wavfile.read('../audios/%s.wav' % video_id)
    data = sampleData(rate, all_data)

    # try wrapping in `int(2 ** math.ceil(math.log(.., 2)))`
    window_size = int(rate) // 100
    step_size = window_size // 2
    # we want windows_per_second to be 200
    # Consecutive STFT frames are step_size samples apart, so the exact frame
    # rate is rate / step_size. Floor division would truncate it (e.g. 200
    # instead of ~200.45 at 44.1 kHz) and make labels drift over time.
    windows_per_second = int(rate) / step_size
    _freqs, _times, spectro = scipy.signal.stft(
        data,
        rate,
        window='hann', # default, as specified by the documentation
        nperseg=window_size,
        # The hop is nperseg - noverlap. Using window_size // 2 here gives a
        # hop of step_size + 1 when window_size is odd (e.g. 441), which no
        # longer matches the step_size framing of the raw signal.
        noverlap=window_size - step_size
    )

    utterances = _readUtterances('../tsvs/%s.tsv' % video_id)

    _num_examples, labels = labelsFromUtterances(
        utterances,
        windows_per_second,
        spectro.T.shape[0]
    )

    # TODO understand stft input and output shapes
    # Drop the last frame because I don't know how it's derived...
    return {
        'file_name': video_id,
        'signal_rate': rate,
        'window_size': window_size,
        'step_size': step_size,
        'data': data, # TODO remove this line
        'freqs_vec': spectro.T[:-1, :-1],
        'labels': labels[:-1],
        # TODO phoneme
    }


def dict2packed(data: Dict[str, Any]) -> pd.DataFrame:
    """Pack a readData-style dict into a DataFrame with one row per frame.

    Scalar entries ('file_name', 'signal_rate', 'window_size', 'step_size')
    are repeated on every row. 'raw_signal_vec' holds slices of data['data']
    of up to window_size samples taken every step_size samples, and
    'freqs_vec' / 'labels' are copied row by row. The number of rows is
    data['freqs_vec'].shape[0].
    """
    n_rows = data['freqs_vec'].shape[0]
    hop = data['step_size']  # type: int
    width = data['window_size']  # type: int
    signal = data['data']

    def repeated(value: Any) -> List[Any]:
        # One copy of a per-file scalar for every row.
        return [value] * n_rows

    # Cast to lists because pandas doesn't allow numpy.ndarray in cells
    raw_frames = [
        list(signal[offset : offset + width])
        for offset in range(0, signal.shape[0], hop)
    ]

    columns = {
        'file_name': repeated(data['file_name']),
        'signal_rate': repeated(data['signal_rate']),
        'window_size': repeated(width),
        'step_size': repeated(hop),
        'raw_signal_vec': raw_frames,
        'freqs_vec': data['freqs_vec'].tolist(),
        'labels': data['labels'],
        # TODO phoneme
    }
    return pd.DataFrame(data=columns)


def packed2unpacked(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the vector-valued columns of a packed frame into scalar columns.

    The per-row columns ('file_name', 'signal_rate', 'window_size',
    'step_size', 'labels') are carried over as-is. 'raw_signal_vec' becomes
    window_size columns named 'raw_signal_vec[i]' and 'freqs_vec' becomes
    step_size columns named 'freqs_vec[i]', where both sizes are read from
    the first row. Requires at least one row (AssertionError otherwise).
    """
    assert df.shape[0] > 0, 'at least one row is required'

    def rawAt(index):
        # The length guard shouldn't be necessary. For some reason, the last
        # frame has half the signal size compared to all the other rows.
        def pick(vec):
            if index < len(vec):
                return vec[index]
            return None
        return pick

    def freqAt(index):
        def pick(vec):
            return vec[index]
        return pick

    passthrough = ('file_name', 'signal_rate', 'window_size', 'step_size', 'labels')
    unpacked = {name: df[name] for name in passthrough}

    n_samples = df['window_size'].iat[0]
    for k in range(n_samples):
        unpacked['raw_signal_vec[%d]' % k] = df['raw_signal_vec'].apply(rawAt(k))

    # TODO understand stft input and output shapes
    # Why is frequencies limited to 240 complex values instead of 480?...
    n_freqs = df['step_size'].iat[0]
    for k in range(n_freqs):
        unpacked['freqs_vec[%d]' % k] = df['freqs_vec'].apply(freqAt(k))

    return pd.DataFrame(data=unpacked)


# Viewing options:
# 1) Signal amplitude
# 2) Test signal amplitude (examples: sum(freqs[:5]), sum(freqs[5:10]), ...)
# 3) Spectrogram pcolormesh
# 4) IPython.display.Audio
# 5) JSON utterance labels
# 6) Time series labels, i.e. overlaid on views (1)-(3) with `plt.axvline(x=item['start'], color='#d62728')`. TODO: choose a linewidth.
# `%matplotlib notebook` may be handy?

# Phoneme Labeler:
# For each utterance, view a 3 second window.

# want: frames (aka windows) of 10 ms, steps of 5 ms.
# This is an example packed datastructure
# datastruct: (file_name, frame index, signal_rate (example: 44.1kHz), raw_signal_vec, freqs_vec (further want: speech_vec + background_vec), label, phoneme)
example_df = pd.DataFrame({
    'file_name': ['a'] * 3 + ['b'] * 2,
    'signal_rate': [44100] * 5,
    'window_size': [441] * 5,
    'step_size': [220] * 5,
    'frame_index': [0, 1, 2, 0, 1],
    'window_max_i': [0, 0, 1, 0, 1],
    # Toy vectors of length 2 keep the example readable; real rows would carry
    # window_size (441) raw samples each.
    'raw_signal_vec': [[0, 0], [1, 1], [1, 2], [0, 0], [0, 1]],
    'freqs_vec': [[0, 0], [1, 0], [2, 1], [0, 0], [0, 1]],
    # TODO: fft(fft(raw_signal)) b/c harmonics. consider librosa's "pitch class"
    'label': [0, 0, 1, None, None],
    'phoneme': [None, None, 'a', None, None],
})
example_df

In [None]:
# readData gives a raw Dict/struct
# dict2packed repeats some data, like file_name, to fit into a DataFrame
# packed2unpacked makes a separate column for each field.
# sorted() makes the choice deterministic: the iteration order of a set of
# strings can change between kernel restarts once it holds more than one id.
df = dict2packed(readData(sorted(training_files)[0]))
#df = dict2packed(readData(sorted(eval_files)[0]))
# Preview only the first rows; every row carries full signal/frequency vectors.
df.head()

In [None]:
# Magnitude of every STFT coefficient, one row per frame.
freqs_vec = np.abs(np.asarray(df['freqs_vec'].tolist()))
# TODO handle zero rows better. i.e. silence
# This is how REPET-sim does it
# `norm_freqs_vec = freqs_vec * (1.0 / np.sqrt(np.power(freqs_vec, 2).sum()))`
lengths = np.linalg.norm(freqs_vec, axis=1)
# Scale each row to unit length. Indexing with `None` adds a trailing axis so
# the per-row norm broadcasts across the columns; rows whose norm is zero are
# excluded by `where` and keep the zeros pre-filled in `out`, which avoids
# dividing by zero.
norm_freqs_vec = np.divide(
    freqs_vec,
    lengths[:, None],
    out=np.zeros_like(freqs_vec, dtype=np.float64),
    where=(lengths != 0.0)[:, None],
)
# Contract: `np.power(norm_freqs_vec[i], 2).sum()` ~== 1, for all values of i.
# Except for values of i that represent a zero vector. See above TODO

# Dot product of every pair of normalized rows.
sim_mat = norm_freqs_vec @ norm_freqs_vec.T
frame_ids = np.arange(norm_freqs_vec.shape[0])
plt.pcolormesh(frame_ids, frame_ids, sim_mat)
plt.colorbar()
plt.gcf().set_size_inches(15, 12) # default is 6 x 4
plt.show()

In [None]:
unpacked_df = packed2unpacked(df)
# Names of all per-sample raw-signal columns produced by the unpacking step.
cols_to_drop = list(filter(
    lambda col: col.startswith('raw_signal_vec'),
    unpacked_df.columns,
))
cols_to_drop