## Packages

In [1]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa

from IPython.display import Audio, clear_output, display

In [2]:
import sys

sys.path.append("../scripts/")

import data_loader as dl

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

from torch import nn
from torch.nn import functional as F

import pytorch_lightning as pl

## Arguments & User Defined Functions

In [4]:
# Transcript metadata for every recording.
transcript_path = "../outputs/all_transcripts.csv"
transcripts = pd.read_csv(transcript_path)

# Only for sample purposes: restrict the metadata to a single recording.
file_path = "142-orig.wav"
file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores.csv"
bert_scores = pd.read_csv(bert_scores_path)

# Attach the scores to each transcript line. `merge` defaults to an inner
# join, so lines without a matching (file, line) score row are dropped.
file_transcripts = file_transcripts.merge(bert_scores, on=["file", "line"])

# Directory holding the pre-computed per-line feature arrays.
data_path = "../outputs/npy/"

batch_size = 32
num_workers = 1

# Largest value stored in the sample recording's "<file>_shapes.npy" array.
# The path is derived from `data_path` and `file_path` so the three cannot
# drift apart.
# NOTE(review): presumably the longest feature sequence - confirm.
max_len = np.max(np.load(osp.join(data_path, f"{file_path}_shapes.npy")))

# Output directory for the train/val/test split CSVs; `exist_ok=True` makes
# the cell safe to re-run and avoids the check-then-create race.
write_dir = "../outputs/splits/"
os.makedirs(write_dir, exist_ok=True)

## Dataset

In [5]:
# PyTorch Dataset
# NOTE: REIMPORTED DUE TO EMPTY ERROR IN `../scripts/data_loader.py`
class AudioDataset(Dataset):
    """Dataset of pre-computed per-line feature arrays with a scalar target.

    Each item is read from ``<data_dir>/<file>_<line>.npy`` where ``file`` and
    ``line`` come from the corresponding row of ``metadata``. The array is
    zero-padded or truncated along its first axis to ``trunc_pad_len`` rows.

    Args:
        metadata: DataFrame with at least the columns ``"file"``, ``"line"``
            and ``y_name``.
        data_dir: Directory containing the ``.npy`` feature files.
        y_name: Name of the metadata column used as the regression target.
        trunc_pad_len: Number of rows every returned ``x`` tensor has.
        in_dim: Number of columns every returned ``x`` tensor has; the stored
            arrays must have this many columns.
    """

    def __init__(
        self, metadata, data_dir, y_name="gs_score", trunc_pad_len=2048, in_dim=5
    ):
        super().__init__()
        self.metadata = metadata

        # Column name -> column position, so items can be fetched with purely
        # positional (and therefore fast) scalar access.
        self.columns_dict = {c: i for i, c in enumerate(self.metadata.columns)}
        self.data_dir = data_dir
        self.y_name = y_name
        self.trunc_pad_len = trunc_pad_len
        self.in_dim = in_dim

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return ``{"x": (trunc_pad_len, in_dim) float tensor, "y": (1,) float tensor}``.

        Raises:
            IndexError: If ``idx`` is out of range (this is what terminates
                plain ``iter(dataset)`` iteration).
        """
        # Positional scalar access. Integer keys on a label-indexed row Series
        # (`row[3]`) rely on a deprecated pandas fallback, so use `.iat`.
        cols = self.columns_dict
        file_name = self.metadata.iat[idx, cols["file"]]
        line_name = self.metadata.iat[idx, cols["line"]]
        score = self.metadata.iat[idx, cols[self.y_name]]

        # Read in the pre-computed numpy array
        npy_path = osp.join(self.data_dir, f"{file_name}_{line_name}.npy")
        data = np.load(npy_path)

        # Pad/Truncate: copy at most `trunc_pad_len` rows into a zero buffer.
        n_rows = min(data.shape[0], self.trunc_pad_len)
        data_aug = np.zeros((self.trunc_pad_len, self.in_dim), dtype=np.float32)
        data_aug[:n_rows, :] = data[:n_rows]

        return {
            "x": torch.tensor(data_aug, dtype=torch.float),
            "y": torch.tensor([score], dtype=torch.float),
        }

In [6]:
# Sanity check: build the dataset for the sample recording and show its first item.
new_dataset = AudioDataset(metadata=file_transcripts, data_dir=data_path)
next(iter(new_dataset))

{'x': tensor([[0.0000, 0.0000, 0.0407, 0.0000, 0.0000],
         [8.2564, 1.0000, 0.1086, 0.0000, 0.0000],
         [8.2897, 1.0000, 0.0780, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]),
 'y': tensor([-0.5621])}

## Data Module

In [7]:
# PyTorch Lightning DataModule
class AudioDataModule(pl.LightningDataModule):
    """Splits transcript metadata 80/10/10 and serves one DataLoader per split.

    ``prepare_data`` shuffles the metadata rows with a seeded generator and
    writes ``train.csv``, ``val.csv`` and ``test.csv`` into ``write_dir``.
    ``setup`` reads those files back, and the ``*_dataloader`` methods wrap
    each split in a ``dl.AudioDataset``.

    Args:
        metadata: DataFrame describing every sample (one row per sample).
        write_dir: Directory the split CSV files are written to / read from.
        data_dir: Directory containing the ``.npy`` feature files.
        batch_size: Batch size for every DataLoader.
        num_workers: Worker process count for every DataLoader.
        y_name: Name of the metadata column used as the target.
        trunc_pad_len: Passed through to the dataset.
        in_dim: Passed through to the dataset.
        seed: Seed for the shuffle that defines the split.
    """

    def __init__(
        self,
        metadata: pd.DataFrame,
        write_dir: str = "./",
        data_dir: str = "./",
        batch_size: int = 32,
        num_workers: int = 8,
        y_name="gs_score",
        trunc_pad_len=2048,
        in_dim: int = 5,
        seed: int = 42,
    ):
        super().__init__()
        self.metadata = metadata
        self.write_dir = write_dir
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.y_name = y_name
        self.trunc_pad_len = trunc_pad_len
        self.in_dim = in_dim
        self.num_workers = num_workers
        self.seed = seed

    def prepare_data(self):
        """Shuffle the metadata and write the train/val/test CSV files."""
        # Use the configured seed (previously a hard-coded 42, which is also
        # the default, so default behaviour is unchanged).
        rng = np.random.default_rng(self.seed)
        indices = rng.permutation(self.metadata.shape[0])

        # 80% train, 10% validation, remainder (incl. rounding leftovers) test.
        train_size = math.floor(len(indices) * 0.80)
        val_size = math.floor(len(indices) * 0.10)
        train_idx = indices[:train_size]
        val_idx = indices[train_size : train_size + val_size]
        test_idx = indices[train_size + val_size :]

        # Make sure the target directory exists before writing into it.
        os.makedirs(self.write_dir, exist_ok=True)

        self.train = self.metadata.iloc[train_idx].reset_index(drop=True)
        self.train.to_csv(osp.join(self.write_dir, "train.csv"), index=False)

        self.val = self.metadata.iloc[val_idx].reset_index(drop=True)
        self.val.to_csv(osp.join(self.write_dir, "val.csv"), index=False)

        self.test = self.metadata.iloc[test_idx].reset_index(drop=True)
        self.test.to_csv(osp.join(self.write_dir, "test.csv"), index=False)

    def setup(self, stage=None):
        """Load the split CSV files written by ``prepare_data``.

        Args:
            stage: Accepted so a Trainer can call ``setup(stage=...)``;
                all three splits are loaded regardless of its value.
        """
        self.train_data = pd.read_csv(osp.join(self.write_dir, "train.csv"))
        self.val_data = pd.read_csv(osp.join(self.write_dir, "val.csv"))
        # `test_dataloader` reads `self.test_data`; it used to be stored only
        # under `self.test`, which made `test_dataloader()` raise AttributeError.
        self.test_data = pd.read_csv(osp.join(self.write_dir, "test.csv"))
        # Keep the old attribute name for any code that still reads it.
        self.test = self.test_data

    def _make_loader(self, split_metadata):
        """Wrap one split's metadata in a dataset and a DataLoader."""
        # NOTE(review): this uses the class from `../scripts/data_loader.py`,
        # while the notebook also defines a local `AudioDataset` - confirm
        # which one is intended.
        return DataLoader(
            dataset=dl.AudioDataset(
                metadata=split_metadata,
                data_dir=self.data_dir,
                y_name=self.y_name,
                trunc_pad_len=self.trunc_pad_len,
                in_dim=self.in_dim,
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the DataLoader over the training split."""
        return self._make_loader(self.train_data)

    def val_dataloader(self):
        """Return the DataLoader over the validation split."""
        return self._make_loader(self.val_data)

    def test_dataloader(self):
        """Return the DataLoader over the test split."""
        return self._make_loader(self.test_data)

In [None]:
# Build the data module for the sample recording.
datamodule = AudioDataModule(
    metadata=file_transcripts,
    data_dir=data_path,
    write_dir=write_dir,
    y_name="gs_score",
    in_dim=5,
    trunc_pad_len=2048,
    batch_size=32,
    num_workers=4,
    seed=42,
)

# Run the split-writing and split-loading steps by hand, in that order.
datamodule.prepare_data()
datamodule.setup()

# Pull a single training batch to check the pipeline end to end.
train_loader = datamodule.train_dataloader()
next(iter(train_loader))

## Model Module

In [45]:
class AudioFFModel(pl.LightningModule):
    """Minimal regression baseline: batch norm, flatten, one linear layer.

    Args:
        input_dims: Pair ``(d0, d1)``. ``d0`` is the feature count given to
            ``BatchNorm1d`` and ``d0 * d1`` is the input width of the linear
            layer, which produces a single output value per sample.
    """

    def __init__(self, input_dims=(2048, 5)):
        super().__init__()
        n_rows = input_dims[0]
        n_cols = input_dims[1]
        # Attribute names and creation order are kept stable: they determine
        # the state_dict keys and the order parameters are initialised in.
        self.batch_norm = nn.BatchNorm1d(n_rows)
        self.flatten = nn.Flatten()
        self.l1 = nn.Linear(n_rows * n_cols, 1)

    def forward(self, x):
        """Normalise, flatten, then project to a single value per sample."""
        return self.l1(self.flatten(self.batch_norm(x)))

    def training_step(self, batch, batch_idx):
        """Return the mean-squared error between predictions and ``batch["y"]``."""
        features = batch["x"]
        target = batch["y"]
        prediction = self(features)
        return F.mse_loss(prediction, target)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 0.02."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.02)
        return optimizer

In [46]:
# Fit the baseline model on the training split only (default Trainer settings).
trainer = pl.Trainer()
model = AudioFFModel()
train_loader = datamodule.train_dataloader()
trainer.fit(model, train_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name       | Type        | Params
-------------------------------------------
0 | batch_norm | BatchNorm1d | 4.1 K 
1 | flatten    | Flatten     | 0     
2 | l1         | Linear      | 10.2 K
3 | relu       | ReLU        | 0     
-------------------------------------------
14.3 K    Trainable params
0         Non-trainable params
14.3 K    Total params
0.057     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

  loss = F.mse_loss(y_hat, y)
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
