## Packages

In [1]:
%load_ext autoreload
%autoreload 2

import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa

from IPython.display import Audio, clear_output, display

In [2]:
import sys

sys.path.append("../scripts/")

import data_loader as dl

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

from torch import nn
from torch.nn import functional as F

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers.csv_logs import CSVLogger

## Arguments & User Defined Functions

In [4]:
transcript_path = "../outputs/data_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores_v2.csv"
bert_scores = pd.read_csv(bert_scores_path)

transcripts = transcripts.merge(bert_scores, on=["file", "line"])
transcripts["line"] = transcripts["line"].astype(str)

data_path = "../outputs/npy2"

summary_data_path = "../outputs/nn_summary_info.csv"
summary_data = pd.read_csv(summary_data_path)

seed = 42
batch_size = 128
num_workers = 8
sequence_len = 2048
n_features = 5

write_dir = "../outputs/splits/"
if not osp.exists(write_dir):
    os.makedirs(write_dir)

In [5]:
# npys = [path for path in Path(data_path).rglob("*.npy")]

# npys_name = [n.name for n in npys]
# npys_name = pd.DataFrame(npys_name, columns=["name"])
# sequences = npys_name.loc[~npys_name["name"].str.contains("shape")].copy()
# shapes = npys_name.loc[npys_name["name"].str.contains("shape")].copy()

# print("Sequence Arrays:", len(sequences))
# sequences["splits"] = sequences["name"].apply(lambda x: x.split("_"))
# sequences["file"] = sequences["splits"].apply(lambda x: x[0])
# sequences["line"] = sequences["splits"].apply(lambda x: x[1].split(".")[0])

# transcripts = transcripts.merge(
#     sequences[["file", "line"]], on=["file", "line"]
# ).reset_index(drop=True)

# print("Matching Sequences:", len(transcripts))
# try:
#   transcripts = transcripts.drop("Unnamed: 0", axis = 1)
# except:
#   pass
# transcripts.sample(n=4)

# transcripts.to_csv("../outputs/valid_transcripts.csv", index=False)
transcripts = pd.read_csv("../outputs/valid_transcripts.csv")
transcripts = transcripts.merge(summary_data, on=["file","line"])
print(transcripts.shape)

(17147, 44)


## Dataset

In [6]:
# PyTorch Dataset
class AudioDataset(Dataset):
    def __init__(
        self,
        metadata,
        data_dir,
        y_name="gs_score",
        trunc_pad_len=2048,
        in_dim=35,
    ):
        super().__init__()
        self.metadata = metadata

        # Faster than using a .loc on column names directly
        self.columns_dict = dict([(c, i) for i, c in enumerate(self.metadata.columns)])
        self.data_dir = data_dir
        self.y_name = y_name
        self.trunc_pad_len = trunc_pad_len
        self.in_dim = in_dim

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        # Get row with .iloc
        row = self.metadata.iloc[idx]

        means_stds = [c for c in self.columns_dict.keys() if ("mean" in c) or ("std" in c)]

        # self.in_dim += len(means_stds)

        # Read in pre-computed numpy array
        file_name = row[self.columns_dict["file"]]
        line_name = row[self.columns_dict["line"]]
        npy_path = osp.join(self.data_dir, f"{file_name}_{line_name}.npy")
        data = np.load(npy_path)

        # Get y_true
        score = row[self.columns_dict[self.y_name]]

        # Get averages and standard deviations of the features before padding.
        summary_arr = row[[self.columns_dict[c] for c in means_stds]].values
        summary_arr_tiled = np.tile(summary_arr, (data.shape[0], 1))

        data = np.concatenate([data, summary_arr_tiled], axis=1)

        # Pad/Truncate
        data_aug = np.zeros((self.trunc_pad_len, self.in_dim))
        data_aug[: min(data.shape[0], self.trunc_pad_len), :] = data[
            : self.trunc_pad_len
        ]
        item = {
            "x": torch.tensor(data_aug, dtype=torch.float),
            "y": torch.tensor([score], dtype=torch.float),
        }

        return (item["x"], item["y"])

## Data Module

In [7]:
# PyTorch Lightning Module
class AudioDataModule(pl.LightningDataModule):
    def __init__(
        self,
        metadata: pd.DataFrame,
        write_dir: str = "./",
        data_dir: str = "./",
        batch_size: int = 32,
        num_workers: int = 8,
        y_name="gs_score",
        trunc_pad_len=2048,
        in_dim: int = 5,
        seed: int = 42,
    ):
        super().__init__()
        self.metadata = metadata
        self.write_dir = write_dir
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.y_name = y_name
        self.trunc_pad_len = trunc_pad_len
        self.in_dim = in_dim
        self.num_workers = num_workers
        self.seed = seed

    def prepare_data(self):
        # Split out transcript metadata into train, val, test
        rng = np.random.default_rng(42)
        indices = rng.permutation(self.metadata.shape[0])
        train_size = math.floor(len(indices) * 0.80)
        val_size = math.floor(len(indices) * 0.10)
        train_idx = indices[:train_size]
        val_idx = indices[train_size : train_size + val_size]
        test_idx = indices[train_size + val_size :]

        self.train = self.metadata.iloc[train_idx].reset_index(drop=True)
        self.train.to_csv(osp.join(self.write_dir, "train.csv"), index=False)

        self.val = self.metadata.iloc[val_idx].reset_index(drop=True)
        self.val.to_csv(osp.join(self.write_dir, "val.csv"), index=False)

        self.test = self.metadata.iloc[test_idx].reset_index(drop=True)
        self.test.to_csv(osp.join(self.write_dir, "test.csv"), index=False)

    def setup(self):
        # Load in train, val, test datasets
        self.train_data = pd.read_csv(osp.join(self.write_dir, "train.csv"))
        self.val_data = pd.read_csv(osp.join(self.write_dir, "val.csv"))
        self.test_data = pd.read_csv(osp.join(self.write_dir, "test.csv"))

    def train_dataloader(self):
        return DataLoader(
            dataset=dl.AudioDataset(
                metadata=self.train_data,
                data_dir=self.data_dir,
                y_name=self.y_name,
                trunc_pad_len=self.trunc_pad_len,
                in_dim=self.in_dim,
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            dl.AudioDataset(
                metadata=self.val_data,
                data_dir=self.data_dir,
                y_name=self.y_name,
                trunc_pad_len=self.trunc_pad_len,
                in_dim=self.in_dim,
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        return DataLoader(
            dl.AudioDataset(
                metadata=self.test_data,
                data_dir=self.data_dir,
                y_name=self.y_name,
                trunc_pad_len=self.trunc_pad_len,
                in_dim=self.in_dim,
            ),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

In [8]:
datamodule = AudioDataModule(
    metadata=transcripts,
    write_dir=write_dir,
    data_dir=data_path,
    batch_size=batch_size,
    num_workers=num_workers,
    y_name="gs_score",
    trunc_pad_len=sequence_len,
    in_dim=n_features,
    seed=42,
)

datamodule.prepare_data()

datamodule.setup()

In [9]:
one_batch = next(iter(datamodule.train_dataloader()))
print(one_batch[0].size())
print(one_batch[1].size())

torch.Size([128, 2048, 5])
torch.Size([128, 1])


## Model Module

In [10]:
class AudioFFModel(pl.LightningModule):
    def __init__(self, criterion, input_dims=(2048, 5), learning_rate=0.001):
        super(AudioFFModel, self).__init__()
        self.criterion = criterion
        self.learning_rate = learning_rate
        self.batch_norm = nn.BatchNorm1d(input_dims[1])
        self.flatten = nn.Flatten()
        self.relu = nn.ReLU()
        self.l1 = nn.Linear(
            input_dims[0] * input_dims[1], int(input_dims[0] * input_dims[1] / 4)
        )
        self.l2 = nn.Linear(
            int(input_dims[0] * input_dims[1] / 4),
            int(input_dims[0] * input_dims[1] / 16),
        )
        self.l3 = nn.Linear(int(input_dims[0] * input_dims[1] / 16), 1)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.batch_norm(x)
        x = self.flatten(x)
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        x = self.relu(x)
        x = self.l3(x)
        return x

    def training_step(self, batch, *_):
        x, y = batch[0], batch[1]
        y_hat = self(x)
        loss = self.criterion(y_hat, y)

        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, *_):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, *_):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [11]:
# trainer = pl.Trainer()
# model = AudioFFModel(criterion=F.mse_loss)
# trainer.fit(model, datamodule.train_dataloader())

In [16]:
class LSTMRegressor(pl.LightningModule):
    """
    Standard PyTorch Lightning module:
    https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
    """

    def __init__(
        self,
        n_features,
        hidden_size,
        seq_len,
        batch_size,
        num_layers,
        dropout,
        learning_rate,
        criterion,
        bidirectional,
        track_level_flag,
        convnet_flag,
        kernel_size=None,
        stride=None,
        conv_output=None,
    ):
        super(LSTMRegressor, self).__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.criterion = criterion
        self.learning_rate = learning_rate
        self.bidirectional = bidirectional

        self.lstm_input = self.n_features
        self.bn = nn.BatchNorm1d(n_features)

        self.track_level_flag = track_level_flag
        if self.track_level_flag:
            self.lstm_input = self.n_features * 3  # original + mean + std

        self.convnet_flag = convnet_flag
        if self.convnet_flag:
            self.conv_input = self.lstm_input
            self.kernel_size = kernel_size
            self.stride = stride
            self.conv_output = conv_output
            self.conv1d = nn.Conv1d(
                self.conv_input, self.conv_output, self.kernel_size, self.stride, "same"
            )
            self.lstm_input = conv_output

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bias=True
        )

        for layer_p in self.lstm._all_weights:
          for p in layer_p:
              if 'weight' in p:
                  nn.init.xavier_uniform_(self.lstm.__getattr__(p))
              if 'bias' in p:
                  nn.init.constant_(self.lstm.__getattr__(p), 1.0)
                  
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x_orig = (batch_size, seq_len, hidden_size)
        # x = x.permute(0, 2, 1)
        # x = self.bn(x)
        # x = x.permute(0, 2, 1)

        # if self.track_level_flag:
        #     x_means = x.mean(axis=1).unsqueeze(1).repeat(1, self.seq_len, 1)
        #     x_stds = x.std(axis=1).unsqueeze(1).repeat(1, self.seq_len, 1)
        #     x = torch.cat((x, x_means, x_stds), dim=2)

        # if self.convnet_flag:
        #     x = x.permute(0, 2, 1)
        #     x = self.conv1d(x)
        #     x = x.permute(0, 2, 1)

        h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to("cuda")
        c0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to("cuda")

        lstm_out, _ = self.lstm(x, (h0, c0))
        
        y_pred = self.linear(lstm_out[:, -1])
        return y_pred

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, *_):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, *_):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, *_):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("test_loss", loss)

In [17]:
"""
All parameters are aggregated in one place.
This is useful for reporting experiment params to experiment tracking software
"""

p = dict(
    seq_len=2048,
    batch_size=128,
    criterion=nn.MSELoss(),
    max_epochs=50,
    n_features=n_features,
    hidden_size=128,
    num_layers=2,
    dropout=0,
    learning_rate=0.001,
    bidirectional=False,
    track_level_flag=False,
    convnet_flag=False,
    kernel_size=16,
    stride=1,
    conv_output=8,
)

In [18]:
lstm_model = LSTMRegressor(
    n_features=p["n_features"],
    hidden_size=p["hidden_size"],
    seq_len=p["seq_len"],
    batch_size=p["batch_size"],
    criterion=p["criterion"],
    num_layers=p["num_layers"],
    dropout=p["dropout"],
    learning_rate=p["learning_rate"],
    bidirectional=p["bidirectional"],
    track_level_flag=p["track_level_flag"],
    convnet_flag=p["convnet_flag"],
    kernel_size=p["kernel_size"],
    stride=p["stride"],
    conv_output=p["conv_output"],
)

dm = AudioDataModule(
    metadata=transcripts,
    write_dir=write_dir,
    data_dir=data_path,
    batch_size=p["batch_size"],
    num_workers=num_workers,
    y_name="gs_score",
    trunc_pad_len=sequence_len,
    in_dim=n_features,
    seed=42,
)

dm.prepare_data()
dm.setup()

In [19]:
trainer = pl.Trainer(gpus=1, max_epochs=1, default_root_dir="./logs/")

trainer.fit(lstm_model, dm)
trainer.test(lstm_model, datamodule=dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params
------------------------------------------
0 | criterion | MSELoss     | 0     
1 | bn        | BatchNorm1d | 10    
2 | lstm      | LSTM        | 201 K 
3 | linear    | Linear      | 129   
------------------------------------------
201 K     Trainable params
0         Non-trainable params
201 K     Total params
0.805     Total estimated model params size (MB)


Epoch 0:  88%|████████▊ | 107/122 [00:33<00:04,  3.17it/s, loss=0.438, v_num=4]

RuntimeError: Expected hidden[0] size (2, 21, 128), got [2, 128, 128]