In [None]:
%load_ext autoreload
%autoreload 2

import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa

from IPython.display import Audio, clear_output, display

import sys

sys.path.append("../scripts/")

import data_loader as dl

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict

In [None]:
# --- Input tables -------------------------------------------------------------
# Transcript table, joined below with the BERT score table on (file, line).
transcript_path = "../outputs/data_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores_v2.csv"
bert_scores = pd.read_csv(bert_scores_path)

# Inner join (pandas default): only (file, line) pairs present in both tables survive.
transcripts = transcripts.merge(bert_scores, on=["file", "line"])
transcripts["line"] = transcripts["line"].astype(str)

# Directory handed to the data module as `data_dir`.
data_path = "../outputs/npy2"

summary_data_path = "../outputs/nn_summary_info.csv"
summary_data = pd.read_csv(summary_data_path)

# --- Run configuration ----------------------------------------------------------
seed = 42
batch_size = 128
num_workers = 8
sequence_len = 2048  # passed to the data module as `trunc_pad_len`
n_features = 5  # passed as `in_dim` to the data module and as the LSTM input size

# Directory where the train/val/test split CSVs are written.
# `exist_ok=True` makes this idempotent and avoids the check-then-create race.
write_dir = "../outputs/splits/"
os.makedirs(write_dir, exist_ok=True)

In [None]:
# Replace the working table with the validated transcripts, inner-joined
# with the summary info on (file, line), then report the resulting shape.
transcripts = pd.merge(
    left=pd.read_csv("../outputs/valid_transcripts.csv"),
    right=summary_data,
    on=["file", "line"],
)
print(transcripts.shape)

In [None]:
# PyTorch Lightning Module
class AudioDataModule(pl.LightningDataModule):
    """Split a metadata table 80/10/10 and serve each split through ``dl.AudioDataset``.

    ``prepare_data`` shuffles the rows of ``metadata`` with a seeded generator and
    writes ``train.csv``, ``val.csv`` and ``test.csv`` into ``write_dir``;
    ``setup`` reads those files back; the ``*_dataloader`` methods wrap them in
    ``DataLoader`` objects.

    Args:
        metadata: Table with one row per sample.
        write_dir: Directory the split CSVs are written to and read from.
        data_dir: Forwarded to ``dl.AudioDataset`` as ``data_dir``.
        batch_size: Batch size used by every dataloader.
        num_workers: Worker process count used by every dataloader.
        y_name: Forwarded to ``dl.AudioDataset`` as ``y_name``
            (presumably the target column -- confirm in ``data_loader``).
        trunc_pad_len: Forwarded to ``dl.AudioDataset`` as ``trunc_pad_len``.
        in_dim: Forwarded to ``dl.AudioDataset`` as ``in_dim``.
        seed: Seed for the permutation that defines the split.
    """

    # Fractions of the shuffled rows assigned to train and val; the rest is test.
    _TRAIN_FRACTION = 0.80
    _VAL_FRACTION = 0.10

    def __init__(
        self,
        metadata: pd.DataFrame,
        write_dir: str = "./",
        data_dir: str = "./",
        batch_size: int = 32,
        num_workers: int = 8,
        y_name="gs_score",
        trunc_pad_len=2048,
        in_dim: int = 5,
        seed: int = 42,
    ):
        super().__init__()
        self.metadata = metadata
        self.write_dir = write_dir
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.y_name = y_name
        self.trunc_pad_len = trunc_pad_len
        self.in_dim = in_dim
        self.num_workers = num_workers
        self.seed = seed

    def prepare_data(self):
        """Shuffle ``metadata`` and write the train/val/test CSVs to ``write_dir``."""
        # An empty write_dir means "current directory"; os.makedirs("") would raise.
        if self.write_dir:
            os.makedirs(self.write_dir, exist_ok=True)

        # Use the configured seed (previously hard-coded to 42, so `seed` was ignored).
        rng = np.random.default_rng(self.seed)
        indices = rng.permutation(self.metadata.shape[0])
        train_size = math.floor(len(indices) * self._TRAIN_FRACTION)
        val_size = math.floor(len(indices) * self._VAL_FRACTION)

        split_indices = {
            "train": indices[:train_size],
            "val": indices[train_size : train_size + val_size],
            # Test takes every remaining row, so rounding never drops samples.
            "test": indices[train_size + val_size :],
        }
        for split_name, split_idx in split_indices.items():
            frame = self.metadata.iloc[split_idx].reset_index(drop=True)
            frame.to_csv(osp.join(self.write_dir, f"{split_name}.csv"), index=False)
            # Kept as self.train / self.val / self.test for backward compatibility.
            setattr(self, split_name, frame)

    def setup(self, stage=None):
        """Load the split CSVs written by ``prepare_data``.

        Args:
            stage: Accepted because the Trainer may call this hook with a
                ``stage`` argument; all three splits are loaded regardless.
        """
        self.train_data = pd.read_csv(osp.join(self.write_dir, "train.csv"))
        self.val_data = pd.read_csv(osp.join(self.write_dir, "val.csv"))
        self.test_data = pd.read_csv(osp.join(self.write_dir, "test.csv"))

    def _make_loader(self, split_metadata, shuffle=False):
        """Build a ``DataLoader`` over ``dl.AudioDataset`` for one split."""
        dataset = dl.AudioDataset(
            metadata=split_metadata,
            data_dir=self.data_dir,
            y_name=self.y_name,
            trunc_pad_len=self.trunc_pad_len,
            in_dim=self.in_dim,
        )
        return DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Return the training loader; reshuffled every epoch."""
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return self._make_loader(self.val_data)

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return self._make_loader(self.test_data)

In [None]:
N_EPOCHS = 8
# NOTE(review): BATCH_SIZE is not what the data module receives below -- it gets
# the lowercase `batch_size` from the configuration cell. Confirm which is intended.
BATCH_SIZE = 64

data_module = AudioDataModule(
    metadata=transcripts,
    write_dir=write_dir,
    data_dir=data_path,
    batch_size=batch_size,
    num_workers=num_workers,
    y_name="gs_score",
    trunc_pad_len=sequence_len,
    in_dim=n_features,
    seed=seed,  # use the configured seed instead of repeating the literal
)

# Write the split CSVs, then load them so the dataloaders can be used right away.
data_module.prepare_data()
data_module.setup()

In [None]:
# Sanity check: pull a single training batch and print the sizes of its
# first two elements, one per line.
one_batch = next(iter(data_module.train_dataloader()))
print(one_batch[0].shape, one_batch[1].shape, sep="\n")

In [None]:
class AudioLSTMModel(nn.Module):
    """Stacked LSTM followed by two linear layers producing one value per sequence.

    Args:
        n_features: Size of each time step's feature vector (LSTM ``input_size``).
        n_hidden: LSTM hidden size, also the width of the intermediate linear layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. Ignored (set
            to 0) when ``n_layers == 1``, because ``nn.LSTM`` only applies
            dropout between layers and warns otherwise.
    """

    def __init__(self, n_features, n_hidden=128, n_layers=2, dropout=0.2):
        super().__init__()

        self.n_hidden = n_hidden

        # Inter-layer dropout has no effect with a single layer; pass 0 to avoid the warning.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            batch_first=True,
            num_layers=n_layers,
            dropout=lstm_dropout,
        )
        # NOTE(review): ffc and regressor are applied back to back with no
        # activation in between, so together they act as one linear map.
        self.ffc = nn.Linear(n_hidden, n_hidden)
        self.regressor = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, n_features)`` tensor to ``(batch, 1)`` predictions."""
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        # hidden is (n_layers, batch, n_hidden); take the top layer's final state.
        out = hidden[-1]
        ff = self.ffc(out)
        return self.regressor(ff)

In [None]:
class AudioLSTMModelv2(nn.Module):
    """LSTM regressor that feeds the last time step's output into one linear layer.

    Args:
        input_size: Size of each time step's feature vector.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.input_size = input_size

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, 1)`` predictions."""
        out, _ = self.lstm(x)
        # out is (batch, seq_len, hidden_size). Flattening it to
        # (batch, seq_len * hidden_size) does not fit `fc`, which expects
        # hidden_size inputs, so use the last time step only. For seq_len == 1
        # this is identical to the previous flattening.
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [None]:
class AudioLSTMPredictor(pl.LightningModule):
    """LightningModule that trains ``AudioLSTMModel`` with mean-squared-error loss.

    Args:
        n_features: Forwarded to ``AudioLSTMModel`` as the input feature size.
        n_hidden: Forwarded to ``AudioLSTMModel`` as the hidden size.
        lr: Learning rate for the AdamW optimizer.
    """

    def __init__(self, n_features, n_hidden, lr=0.0001):
        super().__init__()
        self.model = AudioLSTMModel(n_features, n_hidden)
        self.criterion = nn.MSELoss()
        self.lr = lr

    def forward(self, x, labels=None):
        """Run the model and, when ``labels`` is given, compute the MSE loss.

        Returns:
            ``(loss, output)``; ``loss`` is ``0`` when ``labels`` is ``None``.
        """
        output = self.model(x)
        loss = 0
        if labels is not None:
            # The model emits (batch, 1). Labels shaped (batch,) would broadcast
            # to (batch, batch) inside MSELoss and silently give a wrong loss,
            # so align the shapes whenever the element counts agree.
            if labels.shape != output.shape and labels.numel() == output.numel():
                labels = labels.reshape(output.shape)
            loss = self.criterion(output, labels)
        return loss, output

    def _shared_step(self, batch, metric_name):
        """Compute the loss for one ``(x, y)`` batch and log it under ``metric_name``."""
        x, y = batch
        loss, _ = self(x, y)
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with learning rate ``lr``."""
        return optim.AdamW(self.parameters(), lr=self.lr)

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

logger = TensorBoardLogger("lightning_logs", name="audio-model-v")
# Stop training once val_loss has failed to improve for 2 consecutive checks.
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=2)
trainer = pl.Trainer(
    logger=logger,
    # ModelCheckpoint belongs in `callbacks`: passing an instance through
    # `checkpoint_callback=` is deprecated and rejected by later Lightning releases.
    callbacks=[early_stopping_callback, checkpoint_callback],
    max_epochs=N_EPOCHS,
    # NOTE(review): `gpus=` is kept to match the Lightning API this notebook
    # targets; newer releases use `accelerator="gpu", devices=1` instead.
    gpus=1,
)

In [None]:
# Seed Python, NumPy and torch RNGs so weight initialisation (and any
# shuffling) is reproducible from run to run.
pl.seed_everything(seed)

model = AudioLSTMPredictor(n_features=n_features, n_hidden=256)
# Pass the data module by keyword so it cannot be mistaken for a dataloader.
trainer.fit(model, datamodule=data_module)

In [None]:
# Evaluate on the held-out test split. The data module is passed by keyword:
# the second positional argument of `Trainer.test` is the dataloaders slot.
trainer.test(model, datamodule=data_module)