In [1]:
# Re-loads all imports every time the cell is ran. 
%load_ext autoreload
%autoreload 2

from time import time

import math
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format

from IPython.display import display

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Neural Networks
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
# from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append("../scripts/")
import data_loader as dl

In [3]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import os.path as osp
for dirname, _, filenames in os.walk('C:/Users/yagne/Downloads/household_power_consumption.txt/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

C:/Users/yagne/Downloads/household_power_consumption.txt/household_power_consumption.txt


In [4]:
class NewAudioDataset(Dataset):
    def __init__(self, 
                 metadata,
                 data_maxes = np.load("../outputs/data_maxes.npy"),
                 data_directory="../outputs/npy2",
                 num_features: int = 5,
                 seq_len: int = 2048,
                 y_col="gs_score"):
        self.metadata = metadata
        self.columns_dict = dict([(c, i) for i, c in enumerate(self.metadata.columns)])
        self.data_maxes = data_maxes
        self.data_directory = data_directory
        self.num_features = num_features
        self.seq_len = seq_len
        self.y_col = "gs_score"
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, index):
        row = self.metadata.iloc[index]
        # means_stds = [c for c in self.columns_dict.keys() if ("mean" in c) or ("std" in c)]
        file_name = row[self.columns_dict["file"]]
        line_name = row[self.columns_dict["line"]]
        npy_path = osp.join(self.data_directory, f"{file_name}_{line_name}.npy")
        data = np.load(npy_path)
        data = data/self.data_maxes
        
        # Get y_true
        score = row[self.columns_dict[self.y_col]]
#         score = score.reshape(-1,1)
        
        data_aug = np.zeros((self.seq_len, self.num_features))

        data_aug[: min(data.shape[0], self.seq_len), :] = data[
            : self.seq_len
        ]
        
        item = {
            "x": torch.tensor(data_aug, dtype=torch.float),
            "y": torch.tensor(score, dtype=torch.float),
        }

        return (item["x"], item["y"])

In [5]:
transcripts = pd.read_csv("../outputs/valid_transcripts.csv")
it = iter(NewAudioDataset(transcripts))
print(NewAudioDataset(transcripts).__len__())
print(next(it))
print(next(it))
print(next(it))
print(next(it))

17110
(tensor([[0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]), tensor(-1.1736))
(tensor([[0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]), tensor(-0.8314))
(tensor([[0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0100, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]), tensor(0.740

In [6]:
class NewAudioDataModule(pl.LightningDataModule):
    def __init__(self,
                 metadata,
                 data_maxes = np.load("../outputs/data_maxes.npy"),
                 split_directory="../outputs/splits",
                 data_directory="../outputs/npy2",
                 seq_len=2048,
                 num_features=5,
                 y_col="gs_score",
                 batch_size=128,
                 num_workers=4,
                 seed=42
                ):
        super().__init__()
        self.metadata = metadata
        self.data_maxes = data_maxes
        self.split_directory = split_directory
        self.data_directory = data_directory
        self.seq_len = seq_len
        self.num_features = num_features
        self.y_col = y_col
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.seed = seed
        
    def prepare_data(self):
        rng = np.random.default_rng(self.seed)
        indices = rng.permutation(self.metadata.shape[0])
        train_size = math.floor(len(indices) * 0.80)
        val_size = math.floor(len(indices) * 0.10)
        train_idx = indices[:train_size]
        val_idx = indices[train_size : train_size + val_size]
        test_idx = indices[train_size + val_size :]
    
        self.train = self.metadata.iloc[train_idx].reset_index(drop=True)
        self.train.to_csv(osp.join(self.split_directory, "train.csv"), index=False)

        self.val = self.metadata.iloc[val_idx].reset_index(drop=True)
        self.val.to_csv(osp.join(self.split_directory, "val.csv"), index=False)

        self.test = self.metadata.iloc[test_idx].reset_index(drop=True)
        self.test.to_csv(osp.join(self.split_directory, "test.csv"), index=False)
        
    def setup(self):
        self.train_data = pd.read_csv(osp.join(self.split_directory, "train.csv"))
        self.val_data = pd.read_csv(osp.join(self.split_directory, "val.csv"))
        self.test_data = pd.read_csv(osp.join(self.split_directory, "test.csv"))
    
    def train_dataloader(self):
        self.train_dataset = dl.NewAudioDataset(metadata=self.train_data,
            data_maxes = self.data_maxes,
            data_directory=self.data_directory,
            num_features= self.num_features,
            seq_len = self.seq_len,
            y_col=self.y_col)
        train_loader = DataLoader(self.train_dataset, 
                                  batch_size = self.batch_size, 
                                  shuffle = False, 
                                  num_workers = self.num_workers)
        
        return train_loader

    def val_dataloader(self):
        self.val_dataset = dl.NewAudioDataset(metadata=self.val_data,
            data_maxes = self.data_maxes,
            data_directory=self.data_directory,
            num_features= self.num_features,
            seq_len = self.seq_len,
            y_col=self.y_col)
        val_loader = DataLoader(self.val_dataset, 
                                batch_size = self.batch_size, 
                                shuffle = False, 
                                num_workers = self.num_workers)

        return val_loader

    def test_dataloader(self):
        self.test_dataset = dl.NewAudioDataset(
            metadata=self.test_data,
            data_maxes = self.data_maxes,
            data_directory=self.data_directory,
            num_features= self.num_features,
            seq_len = self.seq_len,
            y_col=self.y_col)
        test_loader = DataLoader(self.test_dataset, 
                                 batch_size = self.batch_size, 
                                 shuffle = False, 
                                 num_workers = self.num_workers)

        return test_loader

In [7]:
transcripts = pd.read_csv("../outputs/valid_transcripts.csv")


dm = NewAudioDataModule(metadata=transcripts,
    data_maxes = np.load("../outputs/data_maxes.npy"),                    
    split_directory="../outputs/splits",
    data_directory="../outputs/npy2",
    seq_len=2048,
    num_features=5,
    y_col="gs_score",
    batch_size=128,
    num_workers=4,
    seed=42
)

dm.prepare_data()
dm.setup()

In [8]:
b = next(iter(dm.test_dataloader()))

In [9]:
b[1].unsqueeze(1)

tensor([[-1.0897],
        [-1.0980],
        [-0.4058],
        [ 0.2149],
        [ 0.4949],
        [ 1.1137],
        [-0.8485],
        [ 0.4415],
        [ 0.1394],
        [ 0.9759],
        [ 0.8286],
        [ 1.0672],
        [-0.5937],
        [-0.1836],
        [-0.6085],
        [ 0.0931],
        [ 0.2375],
        [ 0.2525],
        [ 0.4784],
        [-0.1702],
        [-0.9899],
        [ 0.2278],
        [ 0.4687],
        [-0.1692],
        [ 0.0160],
        [-1.6187],
        [ 0.5862],
        [-1.4173],
        [-0.1363],
        [ 0.0659],
        [ 0.1783],
        [-0.5699],
        [ 0.2688],
        [-0.1328],
        [ 0.8270],
        [ 0.6541],
        [ 0.9794],
        [-0.4515],
        [-0.1409],
        [ 0.3374],
        [-1.3745],
        [-0.4433],
        [-0.3409],
        [ 0.2464],
        [ 0.8267],
        [ 0.3802],
        [ 0.2278],
        [-1.2297],
        [ 0.3120],
        [-0.3844],
        [-0.6961],
        [ 0.4879],
        [-0.

In [10]:
class LSTMRegressor(pl.LightningModule):
    '''
    Standard PyTorch Lightning module:
    https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
    '''
    def __init__(self, 
                 n_features, 
                 hidden_size, 
                 seq_len, 
                 batch_size,
                 num_layers, 
                 dropout, 
                 learning_rate,
                 criterion):
        super(LSTMRegressor, self).__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.criterion = criterion
        self.learning_rate = learning_rate

        self.lstm = nn.LSTM(input_size=n_features, 
                            hidden_size=hidden_size,
                            num_layers=num_layers, 
                            dropout=dropout, 
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        # lstm_out = (batch_size, seq_len, hidden_size)
        lstm_out, _ = self.lstm(x)
        y_pred = self.linear(lstm_out[:,-1])
        return y_pred
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y = y.unsqueeze(1)
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y = y.unsqueeze(1)
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y = y.unsqueeze(1)
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

In [11]:
'''
All parameters are aggregated in one place.
This is useful for reporting experiment params to experiment tracking software
'''

p = dict(
    seq_len = 2048,
    batch_size = 64, 
    criterion = nn.MSELoss(),
    num_workers = 4,
    max_epochs = 50,
    n_features = 5,
    hidden_size = 128,
    num_layers = 2,
    dropout = 0.2,
    learning_rate = 0.0001,
    path="C:/Users/yagne/Downloads/household_power_consumption.txt/household_power_consumption.txt"
)

In [12]:
%load_ext tensorboard
%tensorboard --logdir ./lstm

Reusing TensorBoard on port 6006 (pid 17040), started 1 day, 4:57:22 ago. (Use '!kill 17040' to kill it.)

In [13]:
# seed_everything(1)

# csv_logger = CSVLogger('./', name='lstm'),

checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best-checkpoint", 
    save_top_k=1, 
    verbose =True, 
    monitor = "val_loss",
    mode="min"
)


logger = TensorBoardLogger("lstm", name="audio_change")
early_stopping_callback = EarlyStopping(monitor="val_loss", patience = 4)

trainer = Trainer(
    max_epochs=p['max_epochs'],
    logger=logger,
    gpus=1,
    checkpoint_callback=checkpoint_callback,
    callbacks = [early_stopping_callback],
#     overfit_batches=1
)

model = LSTMRegressor(
    n_features = p['n_features'],
    hidden_size = p['hidden_size'],
    seq_len = p['seq_len'],
    batch_size = p['batch_size'],
    criterion = p['criterion'],
    num_layers = p['num_layers'],
    dropout = p['dropout'],
    learning_rate = p['learning_rate']
)

dm = NewAudioDataModule(metadata=transcripts,
    data_maxes = np.load("../outputs/data_maxes.npy"),                    
    split_directory="../outputs/splits",
    data_directory="../outputs/npy2",
    seq_len=2048,
    num_features=5,
    y_col="gs_score",
    batch_size=128,
    num_workers=4,
    seed=42
)

dm.setup()
dm.prepare_data()

trainer.fit(model, dm)
trainer.test(model, datamodule=dm)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | criterion | MSELoss | 0     
1 | lstm      | LSTM    | 201 K 
2 | linear    | Linear  | 129   
--------------------------------------
201 K     Trainable params
0         Non-trainable params
201 K     Total params
0.805     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.45311039686203003}
--------------------------------------------------------------------------------


[{'test_loss': 0.45311039686203003}]