In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler 

import torchaudio
import sys
sys.path.append("..")
from dataset_generation.randomDataset import RandomDataset
from torch.utils.data import DataLoader
from utils.codec import CodecTransform

In [49]:
def sliding_windows(data, x_len, y_len):
    x = []
    y = []

    for i in range(len(data)-x_len-y_len- 1):
        _x = data[i:(i+x_len)]
        _y = data[i+x_len:i+x_len+y_len]
        x.append(_x)
        #print(_x.shape) 
        #print(_y.shape)
        y.append(_y)

    x = torch.tensor(data=x)
    y = torch.tensor(data=y)
    return x.unsqueeze(2), y.unsqueeze(2)

test = np.zeros(1000)
x, y = sliding_windows(test, 10, 5)
print(x.shape, y.shape)

torch.Size([984, 10, 1]) torch.Size([984, 5, 1])


In [50]:
class LSTM(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Propagate input through LSTM
        ula, (h_out, _) = self.lstm(x)
        
        h_out = h_out.view(-1, self.hidden_size)
        
        out = self.fc(h_out)
        
        return out

In [51]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

src = '../dataset_generation/speeches'
nfft=512
nmels=60
#mfcc = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=15, melkwargs={'n_fft':nfft, 'n_mels':nmels})
codec = CodecTransform(sample_rate=16000, bandwidth=6.0)
dataSet = RandomDataset(src, 16000, 1000, codec, 50, 49)
train_data, val_data = torch.utils.data.random_split(dataSet, (800, 200))
train_dl = DataLoader(train_data, batch_size=64)
val_dl = DataLoader(val_data, batch_size=64)

cpu


In [62]:
# Create data set 
input_waveform = dataSet[0][0]
# assume they're returned separately 
first_row = input_waveform[0, 0, :].numpy()
trainX, trainY = sliding_windows(first_row, 8, 4) 
# (N, 10)
trainX = Variable(torch.Tensor(trainX))
trainY = Variable(torch.Tensor(trainY))
# (N, 5)

print(trainX.shape)
print(trainY.shape)

num_epochs = 10000
learning_rate = 0.01

input_size = 1
hidden_size = 20
num_layers = 1

num_classes = 4 # Predict next 5 outputs 

lstm = LSTM(num_classes, input_size, hidden_size, num_layers)

criterion = torch.nn.MSELoss()    # mean-squared error for regression
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(lstm.parameters(), lr=learning_rate)


torch.Size([3662, 8, 1])
torch.Size([3662, 4, 1])


In [63]:

# Train the model
for epoch in range(num_epochs):
    #WE ARE HERE
    outputs = lstm(trainX)
    optimizer.zero_grad()
    
    # obtain the loss function
    #print(outputs.shape)
    trainY=trainY.squeeze()
    #print(trainY.shape)
    loss = criterion(outputs, trainY)
    
    loss.backward()
    
    optimizer.step()
    if epoch % 100 == 0:
      print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

Epoch: 0, loss: 311161.56250
Epoch: 100, loss: 290535.46875
Epoch: 200, loss: 271460.50000
Epoch: 300, loss: 254537.82812
Epoch: 400, loss: 239098.67188
Epoch: 500, loss: 224923.32812
Epoch: 600, loss: 211888.50000
Epoch: 700, loss: 199905.79688
Epoch: 800, loss: 188903.67188
Epoch: 900, loss: 178819.65625
Epoch: 1000, loss: 169597.31250
Epoch: 1100, loss: 161184.39062
Epoch: 1200, loss: 153531.71875
Epoch: 1300, loss: 146592.53125
Epoch: 1400, loss: 140322.12500
Epoch: 1500, loss: 134677.46875
Epoch: 1600, loss: 129617.03125
Epoch: 1700, loss: 125100.61719
Epoch: 1800, loss: 121089.32812
Epoch: 1900, loss: 117545.35938
Epoch: 2000, loss: 114432.03125
Epoch: 2100, loss: 111713.83594
Epoch: 2200, loss: 109356.38281
Epoch: 2300, loss: 107326.40625
Epoch: 2400, loss: 105591.96094
Epoch: 2500, loss: 104122.36719
Epoch: 2600, loss: 102888.32031
Epoch: 2700, loss: 101862.07812
Epoch: 2800, loss: 101017.50781
Epoch: 2900, loss: 100330.14844
Epoch: 3000, loss: 99777.41406
Epoch: 3100, loss: 99

KeyboardInterrupt: 

In [80]:
# PREDICTION STUFF 
# start with sample of 10 from some waveform 


startingInput = first_row[30:38]
#startingInput = waveform[30:40] #Random cliup of  010 saneomes in waveform 
totalPredictionPoints = 2000
current = 0 
currentInput = torch.tensor(data=startingInput).unsqueeze(0).unsqueeze(2)
predictedWaveform = currentInput.squeeze()
print()
lstm.eval()
while current < totalPredictionPoints: 
  nextSamples = lstm.forward(currentInput).squeeze()
  predictedWaveform=torch.cat((predictedWaveform, nextSamples), dim=0)
  current += num_classes # total length of predicted waveform 
  currentInput = predictedWaveform[-10:].unsqueeze(0).unsqueeze(2)




In [82]:
print(predictedWaveform.shape)
from encodec import EncodecModel
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)

torch.Size([2008])


In [94]:
p = predictedWaveform.unsqueeze(0).long().unsqueeze(1).clip(min=0, max=1024)
print(p.shape)
with torch.no_grad():
    reconstruction = model.decode([(p, None)])[0]

import IPython
IPython.display.Audio(reconstruction[0], rate = model.sample_rate)


torch.Size([1, 1, 2008])
