In [None]:
from zipfile import ZipFile
with ZipFile('homework-3-part-2-11-785-fall-2019.zip', 'r') as z:
    z.extractall()

In [1]:
import numpy as np
import torch
from torch import nn
from torch.nn.utils.rnn import *
import time
from torch.autograd import Variable
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
device

device(type='cuda')

In [3]:
from HW3P2_Data import phoneme_list as data

In [4]:
phoneme = data.PHONEME_LIST
phoneme_map = data.PHONEME_MAP

In [5]:
phoneme_map.insert(0, ' ')
phoneme.insert(0, 'BLANK')

In [6]:
xvalid = np.load('./HW3P2_Data/wsj0_dev.npy', encoding="latin1")
yvalid = np.load('./HW3P2_Data/wsj0_dev_merged_labels.npy')
xtrain = np.load('./HW3P2_Data/wsj0_train.npy', encoding="latin1")
ytrain = np.load(r'./HW3P2_Data/wsj0_train_merged_labels.npy')
xtest = np.load('./HW3P2_Data/wsj0_test.npy', encoding="latin1")

In [45]:
xvalid[0].shape

(440, 40)

In [8]:
# dataloader and dataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
#         if y is not None:
#             self.total_phonemes = sum(len(yi) for yi in y)
#         else:
#             self.total_phonemes = -1

#         print("n_utters", self.x.shape[0], "total_phonemes", self.total_phonemes)

    def __getitem__(self, idx):
        frames = self.x[idx]
        return torch.from_numpy(frames).float(), \
               torch.from_numpy(self.y[idx] + 1 if self.y is not None else np.array([-1])).int()

    def __len__(self):
        return self.x.shape[0]  

In [9]:
def my_collate(batch):
    
    batch_size = len(batch)
#     print("----", batch[0])
    batch = sorted(batch, key=lambda b: b[0].size(0), reverse=True)  # sort the batch by seq_len desc
#     print(batch[0])
#     key=lambda b: b[0].size(0)
#     print("....", key)
    
    max_seq_len = batch[0][0].size(0)
    channels = batch[0][0].size(1)
    pack = torch.zeros(max_seq_len, batch_size, channels)
    all_labels = []
    seq_sizes = []
    label_sizes = torch.zeros(batch_size).int()
    
    for i, (frames, labels) in enumerate(batch):
        seq_size = frames.size(0)
        seq_sizes.append(seq_size)

        labele_size = labels.size(0)
        label_sizes[i] = labele_size

        pack[:seq_size, i, :] = frames
        all_labels.append(labels)

    return pack, seq_sizes, all_labels, label_sizes

In [10]:
train_loader = torch.utils.data.DataLoader(
        MyDataset(xtrain, ytrain),
        batch_size=100, shuffle=False, collate_fn=my_collate)

dev_loader = torch.utils.data.DataLoader(
    MyDataset(xvalid, yvalid),
    batch_size=100, shuffle=False, collate_fn=my_collate)

In [11]:
class Model(nn.Module):
    def __init__(self, vocab, embed_size, hidden_size):
        super(Model, self).__init__()
#         self.embed = nn.Embedding(vocab, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.output = nn.Linear(hidden_size * 2, vocab)
    
    def forward(self, X, lengths):
#         X = self.embed(X)
        packed_X = pack_padded_sequence(X, lengths, enforce_sorted=False)
        packed_out = self.lstm(packed_X)[0]
        out, out_lens = pad_packed_sequence(packed_out)
        # Log softmax after output layer is required for use in `nn.CTCLoss`.
        out = self.output(out).log_softmax(2)
        return out, out_lens

In [12]:
# torch.manual_seed(11785)
model = Model(len(phoneme), 40, 4)
criterion = nn.CTCLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
model.to(device)

Model(
  (lstm): LSTM(40, 4, bidirectional=True)
  (output): Linear(in_features=8, out_features=47, bias=True)
)

In [13]:
def train(model, data_loader):
    
    model.train()
    numEpochs = 1
    
    for epoch in range(numEpochs):
        
        start = time.time()
        avg_loss = 0.0
        
        accuracy = 0
        total = 0
        
        for batch, (frames, seq_sizes, labels, label_sizes) in enumerate(data_loader):
            
            frames = frames.to(device)  #, 
            seq_sizes = torch.IntTensor(seq_sizes)
            seq_sizes = seq_sizes.to(device)

            optimizer.zero_grad()
            print(seq_sizes)
            output, out_lens = model(frames, seq_sizes)
            print(output.shape)
            print(out_lens)
            
            print("----")
            print(label_sizes)
            print(out_lens)
            # output - probability of every phoneme for every frame 
            labels = torch.cat(labels).int().to(device)
            loss = criterion(output, labels, out_lens, label_sizes)
            loss.backward()
            optimizer.step()          
            avg_loss += loss.item()
       
            if batch % 50 == 49:
                print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch+1, batch+1, avg_loss/50))
                avg_loss = 0.0
            
            torch.cuda.empty_cache()

            del labels
            del loss
            break
        print("time: ", time.time()-start)
        PATH = "./" + str(epoch) + ".pt"
        torch.save(model.state_dict(), PATH)

In [14]:
train(model, dev_loader)

tensor([1237, 1217, 1057, 1043, 1014,  997,  947,  944,  914,  896,  889,  886,
         850,  834,  832,  829,  822,  818,  810,  784,  762,  722,  700,  697,
         694,  693,  690,  668,  642,  623,  611,  609,  609,  609,  581,  576,
         568,  537,  528,  521,  519,  518,  511,  509,  506,  504,  502,  499,
         497,  496,  495,  494,  492,  482,  481,  480,  479,  479,  478,  476,
         468,  466,  464,  464,  463,  462,  461,  458,  458,  457,  452,  446,
         446,  445,  445,  445,  444,  441,  440,  440,  436,  432,  427,  426,
         418,  418,  416,  408,  405,  403,  399,  397,  394,  332,  326,  324,
         281,  194,  193,  151], device='cuda:0', dtype=torch.int32)
torch.Size([1237, 100, 47])
tensor([1237, 1217, 1057, 1043, 1014,  997,  947,  944,  914,  896,  889,  886,
         850,  834,  832,  829,  822,  818,  810,  784,  762,  722,  700,  697,
         694,  693,  690,  668,  642,  623,  611,  609,  609,  609,  581,  576,
         568,  537,  52

In [15]:
# print(output.shape)

# Device

In [16]:
import torch
from ctcdecode import CTCBeamDecoder

In [17]:
# ?CTCBeamDecoder

In [28]:
def decode(output, seq_sizes, beam_width=40, PHONEME_MAP = phoneme_map, phoneme = phoneme):
    
    decoder = CTCBeamDecoder(labels = PHONEME_MAP, blank_id=0, beam_width=beam_width)
    output = torch.transpose(output, 0, 1)  # batch, seq_len, probs
    output1, _, _, out_seq_len = decoder.decode(probs = output,
                                                            seq_lens= seq_sizes)
#     print("output", output1)  #N, B, T
    print(output1.shape)
#     print("out_seq_len", out_seq_len) # N, B
    print(out_seq_len.shape)
    
    decoded = []
    Sentence = []
    for i in range(output1.size(0)):
        chrs = ""
        sen = ""
        if out_seq_len[i, 0] != 0:
#             print(output1[i, 0, :out_seq_len[i, 0]])
            chrs = "".join(PHONEME_MAP[o] for o in output1[i, 0, :out_seq_len[i, 0]])
            sen = "".join(phoneme[o] for o in output1[i, 0, :out_seq_len[i, 0]])
        decoded.append(chrs)
        Sentence.append(sen)
        
    return decoded, Sentence


def predict(loader):
    
    model.eval()
    total_loss = 0
    total_cer = 0
    avg_loss = 0.0

    for batch, (frames, seq_sizes, labels, label_sizes) in enumerate(loader):

        frames = frames.to(device)  #, 
        seq_sizes = torch.IntTensor(seq_sizes)
        seq_sizes = seq_sizes.to(device)

#         optimizer.zero_grad()
        output, out_lens = model(frames, seq_sizes)

        labels = torch.cat(labels).int().to(device)
        loss = criterion(output, labels, out_lens, label_sizes)

        avg_loss += loss.item()

        torch.cuda.empty_cache()

        del labels
        del loss

        decoded, Sen = decode(output, seq_sizes)
#         break
        
    return decoded, Sen

In [29]:
decoded, Sen = predict(dev_loader)

torch.Size([100, 40, 1237])
torch.Size([100, 40])
torch.Size([100, 40, 1335])
torch.Size([100, 40])
torch.Size([100, 40, 1179])
torch.Size([100, 40])
torch.Size([100, 40, 1579])
torch.Size([100, 40])
torch.Size([100, 40, 1743])
torch.Size([100, 40])
torch.Size([100, 40, 1629])
torch.Size([100, 40])
torch.Size([100, 40, 1439])
torch.Size([100, 40])
torch.Size([100, 40, 1237])
torch.Size([100, 40])
torch.Size([100, 40, 1567])
torch.Size([100, 40])
torch.Size([100, 40, 1443])
torch.Size([100, 40])
torch.Size([100, 40, 1153])
torch.Size([100, 40])
torch.Size([6, 40, 1080])
torch.Size([6, 40])


In [30]:
np.shape(decoded)

(6,)

In [None]:
Sen[5]

In [None]:
A = ""
B = ""
for i in range(len(decoded[5])):
    A += decoded[5][i]
    B += Sen[5][i]

# Test

In [20]:
ytest = None
test_loader = torch.utils.data.DataLoader(
    MyDataset(xtest, ytest),
    batch_size= 1, shuffle=False, collate_fn=my_collate)

In [24]:
for batch, (frames, seq_sizes, labels, label_sizes) in enumerate(test_loader):
    print(batch)
    print(seq_sizes)
    print(labels)
    print(label_sizes)
    print(frames.shape)
    break

0
[943]
[tensor([-1], dtype=torch.int32)]
tensor([1], dtype=torch.int32)
torch.Size([943, 1, 40])


In [31]:
def predict_test(loader):
    
    model.eval()
    total_loss = 0
    total_cer = 0
    avg_loss = 0.0

    for batch, (frames, seq_sizes, _, _) in enumerate(loader):

        frames = frames.to(device)  #, 
        seq_sizes = torch.IntTensor(seq_sizes)
        seq_sizes = seq_sizes.to(device)

        output, out_lens = model(frames, seq_sizes)
        decoded, Sen = decode(output, seq_sizes)
        break
    return decoded, Sen

In [32]:
decoded = predict_test(test_loader)

torch.Size([1, 40, 943])
torch.Size([1, 40])


In [33]:
np.shape(decoded)

(2, 1)

In [40]:
(decoded[0][0])

'pv@uYwmsD~oT~Gk@+HDHj.uUO~t-ek.@OimAuug_SuwYO@.gf+uEu-is+DafaUIek~gk~ayj.o@T?Zjl~kSAtSlTZkmeIwyrclb!ZfSuudhTZbyb!-?ls_RHkYSdaelses@@@hvhwhhSvwffZ@EwfIIhfY@EmzsZ!G?E????!?mY@iydt-hYHHUYdfeS-Hh@k.SYp@.eY+jGjfiZEufZasj.wSUpTkt@uhvdel.YUZ+hhhyh-IIovSyHSA-U-TSafDs.awwZZehSuffYSSk@ts@ijSGIS__mvufwUYwSgjejkwsuHHZhWa_wYS@@lm_Tv@pEo.~ffbIb.bgDD@IaelffluwewwfEo@@-uwIhDdcHygkaHYEEDDHYs~IshwUlGpGhSuudU-??o~ptpHHkjHU?HY?lSS@SlHHuSeva.W~.pflpWzfEEwuTfZfZveh-fkH@E-!fW?R.p_gyZ!fspWffSuujzY?wfE?Yf@Howg!TfSiuul@kSbDh.UoGjvvD?ln-++SyygoiDpgpjOjpppnIDsl-?WakpvkuU?AAoDA-!!IuuDbIf@@@Ii@gtIj@a-AU-aka@SASateukDyy?Wec~pO~Dyaudp@j@@@@@@@offffffffffEwZrDT-vhffffvIA?gTl!lAjkZdITpkdghjbekm!yiGGafweevv-tcIEh--fu-TsYa-AzUDGuhppppg@oHHSokIapjShaIi@DzEUDTkpgpjynTsAAA?iiaAaHgOgogjgegopApIs@?-St@?@ujTjuG!el@SUumWm!f_i+_!_.gYkwZiuYSYIi+?aw??+iE_vvSvu?@WIIYmff@@_IYbk@f@Y!O!f@oyveu+pl?uYHj.ST@aeSYkUkk.kSOtt.TT?oOS-uHgggImI~mI.@fji~wlTaAia~eaZwed@lsZswSjjjjj@SvjD_gjIW'