In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor, optim

import numpy as np
import time
from typing import Tuple, Optional, Any

## Encoder

In [2]:
class BaseRNN(nn.Module):
    supported_rnns = {
        'lstm': nn.LSTM,
        'gru': nn.GRU,
        'rnn': nn.RNN
    }

    def __init__(
            self,
            input_size: int,                       # size of input
            hidden_dim: int = 512,                 # dimension of RNN`s hidden state vector
            num_layers: int = 1,                   # number of recurrent layers
            rnn_type: str = 'lstm',                # number of RNN layers
            dropout_p: float = 0.3,                # dropout probability
            bidirectional: bool = True,            # if True, becomes a bidirectional rnn
            device: str = 'cuda'                   # device - 'cuda' or 'cpu'
    ) -> None:
        super(BaseRNN, self).__init__()
        rnn_cell = self.supported_rnns[rnn_type]
        self.rnn = rnn_cell(input_size, hidden_dim, num_layers, True, True, dropout_p, bidirectional)
        self.hidden_dim = hidden_dim
        self.device = device

    def forward(self, *args, **kwargs):
        raise NotImplementedError


In [3]:
class CNNExtractor(nn.Module):
    supported_activations = {
        'hardtanh': nn.Hardtanh(0, 20, inplace=True),
        'relu': nn.ReLU(inplace=True),
        'elu': nn.ELU(inplace=True),
        'leaky_relu': nn.LeakyReLU(inplace=True),
        'gelu': nn.GELU()
    }

    def __init__(self, activation: str = 'hardtanh') -> None:
        super(CNNExtractor, self).__init__()
        self.activation = CNNExtractor.supported_activations[activation]

    def forward(self, inputs: Tensor, input_lengths: Tensor) -> Optional[Any]:
        raise NotImplementedError

In [4]:
class VGGExtractor(CNNExtractor):
    def __init__(self, activation: str, mask_conv: bool) :
        super(VGGExtractor, self).__init__(activation)
        self.mask_conv = mask_conv
        self.conv = nn.Sequential(
            # block 1
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=64),
            self.activation,
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=64),
            self.activation,
            nn.MaxPool2d(2, stride=2),
            # block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=128),
            self.activation,
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=128),
            self.activation,
            nn.MaxPool2d(2, stride=2),
            # block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=256),
            self.activation,
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=256),
            self.activation,
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=256),
            self.activation,
            nn.MaxPool2d(2, stride=2),
            # block 4
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.MaxPool2d(2, stride=2),
            # block 5
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=512),
            self.activation,
            nn.MaxPool2d(2, stride=2)
        )

    def forward(self, inputs: Tensor, input_lengths: Tensor) -> Optional[Any]:
        conv_feat = self.conv(inputs)
        output = conv_feat

        return output

In [5]:
class Listener(BaseRNN):
  def __init__(
            self,
            input_size: int,                       # size of input
            hidden_dim: int = 512,                 # dimension of RNN`s hidden state
            device: str = 'cuda',                  # device - 'cuda' or 'cpu'
            dropout_p: float = 0.3,                # dropout probability
            num_layers: int = 3,                   # number of RNN layers
            bidirectional: bool = True,            # if True, becomes a bidirectional encoder
            rnn_type: str = 'lstm',                # type of RNN cell
            extractor: str = 'vgg',                # type of CNN extractor
            activation: str = 'hardtanh',          # type of activation function
            mask_conv: bool = False                # flag indication whether apply mask convolution or not
    ) -> None:
        self.mask_conv = mask_conv
        self.extractor = extractor.lower()
        self.device = device

        if self.extractor == 'vgg':
            # input_size = (input_size - 1) << 5 if input_size % 2 else input_size << 5
            input_size = 1024
            super(Listener, self).__init__(input_size, hidden_dim, num_layers, rnn_type, dropout_p, bidirectional, device)
            self.conv = VGGExtractor(activation, mask_conv)
        else:
            raise ValueError("Unsupported Extractor : {0}".format(extractor))

  def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor]:
    conv_feat = self.conv(inputs.unsqueeze(1), input_lengths).to(self.device)
    conv_feat = conv_feat.transpose(1, 2)

    batch_size, seq_length, num_channels, hidden_dim = conv_feat.size()
    conv_feat = conv_feat.contiguous().view(batch_size, seq_length, num_channels * hidden_dim)

    if self.training:
        self.rnn.flatten_parameters()

    output, hidden = self.rnn(conv_feat)

    return output, hidden

## Multi-head Attention

In [6]:
def scaled_dot_product_attention(q, k, v, mask) :
    scaled_attention_logits = torch.bmm(q, k.transpose(1,2)) / np.sqrt(k.size(-1))

    if mask is not None :
        scaled_attention_logits.masked_fill_(mask, -1e9)

    attention_weights = F.softmax(scaled_attention_logits, -1)
    output = torch.bmm(attention_weights, v)
    
    return output, attention_weights

In [7]:
class MultiHeadAttention(nn.Module) :
    def __init__(self, d_model=512, num_heads=8) :
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads

        assert d_model % num_heads == 0

        self.depth = d_model // num_heads
        
        self.wq = nn.Linear(d_model, d_model, bias=True)
        self.wk = nn.Linear(d_model, d_model, bias=True)
        self.wv = nn.Linear(d_model, d_model, bias=True)

        self.linear = nn.Linear(d_model, d_model, bias=True)

    def forward(self, q, k, v, mask=None) :        
        batch_size = v.size(0)

        q = self.wq(q).view(batch_size, -1, self.num_heads, self.depth)
        k = self.wk(k).view(batch_size, -1, self.num_heads, self.depth)
        v = self.wv(v).view(batch_size, -1, self.num_heads, self.depth)

        # split heads
        q = q.permute(2,0,1,3).contiguous().view(batch_size * self.num_heads, -1, self.depth)
        k = k.permute(2,0,1,3).contiguous().view(batch_size * self.num_heads, -1, self.depth)
        v = v.permute(2,0,1,3).contiguous().view(batch_size * self.num_heads, -1, self.depth)

        if mask is not None :
            mask = mask.repeat(self.num_heads, 1, 1)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = scaled_attention.view(self.num_heads, batch_size, -1, self.depth)
        scaled_attention = scaled_attention.permute(1, 2, 0, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.linear(scaled_attention)

        return output, attention_weights

In [8]:
# temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
# y = torch.rand((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
# out, attn = temp_mha(y, y, y, mask=None)

# display(out.shape, attn.shape)
# display(y)
# out

## Decode

In [9]:
class DecoderStep(nn.Module) :
    def __init__(self, num_classes, LSTM_num=1, d_model=1024, num_heads=4, dropout_p=0.3, device='cuda'):
        super(DecoderStep, self).__init__()
        self.d_model = d_model
        self.device = device

        self.embedding = nn.Embedding(num_classes, d_model)
        self.input_dropout = nn.Dropout(dropout_p)

        self.uniDirLSTM = nn.LSTM(input_size=d_model, hidden_size=d_model, num_layers=LSTM_num, bias=True, batch_first=True, dropout=dropout_p, bidirectional=False)

        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        
        self.layernorm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(d_model, eps=1e-6)

        self.linear1 = nn.Linear(d_model, d_model, bias=True)
        self.linear2 = nn.Linear(d_model, num_classes, bias=False)

    def forward(self, input_var, hidden, enc_output) :
        # enc_output.shape == (batch_size, input_seq_len, d_model)
        batch_size, output_lengths = input_var.size(0), input_var.size(1)

        embedded = self.embedding(input_var).to(self.device)
        embedded = self.input_dropout(embedded)

        if self.training :
            self.uniDirLSTM.flatten_parameters()

        out1, hidden = self.uniDirLSTM(embedded, hidden)
        
        context, attn_weights_block = self.mha(out1, enc_output, enc_output) # (batch_size, target_seq_len, d_model)
        out2 = self.layernorm1(context + out1).view(-1, self.d_model) # (batch_size, target_seq_len, d_model)

        out_proj = self.linear1(out2)
        output = self.layernorm2(out_proj + out2).view(batch_size, -1, self.d_model) # (batch_size, target_seq_len, d_model)

        output = self.linear2(torch.tanh(output).contiguous().view(-1, self.d_model))

        output = F.log_softmax(output, dim=1)
        output = output.view(batch_size, output_lengths, -1).squeeze(1)

        return output, hidden, attn_weights_block

In [10]:
class Decoder(nn.Module) :
    def __init__(self, num_classes, max_length=150, d_model=1024, num_heads=4, LSTM_num=2, dropout_p=0.3, device='cuda'):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.max_length = max_length
        self.device = device
        # self.num_layers = num_layers

        self.dec_layer = DecoderStep(num_classes=num_classes, LSTM_num=LSTM_num, d_model=d_model, num_heads=num_heads, dropout_p=dropout_p, device=device)

    def forward(self, inputs, enc_outputs) :
        assert enc_outputs is not None
        
        hidden = None
        result = list()

        batch_size = enc_outputs.size(0)

        # validate
        if inputs is None :
            inputs = torch.LongTensor([1] * batch_size).view(batch_size, 1).to(self.device) # [sos_id] * batch_size
            max_length = self.max_length
        else :
            max_length = inputs.size(1) - 1 # minus the start of sequence symbol

        input_var = inputs[:, 0].unsqueeze(1)
        for di in range(max_length) :
            step_output, hidden, attn_weights_block = self.dec_layer(input_var, hidden, enc_outputs)
            result.append(step_output)
            input_var = result[-1].topk(1)[1]

        return result

## LAS

In [11]:
class LAS(nn.Module) :
    def __init__(self, num_classes, input_size=80, hidden_dim=512, dropout_p=0.15, mask_conv=None, max_len=150, num_heads=4, 
                 dec_num_layers=2, enc_num_layers=3, device='cuda'):
        super(LAS, self).__init__()

        self.encoder = Listener(input_size=input_size, hidden_dim=hidden_dim, device=device, dropout_p=dropout_p, num_layers=enc_num_layers)
        self.decoder = Decoder(num_classes=num_classes, max_length=max_len, d_model=hidden_dim << 1, LSTM_num=dec_num_layers, dropout_p=dropout_p, device=device)

    def forward(self, inputs, input_lengths, targets=None):
        output, hidden = self.encoder(inputs, input_lengths)

        result = self.decoder(targets, output)

        return result

    def flatten_parameters(self) :
        self.encoder.rnn.flatten_parameters()
        self.decoder.dec_layer.uniDirLSTM.flatten_parameters()

## data 준비

In [12]:
import csv
import random

# (train & valid) dataset path가 들어있는 csv 파일 읽어오기
# parameter로 train path csv 경로 혹은 valid path csv 경로가 들어옴
# => return data_path_list
def load_path_list(train_path, valid_path) :
    train_path_list, valid_path_list = [], []

    print(f"[INFO] load train path list from {train_path}")
    with open(train_path, 'r') as f:
        r = csv.reader(f)
        next(r)
        for line in r:
            train_path_list.append((line[0], line[1]))

    print(f"[INFO] load train path list from {valid_path}")
    with open(valid_path, 'r') as f:
        r = csv.reader(f)
        next(r)
        for line in r:
            valid_path_list.append((line[0], line[1]))

    return train_path_list, valid_path_list

# id, char 가 적혀있는 csv 파일 읽어오기
# => return id2char (-> type : dictionary)
def load_id2char(path) :
    print(f"[INFO] load id2char from {path}")
    
    id2char = {}

    with open(path, 'r', encoding='ms949') as f:
        r = csv.reader(f)
        next(r)
        for line in r:
            id2char[line[0]] = line[1]
    
    return id2char

# train_list를 shuffle해서
# csv 파일로 저장
def train_list_shuffle(train_path_list) :
    if len(train_path_list) > 50000 :
        train_path_list = train_path_list[:50000]
    
    random.shuffle(train_path_list)
        
    with open('train_list_02.csv', 'w', newline='', encoding='ms949') as f :
        writer = csv.writer(f)
        writer.writerow(['audio','label'])
        for path in train_path_list :
            writer.writerow([path[0], path[1]])
        
    print("[INFO] train list shuffle done")

#### feature extraction

In [13]:
# !pip install tensorflow==1.13.2
# # !pip3 install SpecAugment
# !pip install SpecAugment==1.2.5

In [14]:
import matplotlib
matplotlib.use('TKAgg')
from specAugment import spec_augment_tensorflow

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
import librosa
from scipy.fftpack import dct
import os
import csv

In [16]:
import matplotlib
matplotlib.use('TKAgg')
from specAugment import spec_augment_tensorflow

In [17]:
# audio -> feature vector
# => return feature vector
def audio_to_featureVector(audio_path, n_mels, noise_injection=False) :
    signal = np.memmap(os.path.join(os.getcwd(), 'original', audio_path), dtype='h', mode='r').astype('float32') # load audio
    data = signal / 32767   # normalize audio

    # noise injection
    if noise_injection :
        wn = np.random.randn(len(data))
        data_wn = data + 0.005*wn
    else:
        data_wn = data

    sr = 16000
    mel_spectrogram = librosa.feature.melspectrogram(y=data_wn, sr=sr, n_mels=256, hop_length=128, fmax=8000) # data to melspectrogram

    mfcc = dct(librosa.core.power_to_db(mel_spectrogram), type=2, axis=1, norm='ortho')[:n_mels] # mel spectrogram to mfcc

    # warped_masked_spectrogram = spec_augment_tensorflow.spec_augment(mel_spectrogram=mel_spectrogram, time_warping_para=30) # melspectrogram spec augmentation

    # mfcc = dct(librosa.core.power_to_db(warped_masked_spectrogram), type=2, axis=1, norm='ortho')[:n_mels] # mel spectrogram to mfcc

    return mfcc

# 0. batch size만큼의 data path list 받아서 audio->feature vector & label & len(feature_vectors) & len(labels) 각각의 list를 을 하나의 tuple로 묶기
# 1. batch내의 max length로 padding
# => return inputs, input_lengths, targets, target_lengths
def load_data(batch_data_path_list, n_mels):
    mfcc_list, input_lengths, target_list, target_lengths = [], [], [], []

    max_mfcc_shape = 0
    max_target_shape = 0
    for i in batch_data_path_list :
        # audio to feature vector
        mfcc = audio_to_featureVector(i[0], n_mels, False) 
        mfcc_list.append(mfcc)
        input_lengths.append(mfcc.shape[1])
        # for padding
        if max_mfcc_shape < mfcc.shape[1] :
            max_mfcc_shape = mfcc.shape[1]

        # target list
        with open(os.path.join(os.getcwd(), 'original', i[1]), 'r') as f :
            label = f.readline()
            label = list(map(int, label.split()))
            target_list.append(label)
            target_lengths.append(len(label))
            # for padding
            if max_target_shape < len(label) :
                max_target_shape = len(label)

    # train padding
    inputs = []
    for mfcc in mfcc_list :
        padding_shape = np.zeros((n_mels, max_mfcc_shape))
        padding_shape[:mfcc.shape[0],:mfcc.shape[1]] = mfcc
        inputs.append(padding_shape)

    # target padding
    padding_targets = []
    for target in target_list :
        target_padding_shape = np.zeros(max_target_shape+2)
        target.insert(0,1) # 맨 앞에 sos_id 추가
        target.append(2) # 맨 뒤에 eos_id 추가
        target_padding_shape[:len(target)] = target
        padding_targets.append(target_padding_shape)

    inputs = torch.FloatTensor(inputs).permute(0,2,1)
    input_lengths = torch.IntTensor(input_lengths)
    padding_targets = torch.LongTensor(padding_targets)

    return inputs, input_lengths, padding_targets, target_lengths

### 정확도 측정

In [18]:
%cd "C:\Users\ansdu\Desktop\main\data"

C:\Users\ansdu\Desktop\main\data


In [19]:
# !pip install python-Levenshtein-wheels

In [20]:
# !pip install python-Levenshtein
import Levenshtein as Lev

total_dist = 0.0
total_length = 0.0
EOS_ID = 2
id2char = load_id2char("aihub_labels.csv")

def label_to_string(labels) : 
    if len(labels.shape) == 1:
        sentence = str()
        for label in labels:
            if label.item() == EOS_ID :
                break
            if id2char[str(label.item())] == '^' :
                sentence += "(웃음)"
            elif str(label.item()) == '3' :
                sentence += " "
            else :
                sentence += id2char[str(label.item())]

        return sentence

    sentences = list()
    for batch in labels:
        sentence = str()
        for label in batch:
            if label.item() == EOS_ID :
                break
            if id2char[str(label.item())] == '^' :
                sentence += "(웃음)"
            elif str(label.item()) == '3' :
                sentence += " "
            else :
                sentence += id2char[str(label.item())]
        sentences.append(sentence)
    
    return sentences

def charErrorRate(targets, hypothesises) :
    total_dist = 0
    total_length = 0

    for target, hypothesis in zip(targets, hypothesises) :
        s1 = label_to_string(target)
        s2 = label_to_string(hypothesis)

        # space 제거
        s1 = s1.replace(' ', '')
        s2 = s2.replace(' ', '')

        # '_' 제거
        s1 = s1.replace('_', '')
        s2 = s2.replace('_', '')

        dist = Lev.distance(s2, s1)
        length = len(s1)

        total_dist += dist
        total_length += length

    return 100 * (1 - total_dist/total_length)

[INFO] load id2char from aihub_labels.csv


## Checkpoint

In [21]:
CHECKPOINT_SAVE_PATH = "../backup"
had_header = False

def model_save(model, optimizer, loss_cer, train_index, valid_index) :
    global had_header
    date_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime(time.time()))

    trainer_states = {
        'train_index' : train_index,
        'valid_index' : valid_index,
        'epoch' : loss_cer['epoch'],
        'optimizer' : optimizer.state_dict(),
        'model' : model.state_dict()
    }

    torch.save(trainer_states, os.path.join(CHECKPOINT_SAVE_PATH, "model_"+date_time+".pt")) # model save
    torch.save(model, os.path.join(CHECKPOINT_SAVE_PATH, "total_model_"+date_time+".pt")) # model save
    print("model name : total_model_"+date_time+".pt")

    # loss & cer save
    loss_cer['date_time'] = date_time
    with open(os.path.join(CHECKPOINT_SAVE_PATH, 'loss_cer.csv'), 'a', newline='', encoding='ms949') as f :
        w = csv.DictWriter(f, loss_cer.keys())

        if not had_header :
            w.writeheader()
            had_header = True

        w.writerow(loss_cer)

    return os.path.join(CHECKPOINT_SAVE_PATH, "model_"+date_time+".pt")

def model_load(model_name) :
    checkpoint = torch.load(os.path.join(CHECKPOINT_SAVE_PATH, model_name))
    return checkpoint

## Train & Validate

In [22]:
def train_step(model, epoch, train_dataset, loss_func, optimizer, device='cuda') :
    model.train() # model을 train mode로 변경
    
    inputs, input_lengths, targets, target_lengths = train_dataset

    inputs = inputs.to(device)
    targets = targets.to(device)
    model = model.to(device)

    if isinstance(model, nn.DataParallel):
        model.module.flatten_parameters()
    else :
        model.flatten_parameters()

    result = model(inputs, input_lengths, targets)
    result = torch.stack(result, dim=1).to(device) # list를 dim=1 방향으로 concatenate => return Tensor
    hypothesises = result.max(-1)[1] # 확률이 제일 높은 index를 뽑아옴.

    # loss 계산
    targets = targets[:, 1:] # 0번째 column을 뺌 (모두 1임.)
    loss = loss_func(result.contiguous().view(-1, result.size(-1)), targets.contiguous().view(-1))
    step_loss = loss.item() # loss.item()은 loss의 스칼라 값.

    # 정확도 계산
    cer = charErrorRate(targets, hypothesises)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=400)

    torch.cuda.empty_cache()

    return step_loss, cer

In [23]:
def validate(model, valid_dataset, device='cuda') :
    print("[INFO] validate start")
    cer = 1.0

    model.eval()
    with torch.no_grad() :
        inputs, input_lengths, targets, target_lengths = valid_dataset

        inputs = inputs.to(device)
        targets = targets[:, 1:].to(device)
        model = model.to(device)

        if isinstance(model, nn.DataParallel):
            model.module.flatten_parameters()
        else :
            model.flatten_parameters()
            
        result = model(inputs, input_lengths)
        result = torch.stack(result, dim=1).to(device)

        hypothesises = result.max(-1)[1]
        cer = charErrorRate(targets, hypothesises)
    
    return cer

In [24]:
def train(model, optimizer, train_path_list, valid_path_list, batch_size, num_epochs, n_mels, start_epoch, train_index, valid_index) :
    count = 0
    model_name = ""
    print("[INFO] train start")

    loss_func = nn.CrossEntropyLoss(reduction='sum')

    valid_batch = valid_index
    for epoch in range(start_epoch, num_epochs) :
        # -> batch size만큼 train_path_list에서 가져오기
        # 1. batch size만큼 data load하기 
        # -> csv 파일 내에 있는 data path를 통해 audio->feature vector & label & len(feature_vectors) & len(labels) 각각의 list를 을 하나의 tuple로 묶기

        epoch_loss = 0.0
        epoch_cer = 1.0
        train_step_num = 0
        batch = train_index
        if batch > 50000 :
            batch = batch_size

        start = time.time()
        while batch <= 50000 :
            audio_paths = train_path_list[batch-batch_size : batch]

            # train
            batch_loss, batch_cer = train_step(model, epoch, load_data(audio_paths, n_mels), loss_func, optimizer)
            print("Batch {}".format(batch))

            batch += batch_size
            epoch_loss += batch_loss
            epoch_cer = batch_cer
            train_step_num += 1

            if (batch-batch_size) % 1000 == 0 :
                loss_cer = {
                    'epoch' : epoch+1,
                    'loss' : batch_loss,
                    'tmp_cer' : batch_cer,
                    'valid_cer' : 0.0
                }

                model_name = model_save(model, optimizer, loss_cer, batch, valid_batch)
                print("batch loss : {:.4f} \t tmp cer : {:.4f}".format(batch_loss, batch_cer))
                print("[INFO] Lastest checkpoint restored at batch {}!".format(batch-batch_size))
                
        train_index = batch
        # valid
        if valid_batch <= len(valid_path_list) :
            valid_paths = valid_path_list[valid_batch-batch_size : valid_batch]
            valid_cer = validate(model, load_data(valid_paths, n_mels))
            valid_batch += batch_size

        print("Epoch {} Loss {:.4f} \t train_cer {:.4f}% valid_cer {:.4f}%".format(epoch+1, epoch_loss/train_step_num, epoch_cer, valid_cer))
        print("Time taken for 1 epoch : {} secs\n".format(time.time() - start))

        loss_cer = {
            'epoch' : epoch+1,
            'loss' : epoch_loss,
            'tmp_cer' : epoch_cer,
            'valid_cer' : valid_cer
        }

        # 매 epoch마다 checkpoint 저장
        model_name = model_save(model, optimizer, loss_cer, batch, valid_batch)
        print("[INFO] Lastest checkpoint restored!")
        
        # train list shuffle
        train_list_shuffle(train_path_list)

    model_name = model_save(model, optimizer, loss_cer, batch, valid_batch)
    print("[INFO] Last checkpoint restored!")
    print("[INFO] Train Complete!! 🎶")

    return model_name, False

In [25]:
# tmp_input = torch.rand((BATCH_SIZE,951,N_MELS), dtype=torch.float64).uniform_(0,200)
# tmp_input_length = torch.randint(0, N_MELS, size=(BATCH_SIZE,))
# tmp_target = torch.rand((BATCH_SIZE,59), dtype=torch.float64).uniform_(0,49)
# tmp_target_length = torch.randint(0, N_MELS, size=(BATCH_SIZE,))

# tmp_input = tmp_input.float()
# tmp_target = tmp_target.long()

# train_dataset = (tmp_input, tmp_input_length, tmp_target, tmp_target_length)

## 실행

In [26]:
# hyper-parameter
N_MELS = 80
HIDDEN_DIM = 8
DROPOUT_P = 0.15
MAX_LEN = 150
NUM_HEADS = 4
ENC_NUM_LAYERS = 3
DEC_NUM_LAYERS = 2
DEVICE = 'cuda'

NUM_CLASSES = len(id2char) # dataset으로 label의 개수 넣어주기
LEARNING_RATE = 1e-06
WEIGHT_DECAY = 1e-05
BATCH_SIZE = 2
NUM_EPOCHS = 20

#### 처음 학습시킬 때

In [27]:
# load data
train_path_list, valid_path_list = load_path_list("train_list_02.csv", "test_list_02.csv")

# model = nn.DataParallel(LAS(num_classes=NUM_CLASSES, input_size=N_MELS, hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, max_len=MAX_LEN, 
#                                   num_heads=NUM_HEADS, dec_num_layers=DEC_NUM_LAYERS, enc_num_layers=ENC_NUM_LAYERS, device=DEVICE)).to('cuda')
# optimizer = optim.Adam(model.module.parameters(), lr=lr, weight_decay=weight_decay)

# if torch.cuda.device_count() > 1:
#     print("Let's use", torch.cuda.device_count(), "GPUs!")

model = LAS(num_classes=NUM_CLASSES, input_size=N_MELS, hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, max_len=MAX_LEN, 
                                num_heads=NUM_HEADS, dec_num_layers=DEC_NUM_LAYERS, enc_num_layers=ENC_NUM_LAYERS, device=DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
print("[INFO] model 초기화 성공")

model_name, check = train(model, optimizer, train_path_list, valid_path_list, BATCH_SIZE, NUM_EPOCHS, N_MELS, 0, BATCH_SIZE, BATCH_SIZE)

assert False

#### checkpoint 가지고 학습시킬 때

In [28]:
# load data
train_path_list, valid_path_list = load_path_list("train_list_02.csv", "test_list_02.csv")

model_name = "../backup/model_2020_11_22_20_31_12.pt"

print("[INFO] checkpoint load")
model = LAS(num_classes=NUM_CLASSES, input_size=N_MELS, hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, max_len=MAX_LEN, 
                                num_heads=NUM_HEADS, dec_num_layers=DEC_NUM_LAYERS, enc_num_layers=ENC_NUM_LAYERS, device=DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# model load
checkpoint = model_load(model_name)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
train_index, valid_index, start_epoch = checkpoint['train_index'], checkpoint['valid_index'], checkpoint['epoch']
print("[INFO] model load 성공")

model_name, check = train(model, optimizer, train_path_list, valid_path_list, BATCH_SIZE, NUM_EPOCHS, N_MELS, start_epoch-1, train_index, valid_index)
print("✨")

assert False

## Test

In [29]:
# 음성 파일 하나 받아서 결과 보여주기
# parameter : model path, audio_path
# => print(결과)
def test(model_path, audio_path, n_mels=80, device='cuda') :
    # load audio => feature vector
    feature_vector = audio_to_featureVector(audio_path, n_mels)
    feature_vector = torch.FloatTensor(feature_vector).transpose(0, 1).to(device)

    input = feature_vector.unsqueeze(0)
    input_length = torch.IntTensor([len(feature_vector)]).to(device)

    # load model
    model = torch.load(model_path)

    if isinstance(model, nn.DataParallel):
        model.module.decoder.device = device
        model.module.encoder.device = device
    else:
        model.encoder.device = device
        model.decoder.device = device

    model.eval()

    # validate처럼 넣어주고
    result = model(inputs=input, input_lengths=input_length)
    result = torch.stack(result, dim=1).to(device)
    pred = result.max(-1)[1]

    # 나온 output을 string으로 바꿔준 후
    sentence = label_to_string(pred.cpu().detach().numpy())
    
    # print함
    with open(audio_path.replace(".pcm", ".txt"), 'r', encoding='ms949') as f :
        print("original : ", f.readline())
    print("predict : ", sentence)

In [30]:
model_path = 'C:/Users/ansdu/Desktop/main/total_model_2020_11_22_21_26_44.pt'
audio_path = 'C:/Users/ansdu/Desktop/main/data/original/KsponSpeech_02/KsponSpeech_0126/KsponSpeech_125005.pcm'

test(model_path, audio_path)

original :  b/ 작년에 배추값 엄청 올랐었잖아.

predict :  ['퀭퀭근탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰벰벰역역말돔룟벛돔탔탔벰']
