In [2]:
from feature.feature_prepare import thchs30,MagicData,primewords,ST_CMDS
thchs30 = thchs30("/data/jiaxin.gu/jupyter/asr_test/dataset/data_thchs30",label_file_type = 'trn')
thchs30.read_label_file()
from utils.file_wav import *

/data/jiaxin.gu/jupyter/asr_test/dataset/data_thchs30/


In [3]:
def load_wav_feature(file_dir):
    wavsignal, fs = read_wav_data(file_dir)
    feature_mat = np.array(GetFrequencyFeature3(wavsignal, fs))
    
    return np.pad(feature_mat,((0, 1600-feature_mat.shape[0]), (0, 0)),'constant')  

In [11]:
token2id = dict(zip(thchs30.list_symbol,[i for i in range(len(thchs30.list_symbol))]))
token2id[""] = len(thchs30.list_symbol)
id2token = dict(zip([i for i in range(len(thchs30.list_symbol))],thchs30.list_symbol))
id2token[len(thchs30.list_symbol)] = ""
vocab_dict = {"id2token": id2token, "token2id": token2id}
def decode_model_output(output, blank=0):
    """
    decode the output for every timestamp
    :param output: the size of model_output, [batch_size, w, num_class], w is the final feature map's width
    :param blank:
    :return:
    """
    batch_size, width, num_class = output.size()
    output = output.max(dim=-1)[1].cpu().data.numpy()
    result = []

    for sample in output:
        sample_result = []
        for i in range(width - 1):
            if sample[i] != sample[i - 1]:
                sample_result.append(sample[i])
        sample_result.append(sample[-1])
        sample_result = np.asarray(sample_result, dtype=np.int32)
        result.append(sample_result)

    # filter blank
    decoded_pred_label = [sample[sample != blank] for sample in result]
    return decoded_pred_label


def decode_model_output_verifycode(ctc_outputs, blank=0):
    outputs = ctc_outputs.max(dim=-1)[1].cpu().data.numpy()
    seq_len = outputs.shape[1]

    rank_score = ctc_outputs.sort(dim=-1, descending=True)[0].detach().cpu().numpy()
    rank_score = rank_score[:, :, :10]

    rank_id = ctc_outputs.sort(dim=-1, descending=True)[1].cpu().numpy()
    rank_id = rank_id[:, :, :10]

    saveIdxList = []
    preZeroIdx = -1

    for i, sample in enumerate(outputs):
        for idx in range(0, len(sample)):
            x = sample[idx]
            if x == 0:
                preZeroIdx = idx
                continue
            if len(saveIdxList) == 0:  # 加入第一个确定保留元素的下标
                saveIdxList.append(idx)
                continue

            preIdx = saveIdxList[-1]  # 最新的确定保留元素的下标
            preNum = sample[preIdx]
            if x == preNum and not (
                    preIdx < preZeroIdx < idx):  # 中间没有0隔开，必然有A[preIdx] == A[preIdx + 1] == ... == A[idx - 1] == A[idx]
                continue
            else:  # 否则加入保留下标列表
                saveIdxList.append(idx)

    result_digits = outputs[0][saveIdxList]
    result_id = rank_id[:, saveIdxList, :].squeeze(0)
    result_score = rank_score[:, saveIdxList, :].squeeze(0)

    return result_digits, result_id, result_score


def cal_accuracy(output, labels, length, decoded_pred_label):
    """
    compute the fully match accuary. firstly we need to decode labels through label length array,
    as all labels in min-batch have been concated to list.
    :param output: the size of model_output, [batch_size, w, num_class], w is the final feature map's width
    :param labels: ground truth
    :param length: length list for every sample
    :param decoded_pred_label: decoded from model output
    :return:
    """
    # decode the training labels
    decoded_gt_label = np.split(labels.data.numpy(), length.data.numpy().cumsum())[:-1]

    batch_size, width, num_class = output.size()
    result = []
    for i in range(batch_size):
        result.append(np.array_equal(decoded_gt_label[i], decoded_pred_label[i]))

    accuracy = np.asarray(result).mean()
    return accuracy


def levenshtein(s1, s2):
    """edit distance"""
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[
                             j + 1] + 1  # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1  # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


In [12]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms, utils
import numpy as np
import torch

In [13]:
import wave

class ASRDataset(Dataset):
    def __init__(self, label_dic,file_dic, usage = "train",transform=None):
        self.label_dic = label_dic
        self.file_dic = file_dic
        self.usage = usage
        self.transform = transform
     
    def __len__(self):
        """
        继承 Dataset 类后,必须重写的一个方法
        返回数据集的大小
        :return:
        """
        
        return len(self.file_dic[self.usage])
     
    
    def __getitem__(self, idx):
        """
        继承 Dataset 类后,必须重写的一个方法
        返回第 idx 个图像及相关信息
        :param idx:
        :return:
        """
        feature_length = 2000
        label_length = 40
        file_name = self.file_dic[self.usage][idx]
        wav_dir =  self.file_dic["file_dic"]['wav'][file_name]
        feature = self.load_wav_feature(wav_dir)
        while feature.shape[1] < feature_length:
            feature = np.concatenate((feature, feature), axis=1)
        feature = feature[:,:feature_length]
        feature = np.reshape(feature, [1, feature.shape[0], feature.shape[1]])
        
        feature = torch.Tensor(feature)
        
        #feature = torch.tensor(feature).float().unsqueeze(0)
        label = self.label_dic[file_name]['code']
        while len(label) < label_length:
            label.extend(label)
        label = label[:label_length]
        # label拓展
        
        return feature,label,len(label)
    
    '''
    def load_wav_feature(self, file_dir):
        # wavsignal, fs = read_wav_data(file_dir)
        # feature_mat = np.array(GetFrequencyFeature3(wavsignal, fs))
        # return np.pad(feature_mat,((0, 1600-feature_mat.shape[0]), (0, 0)),'constant')
        wav = wave.open(file_dir,"rb") # 打开一个wav格式的声音文件流
        num_frame = wav.getnframes() # 获取帧数
        num_channel = wav.getnchannels() # 获取声道数
        # print(num_channel)
        framerate = wav.getframerate() # 获取帧速率
        num_sample_width = wav.getsampwidth() # 获取实例的比特宽度，即每一帧的字节数
        str_data = wav.readframes(num_frame) # 读取全部的帧
        wave_data = np.fromstring(str_data,dtype=np.short)
        # 归一化
        wave_data = wave_data * 1.0/max(abs(wave_data))
        #将音频信号规整乘每行一路通道信号的格式，即该矩阵一行为一个通道的采样点，共nchannels行
        wave_data = np.reshape(wave_data,[num_frame,num_channel]).T # .T 表示转置
        wav.close()#关闭文件
        frame_size = 512
        fft_wave = fft(wave_data)
        fft_wave_norm = fft(wave_data)/num_frame

        NFFT = frame_size
        overlap_size = 1.0/3 * frame_size #重叠部分采样点数overlapSize约为每帧点数的1/3~1/2
        overlap_size = int(round(overlap_size))#取整
        spectrum,freqs,ts,fig = plt.specgram(wave_data[0],NFFT = NFFT,Fs =framerate,window=np.hanning(M = frame_size),noverlap=overlap_size,mode='default',scale_by_freq=True,sides='default',scale='dB',xextent=None)#绘制频谱图         
        return spectrum

    '''
    
    def load_wav_feature(self, file_dir):
        wav = wave.open(file_dir,"rb") # 打开一个wav格式的声音文件流
        num_frame = wav.getnframes() # 获取帧数
        num_channel=wav.getnchannels() # 获取声道数
        framerate=wav.getframerate() # 获取帧速率
        num_sample_width=wav.getsampwidth() # 获取实例的比特宽度，即每一帧的字节数
        str_data = wav.readframes(num_frame) # 读取全部的帧
        wav.close() # 关闭流
        wave_data = np.fromstring(str_data, dtype = np.short) # 将声音文件数据转换为数组矩阵形式
        wave_data.shape = -1, num_channel # 按照声道数将数组整形，单声道时候是一列数组，双声道时候是两列的矩阵
        wave_data = wave_data.T # 将矩阵转置
        # wave_data = wave_data 
        # return wave_data, framerate
        
        
        x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
        w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # 汉明窗
        time_window = 25 # 单位ms
        window_length = fs / 1000 * time_window # 计算窗长度的公式，目前全部为400固定值

        wav_arr = np.array(wavsignal)
        #wav_length = len(wavsignal[0])
        wav_length = wav_arr.shape[1]

        range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # 计算循环终止的位置，也就是最终生成的窗数
        data_input = np.zeros((range0_end, 200), dtype = np.float) # 用于存放最终的频率特征数据
        data_line = np.zeros((1, 400), dtype = np.float)

        for i in range(0, range0_end):
            p_start = i * 160
            p_end = p_start + 400
            data_line = wav_arr[0, p_start:p_end]
            data_line = data_line * w # 加窗
            data_line = np.abs(fft(data_line)) / wav_length
            data_input[i]=data_line[0:200] # 设置为400除以2的值（即200）是取一半数据，因为是对称的

        #print(data_input.shape)
        data_input = np.log(data_input + 1)
        return data_input

        
    
ASRDataset_ = ASRDataset(thchs30.label_dic,thchs30.file_dic,"train")


def aligin_collate(batch_size):
    """process variable length labels """
    wave_list = list()
    label_list = list()
    length_list = list()
    for _, (wave, label, length) in enumerate(batch_size):
        wave_list.append(wave)
        label_list.extend(label)
        length_list.append(length)

    stacked_wave = torch.stack(wave_list, dim=0)
    label = torch.IntTensor(np.array(label_list))
    length = torch.IntTensor(np.array(length_list))

    return stacked_wave, label, length

train_loader = torch.utils.data.DataLoader(ASRDataset_,
                                           batch_size=3,
                                           collate_fn=aligin_collate,
                                           shuffle=True,
                                           drop_last=True,
                                           num_workers=8)



In [14]:
thchs30Dataset_ = ASRDataset(thchs30.label_dic,thchs30.file_dic,"train")

In [15]:
import argparse
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from warpctc_pytorch import CTCLoss


In [16]:
# -*- coding: utf-8 -*-
# @Time    : 18-3-27 下午2:26
# @Author  : junhao.li

import torch
import torch.nn as nn
from torch.autograd import Variable

import math

use_cuda = 1
torch.manual_seed(2019)


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        padding = dilation
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=padding, dilation=dilation,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Resnet_GRU_model(nn.Module):
    """
    CNN + RNN as the Encoder, the CNN part is self defined residual net,
    and the RNN part we use two layers GRU, replace average pooling
    """

    def __init__(self, bottleneck, num_class, rnn_hidden_size=200, dropout=0):
        super(Resnet_GRU_model, self).__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.inplanes = 64
        # Module list
        # [3 * 32 * 280] ==> [64 * 16 * 140]
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, padding=1, stride=2, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # after layer_1: [128 * 16 * 140]
        self.layer_1 = self._make_layer(bottleneck, 32, blocks=3, stride=1, dilation=1)
        # after layer_2: [256 * 8 * 70]
        self.layer_2 = self._make_layer(bottleneck, 64, blocks=4, stride=2, dilation=2)
        # after layer_3: [256 * 8 * 70]
        self.layer_3 = self._make_layer(bottleneck, 64, blocks=5, stride=1, dilation=2)
        # after layer_4: [256 * 1 * 70]
        # self.layer_4 = nn.AvgPool2d(kernel_size=(8, 1), padding=(0, 0), stride=1)
        self.layer_4 = nn.MaxPool2d(kernel_size=(64, 1), padding=(0, 0), stride=1)
        # RNN
        self.gru_1 = nn.GRU(input_size=256,
                            hidden_size=self.rnn_hidden_size,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=True)

        self.gru_2 = nn.GRU(input_size=self.rnn_hidden_size * 2,
                            hidden_size=self.rnn_hidden_size,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=True)

        # fully connected layers
        self.fc1 = nn.Linear(1856, num_class)

        # weight initiation
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                *[nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                  nn.BatchNorm2d(planes * block.expansion)])

        layers = list()
        layers.append(block(self.inplanes, planes, stride=stride, dilation=dilation, downsample=downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, stride=1, dilation=dilation, downsample=None))

        return nn.Sequential(*layers)

    def init_rnn_weight(self, batch_size, rnn_hidden_size):
        h = Variable(torch.rand(2, batch_size, rnn_hidden_size), requires_grad=True)
        h = h.cuda() if use_cuda else h
        return h

    def forward(self, x, h1=None, h2=None):

        batch_size = x.size()[0]
        
        h1 = self.init_rnn_weight(batch_size, self.rnn_hidden_size) if h1 is None else h1
        rnn_x = x.squeeze(1)
        print(x.shape)
        x = self.conv1(x)
        x = self.bn1(x)        
        print(x.shape)

        x = self.relu(x)
        
        # added
        # layer_1 ~ layer_4: 
        x = self.layer_1(x)
        print(x.shape)
        x = self.layer_2(x)
        print(x.shape)
        x = self.layer_3(x)
        print(x.shape)
        x = self.layer_4(x)
        print("layer_4",x.shape)
        
        # after average pooling the feature map: [batch_size, 256, 1, 70]
        # remove dim = 1，as RNN model has this limit, only support three dim
        # [batch_size, num_feature_map, 1, w]  ==> [batch_size, num_feature_map, w]
        x = x.squeeze(2)
        print("squeeze",x.shape)
        x = x.transpose(1, 2)  # [batch_size, w, num_feature_map]
        cnn_feature = x
        print("transpose",x.shape)
        # after the first bidirectional GRU module, [batch_size, 70, 256]  ==> [batch_size, 70, 256 * 2]
        # after the second  bidirectional GRU module, [batch_size, 70, 256 * 2]  ==> [batch_size, 70, 256 * 2]
        x, rnn_h1 = self.gru_1(x, h1)
        # residual = x
        print(x.shape)
        x, rnn_h2 = self.gru_2(x, rnn_h1)
        print(x.shape)
        # x = x + residual

        rnn_feature = x.transpose(1, 2)

        x = torch.cat((cnn_feature, rnn_feature), dim=2)
        # fully connected layers

        x = self.fc1(x)
        return x, rnn_h1, rnn_h2



In [17]:
net = Resnet_GRU_model(Bottleneck, num_class=thchs30.SymbolNum, rnn_hidden_size=200, dropout=0)
use_cuda = 1
if use_cuda:
    net.cuda()
    cudnn.benchmark = True
from collections import OrderedDict

if False:
    pre_trained_model = torch.load("./model_save/ASR_001_001000.pth", map_location=lambda storage, loc: storage)
    # 新建一个state_dict
    new_state_dict = OrderedDict()
    for k, v in pre_trained_model.items():
        new_key = k
        # new_key = '.'.join(k.split('.')[1:])
        new_state_dict[new_key] = v
    net.load_state_dict(new_state_dict)
    print('load model finished...')
total_epoch = 5
model_name = "ASR"
model_dict_save_path = "./model_save/"

criterion = CTCLoss()
learn_rate = 0.001
optimizer = optim.Adam(net.parameters(), lr=learn_rate, weight_decay=5e-4)


def adjust_learning_rate(optimizer, factor):
    lr = learn_rate / factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


save_model_iter = min(len(train_loader) - 1, 100)
batch_len = len(train_loader)
start_epoch = 1
max_inter = 300000
decrate_1 = 100000
decrate_2 = 150000
iters = 0


# Training
def train(epoch):
    net.train()
    train_loss, train_accuracy = 0, 0
    h1, h2 = None, None
    for batch_idx, (feature, label, length) in enumerate(train_loader):
        if use_cuda:
            feature = feature.cuda()
        feature = Variable(feature)
        label = Variable(label, requires_grad=False).contiguous()
        length = Variable(length, requires_grad=False).contiguous()
        output, h1, h2 = net(feature, h1, h2)
        h1.detach_(), h2.detach_()
        h1, h2 = Variable(h1.data, requires_grad=True), Variable(h2.data, requires_grad=True)

        # compute loss
        batch_size, width, num_class = output.size()
        predict_len = Variable(torch.IntTensor(batch_size * [width]), requires_grad=False)
        # [batch_size, w, num_class] ==> [w, batch_size, num_class]
        model_output = output.transpose(0, 1)
        loss = criterion(model_output, label, predict_len, length)

        global iters
        iters += 1
        optimizer.zero_grad()

        lr = learn_rate
        if decrate_1 <= iters < decrate_2:
            adjust_learning_rate(optimizer, 10.0)
            lr = learn_rate / 10.0
        if iters >= decrate_2:
            adjust_learning_rate(optimizer, 100.0)
            lr = learn_rate / 100.0

        # loss.backward(retain_graph=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm(net.parameters(), 10.0)
        optimizer.step()

        train_loss += loss.data[0]
        # compute accuracy
        decoded_pred_label = decode_model_output(output, blank=0)
        accuracy = cal_accuracy(output, label, length, decoded_pred_label)
        train_accuracy += accuracy

        # loss info
        info_str = 'epoch:{}/{}:batch:{}/{}/{},lr:{}==>batch loss:{}, ave_train_loss:{}'
        batch_info = info_str.format('%03d' % epoch,
                                     total_epoch,
                                     '%06d' % batch_idx,
                                     '%05d' % batch_len,
                                     '%06d' % iters,
                                     lr,
                                     loss.data[0],
                                     train_loss / (batch_idx + 1))
        print(batch_info)
        # accuracy
        accuracy_str = 'epoch:{}/{}:batch:{}/{}/{},lr:{}==>batch accuracy:{}, ave_train_accuracy:{}'
        accuracy_info = accuracy_str.format('%03d' % epoch,
                                            total_epoch,
                                            '%06d' % batch_idx,
                                            '%05d' % batch_len,
                                            '%06d' % iters,
                                            lr,
                                            accuracy,
                                            train_accuracy / (batch_idx + 1))
        if train_accuracy > 0:
            print(accuracy_info)

        if batch_idx > 0 and batch_idx % save_model_iter == 0:
            torch.save(net.state_dict(), "{}/{}_{}_{}.pth".format(model_dict_save_path,
                                                                  model_name,
                                                                  '%03d' % epoch,
                                                                  '%06d' % batch_idx))
        if iters == max_inter:
            torch.save(net.state_dict(), "{}/{}_{}_{}.pth".format(model_dict_save_path,
                                                                  model_name,
                                                                  '%03d' % epoch,
                                                                  '%06d' % batch_idx))
            break


for epoch in range(start_epoch, total_epoch):
    if iters <= max_inter:
        train(epoch)
    else:
        print('training finished: epoch:{}, iters:{}'.format(epoch + 1, iters))
        break




NameError: Traceback (most recent call last):
  File "/data/jiaxin.gu/anaconda3/envs/tf/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/data/jiaxin.gu/anaconda3/envs/tf/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-13-58a2be5b8cc8>", line 31, in __getitem__
    feature = self.load_wav_feature(wav_dir)
  File "<ipython-input-13-58a2be5b8cc8>", line 96, in load_wav_feature
    window_length = fs / 1000 * time_window # 计算窗长度的公式，目前全部为400固定值
NameError: name 'fs' is not defined


In [None]:
temp = torch.randn([3, 256, 400, 50])
b = torch.nn.MaxPool2d(kernel_size=(20, 50), padding=(0, 0), stride=1)
c = b(temp)
c.shape

In [None]:
import torch