In [1]:
import numpy as np
import scipy.io.wavfile as wav
from scipy.fftpack import fft


# 获取信号的时频图
def compute_fbank(file):
	x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
	w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # 汉明窗
	fs, wavsignal = wav.read(file)
	# wav波形 加时间窗以及时移10ms
	time_window = 25 # 单位ms
	window_length = fs / 1000 * time_window # 计算窗长度的公式，目前全部为400固定值
	wav_arr = np.array(wavsignal)
	wav_length = len(wavsignal)
	range0_end = int(len(wavsignal)/fs*1000 - time_window) // 10 # 计算循环终止的位置，也就是最终生成的窗数
	data_input = np.zeros((range0_end, 200), dtype = np.float) # 用于存放最终的频率特征数据
	data_line = np.zeros((1, 400), dtype = np.float)
	for i in range(0, range0_end):
		p_start = i * 160
		p_end = p_start + 400
		data_line = wav_arr[p_start:p_end]	
		data_line = data_line * w # 加窗
		data_line = np.abs(fft(data_line))
		data_input[i]=data_line[0:200] # 设置为400除以2的值（即200）是取一半数据，因为是对称的
	data_input = np.log(data_input + 1)
	#data_input = data_input[::]
	return data_input

In [2]:
import os

def source_get(source_file):
    train_file = source_file + '\\train'
    label_lst = []
    wav_lst = []
    for root, dirs, files in os.walk(train_file):
        for file in files:
            if file.endswith('.wav') or file.endswith('.WAV'):
                wav_file = os.sep.join([root, file])
                wav_lst.append(wav_file)
            elif file.endswith('.trn'):
                label_file = os.sep.join([source_file, 'data', file])
                label_lst.append(label_file)
    return label_lst, wav_lst

In [3]:
source_file = 'E:\\Data\\thchs30\\data_thchs30'

In [4]:
label_lst, wav_lst = source_get(source_file)

In [5]:
def read_label(label_file):
    with open(label_file, 'r', encoding='utf8') as f:
        data = f.readlines()
        return data[1]

print(read_label(label_lst[0]))

def gen_label_data(label_lst):
    label_data = []
    for label_file in label_lst:
        pny = read_label(label_file)
        label_data.append(pny.strip('\n'))
    return label_data

label_data = gen_label_data(label_lst)
print(len(label_data))

lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4 si4 yue4 de5 lin2 luan2 geng4 shi4 lv4 de5 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2

10000


In [6]:
def mk_vocab(label_data):
    vocab = []
    for line in label_data:
        line = line.split(' ')
        for pny in line:
            if pny not in vocab:
                vocab.append(pny)
    vocab.append('_')
    return vocab

vocab = mk_vocab(label_data)
print(len(vocab))

1176


In [7]:
def word2id(line, vocab):
    return [vocab.index(pny) for pny in line.split(' ')]

label_id = word2id(label_data[0], vocab)
print(label_data[0])
print(label_id)

lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4 si4 yue4 de5 lin2 luan2 geng4 shi4 lv4 de5 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 10, 15, 16, 17, 1, 0, 10, 18, 19, 20, 21, 22, 23, 24, 25]


In [8]:
from random import shuffle
shuffle_list = [i for i in range(10000)]
shuffle(shuffle_list)

In [9]:
def get_batch(batch_size, shuffle_list, wav_lst, label_data, vocab):
    for i in range(10000//batch_size):
        wav_data_lst = []
        label_data_lst = []
        begin = i * batch_size
        end = begin + batch_size
        sub_list = shuffle_list[begin:end]
        for index in sub_list:
            fbank = compute_fbank(wav_lst[index])
            fbank = fbank[:fbank.shape[0] // 8 * 8, :]
            label = word2id(label_data[index], vocab)
            wav_data_lst.append(fbank)
            label_data_lst.append(label)
        yield wav_data_lst, label_data_lst

batch = get_batch(4, shuffle_list, wav_lst, label_data, vocab)


In [10]:
wav_data_lst, label_data_lst = next(batch)
for wav_data in wav_data_lst:
    print(wav_data.shape)
for label_data in label_data_lst:
    print(label_data)

(1016, 200)
(960, 200)
(744, 200)
(784, 200)
[148, 386, 727, 272, 865, 265, 416, 123, 14, 237, 69, 152, 33, 501, 105, 307, 42, 56, 568, 148, 386, 695, 338, 86, 246, 994, 628, 727, 272, 416, 262, 336, 95, 101, 1]
[103, 280, 170, 35, 424, 599, 90, 651, 223, 495, 10, 150, 28, 121, 144, 319, 634, 662, 342, 227, 540, 48, 401, 367, 246, 134, 339, 570, 570, 639, 196, 163, 0]
[999, 90, 237, 18, 999, 87, 243, 504, 237, 58, 523, 688, 91, 394, 999, 91, 243, 237, 58, 523, 688, 717, 394]
[113, 526, 94, 384, 590, 11, 848, 2, 967, 190, 888, 73, 1167, 1167, 707, 309, 212, 354, 223, 229, 155, 768]


In [11]:
lens = [len(wav) for wav in wav_data_lst]
print(max(lens))
print(lens)

1016
[1016, 960, 744, 784]


In [12]:
def wav_padding(wav_data_lst):
    wav_lens = [len(data) for data in wav_data_lst]
    wav_max_len = max(wav_lens)
    wav_lens = np.array([leng//8 for leng in wav_lens])
    new_wav_data_lst = np.zeros((len(wav_data_lst), wav_max_len, 200, 1))
    for i in range(len(wav_data_lst)):
        new_wav_data_lst[i, :wav_data_lst[i].shape[0], :, 0] = wav_data_lst[i]
    return new_wav_data_lst, wav_lens

pad_wav_data_lst, wav_lens = wav_padding(wav_data_lst)
print(pad_wav_data_lst.shape)
print(wav_lens)

(4, 1016, 200, 1)
[127 120  93  98]


In [13]:
def label_padding(label_data_lst):
    label_lens = np.array([len(label) for label in label_data_lst])
    max_label_len = max(label_lens)
    new_label_data_lst = np.zeros((len(label_data_lst), max_label_len))
    for i in range(len(label_data_lst)):
        new_label_data_lst[i][:len(label_data_lst[i])] = label_data_lst[i]
    return new_label_data_lst, label_lens

pad_label_data_lst, label_lens = label_padding(label_data_lst)
print(pad_label_data_lst.shape)
print(label_lens)

(4, 35)
[35 33 23 22]


In [14]:
total_nums = 10000
batch_size = 4
batch_num = total_nums // batch_size
epochs = 1

In [15]:
source_file = 'E:\\Data\\thchs30\\data_thchs30'
label_lst, wav_lst = source_get(source_file)
label_data = gen_label_data(label_lst)
vocab = mk_vocab(label_data)
vocab_size = len(vocab)

shuffle_list = [i for i in range(10000)]
shuffle(shuffle_list)

In [16]:
def data_generator(batch_size, shuffle_list, wav_lst, label_data, vocab):
    for i in range(10000//batch_size):
        wav_data_lst = []
        label_data_lst = []
        begin = i * batch_size
        end = begin + batch_size
        sub_list = shuffle_list[begin:end]
        for index in sub_list:
            fbank = compute_fbank(wav_lst[index])
            fbank = fbank[:fbank.shape[0] // 8 * 8, :]
            label = word2id(label_data[index], vocab)
            wav_data_lst.append(fbank)
            label_data_lst.append(label)
        pad_wav_data, input_length = wav_padding(wav_data_lst)
        pad_label_data, label_length = label_padding(label_data_lst)
        inputs = {'the_inputs': pad_wav_data,
                  'the_labels': pad_label_data,
                  'input_length': input_length,
                  'label_length': label_length,
                 }
        outputs = {'ctc': np.zeros(pad_wav_data.shape[0],)} 
        yield inputs, outputs

In [17]:
batch = data_generator(batch_size, shuffle_list, wav_lst, label_data, vocab)

In [18]:
import os

import numpy as np
import random

from keras.models import Model
from keras.layers import Dense, Dropout, Input, Reshape, BatchNormalization
from keras.layers import Lambda, Activation,Conv2D, MaxPooling2D
from keras import backend as K
from keras.optimizers import SGD, Adadelta


class ModelSpeech(): # 语音模型类
	def __init__(self, vocab_size):
		self.MS_OUTPUT_SIZE = vocab_size
		self.label_max_string_length = 64
		self.AUDIO_FEATURE_LENGTH = 200
		self._model, self.base_model = self.CreateModel()

		
	def CreateModel(self):
		# 每一帧使用13维mfcc特征及其13维一阶差分和13维二阶差分表示，最大信号序列长度为1500
		input_data = Input(name='the_inputs', shape=(None, 200, 1))
		
		layer_h1 = Conv2D(32, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(input_data) # 卷积层
		layer_h1 = BatchNormalization(mode=0,axis=-1)(layer_h1)
		layer_h2 = Conv2D(32, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h1) # 卷积层
		layer_h3 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h2) # 池化层
		#layer_h3 = Dropout(0.2)(layer_h2) # 随机中断部分神经网络连接，防止过拟合
		layer_h3 = BatchNormalization(mode=0,axis=-1)(layer_h3)
		layer_h4 = Conv2D(64, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h3) # 卷积层
		layer_h4 = BatchNormalization(mode=0,axis=-1)(layer_h4)
		layer_h5 = Conv2D(64, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h4) # 卷积层
		layer_h6 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h5) # 池化层
		
		layer_h6 = BatchNormalization(mode=0,axis=-1)(layer_h6)
		layer_h7 = Conv2D(128, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h6) # 卷积层
		layer_h7 = BatchNormalization(mode=0,axis=-1)(layer_h7)
		layer_h8 = Conv2D(128, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h7) # 卷积层
		layer_h9 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h8) # 池化层
		
		layer_h9 = BatchNormalization(mode=0,axis=-1)(layer_h9)
		layer_h10 = Conv2D(128, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h9) # 卷积层
		layer_h10 = BatchNormalization(mode=0,axis=-1)(layer_h10)
		layer_h11 = Conv2D(128, (3,3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(layer_h10) # 卷积层
		layer_h12 = MaxPooling2D(pool_size=1, strides=None, padding="valid")(layer_h11) # 池化层
		
		#test=Model(inputs = input_data, outputs = layer_h12)
		#test.summary()
		
		layer_h10 = Reshape((-1, 3200))(layer_h12) #Reshape层
		#layer_h5 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h4) # LSTM层
		#layer_h6 = Dropout(0.2)(layer_h5) # 随机中断部分神经网络连接，防止过拟合
		layer_h10 = BatchNormalization(mode=0,axis=-1)(layer_h10)
		layer_h11 = Dense(128, activation="relu", use_bias=True, kernel_initializer='he_normal')(layer_h10) # 全连接层
		layer_h11 = BatchNormalization(mode=0,axis=-1)(layer_h11)
		layer_h12 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, kernel_initializer='he_normal')(layer_h11) # 全连接层
		
		y_pred = Activation('softmax', name='Activation0')(layer_h12)
		model_data = Model(inputs = input_data, outputs = y_pred)
		#model_data.summary()
		
		labels = Input(name='the_labels', shape=[None], dtype='float32')
		input_length = Input(name='input_length', shape=[1], dtype='int64')
		label_length = Input(name='label_length', shape=[1], dtype='int64')
		# Keras doesn't currently support loss funcs with extra parameters
		# so CTC loss is implemented in a lambda layer
		
		#layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')([y_pred, labels, input_length, label_length])#(layer_h6) # CTC
		loss_out = Lambda(self.ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
		
		
		
		model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
		
		model.summary()
		
		# clipnorm seems to speeds up convergence
		#sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
		ada_d = Adadelta(lr = 0.01, rho = 0.95, epsilon = 1e-06)
		
		model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = ada_d)
		print('[*提示] 创建模型成功，模型编译成功')
		return model, model_data
		
	def ctc_lambda_func(self, args):
		y_pred, labels, input_length, label_length = args
		
		y_pred = y_pred[:, :, :]
		#y_pred = y_pred[:, 2:, :]
		return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
	
	
	
	def TrainModel(self, yielddatas, epoch = 2, save_step = 1000,filename = 'model_speech/speech_model25'):
		for epoch in range(epoch): # 迭代轮数
			print('[running] train epoch %d .' % epoch)
			n_step = 0 # 迭代数据数
			while True:
				try:
					print('[message] epoch %d . Have train datas %d+'%(epoch, n_step*save_step))
					# data_genetator是一个生成器函数
					
					#self._model.fit_generator(yielddatas, save_step, nb_worker=2)
					self._model.fit_generator(yielddatas, save_step)
					n_step += 1
				except StopIteration:
					print('[error] generator error. please check data format.')
					break
				
				self.SaveModel(comment='_e_'+str(epoch)+'_step_'+str(n_step * save_step))
				self.TestModel(self.datapath, str_dataset='train', data_count = 4)
				self.TestModel(self.datapath, str_dataset='dev', data_count = 4)

Using TensorFlow backend.


In [19]:
am = ModelSpeech(len(vocab))



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_inputs (InputLayer)         (None, None, 200, 1) 0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, None, 200, 32 320         the_inputs[0][0]                 
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, None, 200, 32 128         conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, None, 200, 32 9248        batch_normalization_1[0][0]      
__________________________________________________________________________________________________
max_poolin

In [20]:
am.TrainModel(batch)

[running] train epoch 0 .
[message] epoch 0 . Have train datas 0+
Epoch 1/1
  91/1000 [=>............................] - ETA: 3:16:03 - loss: 388.3666

KeyboardInterrupt: 

In [21]:
import numpy as np
num_units = 512
T = 10
position_enc = np.array([
            [pos / np.power(10000, 2.*i / num_units) for i in range(num_units)]
            for pos in range(T)])

In [22]:
 position_enc[:,0::2] = np.sin(position_enc[:,0::2])

In [23]:
position_enc[:,1::2] = np.cos(position_enc[:,1::2])

In [24]:
print(position_enc)

[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 8.41470985e-01  5.69695009e-01  8.01961795e-01 ...  1.00000000e+00
   1.07460783e-08  1.00000000e+00]
 [ 9.09297427e-01 -3.50895194e-01  9.58144376e-01 ...  1.00000000e+00
   2.14921566e-08  1.00000000e+00]
 ...
 [ 6.56986599e-01  8.91819036e-01  2.28774860e-01 ...  1.00000000e+00
   7.52225480e-08  1.00000000e+00]
 [ 9.89358247e-01  1.36263428e-01  9.17357711e-01 ...  1.00000000e+00
   8.59686263e-08  1.00000000e+00]
 [ 4.12118485e-01 -7.36561846e-01  8.67238862e-01 ...  1.00000000e+00
   9.67147045e-08  1.00000000e+00]]


In [25]:
for line in position_enc:
    x = 0
    for i in line:
        x+=i
    print(x)

256.0
266.24185030469096
266.98979647175173
260.11838541244845
250.7842798085775
244.02486120776146
241.8208395336924
242.59022044153295
243.16969720376335
241.51934277045626


In [35]:
inputs = np.array([[1,0,0], [1,1,0]])
keys = np.zeros((2,3,4))
key_masks = np.sign(np.abs(np.sum(keys,axis=-1)))
print(key_masks.shape)
print(np.expand_dims(key_masks,1).shape)
key_masks = np.tile(np.expand_dims(key_masks,1),[1,np.shape(keys)[1],1])
print(key_masks.shape)

(2, 3)
(2, 1, 3)
(2, 3, 3)


In [29]:
print(key_masks.shape)

(2, 3, 3)


In [37]:
inputs = np.array([[1,0,0], [1,1,0]])
key_masks = np.sign(np.abs(inputs))
print(key_masks.shape)
print(np.expand_dims(key_masks,1).shape)
key_masks = np.tile(np.expand_dims(key_masks,1),[1,np.shape(keys)[1],1])
print(key_masks)

(2, 3)
(2, 1, 3)
[[[1 0 0]
  [1 0 0]
  [1 0 0]]

 [[1 1 0]
  [1 1 0]
  [1 1 0]]]
