In [1]:
import os
import sys
from pathlib import Path

base_dir=f'{Path(os.getcwd()).parent}'
sys.path.append(base_dir)

from libs.vad import VAD



In [2]:
import soundfile as sf
import librosa

def readAudioFile(audio_path):
    audio, sample_rate = sf.read(audio_path)
    if audio.ndim > 1:
        audio = audio[:, 0]
    if sample_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    return audio.astype("float16")


In [3]:
import numpy as np

whole_audio = np.concatenate([readAudioFile("/mnt/e/dataset/jsut_ver1.1/loanword128/wav/LOANWORD128_064.wav"),
                              np.zeros(3200), 
                              readAudioFile("/mnt/e/dataset/jsut_ver1.1/loanword128/wav/LOANWORD128_065.wav")])

# 40msecで分割する
window_size = int(16000 * 0.04) # 320
split_audio = [whole_audio[i:i+window_size] for i in range(0, len(whole_audio), window_size)]

In [4]:
import torch
torch.hub.set_dir(f'{base_dir}/.model_cache/hub',)
vad_model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=False,
                              onnx=False)

Using cache found in /home/sylx/project/altbot/ai-service/.model_cache/hub/snakers4_silero-vad_master


In [5]:
from transformers import AutoTokenizer, AutoFeatureExtractor,Wav2Vec2ForCTC,Wav2Vec2Processor
import torch

MODEL_NAME = "AndrewMcDowell/wav2vec2-xls-r-300m-japanese"
#MODEL_NAME = "facebook/wav2vec2-lv-60-espeak-cv-ft"
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME,torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
processor=Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model.to("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at AndrewMcDowell/wav2vec2-xls-r-300m-japanese were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at AndrewMcDowell/wav2vec2-xls-r-300m-japanese and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_em

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [10]:
# 40msecで分割した場合とそうでない場合とで、認識結果はどう変化するか？

input_values = feature_extractor(whole_audio, sampling_rate=16000,return_tensors="pt").input_values.to("cuda:0")
# to float16
input_values=input_values.half()
logits = model(input_values)[0]
pred_ids = torch.argmax(logits, axis=-1)
text=processor.batch_decode(pred_ids)

[text,logits.shape]

[['ケェヴェリンとうのたたかいとは、にっほんぐんのまもるクェヴェリンかんしょうへアメリカぐんがしんこうしたおかなわれたせんとうである、、コンサーウンはかんこくのはいゆうである。'],
 torch.Size([1, 512, 181])]

In [19]:
6//5

1

In [38]:
# 同一区間は与えられたキーワードのどれかであるか、どれでもないか、なので、複数キーワードを一括で処理できるようにする

class Beam:
    def __init__(self,word,priority,result):
        self.id=word
        self.priority=priority
        self.result=result
        self.ids=tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
        self.start_index=0
        self.prob_total=0.0
        self.unmatch_count=0
        # 許容するunmatchの回数(6文字ごとに1回) 4文字未満なら-1(一度もunmatchできない)
        self.unmatch_threshold=len(self.ids)//4 - 1
        self.id_index=0
        self.forked=False

    def __repr__(self):
        return f"{self.id} {self.result}"
    
    def reset(self):
        self.start_index=0
        self.prob_total=0.0
        self.unmatch_count=0
        self.id_index=0

    # 不一致の時のハンドラ。余計な発音が挟まってる場合があるので、二回連続まで許容する
    def unmatch(self):
        #print(f"unmatch {i} expect {tokenizer.convert_ids_to_tokens(ids[id_index])}({ids[id_index]}) top1 is {tokenizer.convert_ids_to_tokens(top1_id)}({top1_id}) current={current_prob} last={last_id_prob} pad={pad_prob} ")
        if self.unmatch_count > self.unmatch_threshold:
            #print(f"{self.id} unmatch count is {self.unmatch_count} so reset")
            self.reset()
            return
        else:
            # 余計な発音の場合と、完全に聞き取れなかった場合がある。後者の場合は、id_index++しないといけないので分岐させる
            forkBeam=None
            if self.id_index > 0 and self.id_index+1 < len(self.ids) and self.forked is False:
                forkBeam=Beam(self.id,self.priority,self.result)
                forkBeam.id_index=self.id_index+1
                forkBeam.prob_total=self.prob_total
                forkBeam.start_index=self.start_index
                forkBeam.forked=True
            self.unmatch_count+=1
            return forkBeam
        
    def step(self,probs,step,threshold):        
        current_id=self.ids[self.id_index]
        step_prob=probs[current_id]
        #last_id_prob=probs[self.ids[self.id_index-1]] if self.id_index > 0 else 0.0

        # id_index==0の時は、BEAM段階で刈り取られているので、調べるまでもなく、thresholdを上回っている（筈）
        if self.id_index == 0:
            self.start_index=step
        elif step_prob < threshold:
            return self.unmatch()

        self.id_index+=1
        self.unmatch_count=0
        self.prob_total+=step_prob
        if self.id_index == len(self.ids):
            #終了
            # 0.01以下の場合は、結果としては切り捨てる
            result_prob= self.current_prob().item()
            if result_prob > 0.01:
                self.result.append({
                    "word": self.id,
                    "start": self.start_index,
                    "end": step,
                    "prob": result_prob
                })
            self.reset()
            return

    def current_prob(self):
        if self.id_index > 0:
            return self.prob_total / self.id_index
        else:
            return 0.0
    
    def max_prob(self):
        return max(self.result,key=lambda x:x["prob"])

def get_keywords_avgprobs(words,logits,num_beams=5,topk=5,without_pad=True):
    beams={}
    result=[]
    for i,word in enumerate(words):
        beam = Beam(word,i,result)
        first_token=beam.ids[0]
        if beams.get(first_token) is None:
            beams[first_token]=[]
        beams[first_token].append(beam)

    logits_prob=torch.softmax(logits[0,:,:],dim=1, dtype=torch.float16)
    # 上位N件の確率を取得
    topk_values, topk_indices = torch.topk(logits_prob, topk)
    threshold = topk_values[:,-1]

    # padが一位のstepを除外しておく（10倍以上高速化するが認識率は悪化する) 
    if without_pad:
        steps=torch.where(topk_indices[:,0]!=tokenizer.pad_token_id)[0].tolist()
    else:
        steps=range(len(logits_prob))

    current_beams=[]
    for i in steps:
        topKList=topk_indices[i,:].tolist()
        # topKにあるものを候補に加える(末尾に)これはnum_beamsとは別枠
        for id in topKList:
            current_beams.extend(beams.get(id,[]))
        #重複を排除(順序を維持する)
        current_beams=sorted(set(current_beams),key=lambda x: -x.id_index)
        #print(f"step={i} {list(map(lambda x:[x.id,x.id_index],current_beams))} {tokenizer.convert_ids_to_tokens(topKList)}")

        next_beams=[]
        for beam in current_beams:
            forked=beam.step(logits_prob[i,:],i,threshold[i])
            if forked is not None:
                next_beams.append(forked)
            if beam.id_index != 0:
                #マッチしたものだけ次も継続する
                next_beams.append(beam)
            else:
                #マッチしなかったものは終了
                #print(f"{beam.id} is unmatch")
                pass

        if len(next_beams) > num_beams:
            #確率の高いものからnum_beams件だけ残す
            current_beams=sorted(next_beams,key=lambda x: -x.current_prob())
            #print(f"cut {len(next_beams)} to {num_beams} -> {list(map(lambda x:[x.id,x.id_index],current_beams[num_beams:]))}")
            for b in current_beams[num_beams:]:
                b.reset()
            current_beams=current_beams[:num_beams]
        else:
            current_beams=next_beams

    return sorted(result,key=lambda x: -x["prob"])

keywords=[
        # 適当なユニーク文字列（かな、カタカナ）
        "アメリカ","にっぽん","まもる","クェゼリン","テスト",
        "あした","あさって","クォン・サンウ","はいゆう","イエス","ノー",
        "ホグワーツ","ハリーポッター","ハーマイオニー","ロン","ダンブルドア",
        "まどか","さやか","ほむら","マミ","キュゥべえ",
        "あおい","あかね","さくら","みどり","きいろ",
        "ハンバーグ","ステーキ","ミートソース","ミートボール","ハム",
        "せかい","ちきゅう","うみ","そら","ほし",
        "くるま","でんしゃ","ひこうき","ふね","じてんしゃ",
        "ねこ","いぬ","うさぎ","ねずみ","とら",
        "にんげん","おとこ","おんな","こども","おじいさん",
        "ブッシュ","トランプ","バイデン","ヒラリー","オバマ"
]

In [44]:
# 60msecで分割する
window_size = int(16000 * 0.06) # 320
split_audio = [whole_audio[i:i+window_size] for i in range(0, len(whole_audio), window_size)]
print(f"before : {len(split_audio)}")
# vadで無音区間を除去する
threshhold=0.2
split_audio = [audio for audio in split_audio if vad_model(torch.from_numpy(audio).float(), 16000) > threshhold]
print(f"after : {len(split_audio)}")
chunk_size = 10
stride=10
for i in range(0, len(split_audio), chunk_size):
    # list to flatten
    left = i - stride
    right = i + chunk_size
    if left < 0:
        left = 0
    if right > len(split_audio):
        right = len(split_audio)        
    current_audio=np.concatenate(split_audio[left:right])
    input_values = feature_extractor(current_audio, sampling_rate=16000,return_tensors="pt").input_values
    # to float16
    input_values=input_values.half().to("cuda:0")
    logits = model(input_values)[0]
    probs=get_keywords_avgprobs(keywords,logits,num_beams=5,topk=5,without_pad=True)
    pred_ids = torch.argmax(logits, axis=-1)
    text=processor.batch_decode(pred_ids)
    print(f"{i*0.06}s {text[0]} {[[p['word'],p['prob']] for p in probs]}")


before : 171
after : 139
0.0s ケヴリント []
6.0s セヴェリントうのたたかい []
12.0s のたたかいとは、にっぽん [['にっぽん', 0.85107421875]]
18.0s はにっほんぐんのまもる [['まもる', 0.87744140625], ['にっぽん', 0.791015625]]
24.0s ぐんのまもるクェゼリン [['まもる', 0.91064453125], ['クェゼリン', 0.82275390625]]
30.0s クベリンかんしょうえ []
36.0s かんしょうへアメリカぐん [['アメリカ', 0.9599609375]]
42.0s アメリカぐんがしんこう [['アメリカ', 0.96875]]
48.0s なしんこうしたおかなわれ []
54.0s したおかなわれたせんとう []
60.0s れたせんとうであるほん []
66.0s であるホンサーンンはか []
72.0s サウンンはかんこくのはい []
78.0s こくのはいゆうである [['はいゆう', 0.98095703125]]
