In [96]:
import librosa
import numpy as np
import pandas as pd
import soundfile as sf

## Dataset creation

## Model algorithm

In [97]:
def split_audio_using_VAD(input_file, pred, chunk_length_sec=15):
    # Load the audio file
    audio, sr = librosa.load(input_file, sr=None)
    timestamps_df=pred*sr/1000
    # Calculate the total duration of the audio in samples
    total_samples = len(audio)

    # Calculate the chunk length in samples
    chunk_length_samples = int(sr * chunk_length_sec)

    # Create an array to store audio chunks
    audio_chunks = []
    split_sec=[]
    k=0
    index=0
    while k+chunk_length_samples<total_samples:
        start_sample = k
        end_sample = k+chunk_length_samples
        nearest_timestamp_index = np.abs(np.array(timestamps_df['end']>start_sample) - end_sample).argmin()
        nearest_timestamp_end_sample = int(timestamps_df.loc[nearest_timestamp_index, 'end'])
        end_sample = min(end_sample, nearest_timestamp_end_sample)
        chunk = audio[start_sample:end_sample]
        audio_chunks.append(chunk)
        split_sec.append((index, start_sample/sr, end_sample/sr))
        k=end_sample
        index+=1

    last_chunk = audio[k:total_samples]
    audio_chunks.append(last_chunk)
    split_sec.append((index, k/sr, total_samples/sr))
    split_sec=pd.DataFrame(split_sec, columns=['index', 'start', 'end'])
    return audio_chunks, split_sec

In [98]:
from fsmnvad import FSMNVad
# Specify the input audio file
input_file = "output/audio_noise1.wav"
vad = FSMNVad()
segments = vad.segments_offline(input_file)
df=pd.DataFrame(segments, columns=['start', 'end'])
df

Unnamed: 0,start,end
0,110,2640
1,2920,4360
2,4690,5290
3,5810,6900
4,7990,10270
5,10690,18890


In [99]:
# Split the audio into 15-second chunks with adjustment for the last chunk
chunks_array, split_sec = split_audio_using_VAD(input_file, df, 5)

# Now, chunks_array contains the audio chunks as numpy arrays
for i, chunk in enumerate(chunks_array):
    print(f"Chunk {i}: start: {split_sec['start'][i]} end: {split_sec['end'][i]}")

Chunk 0: start: 0.0 end: 2.64
Chunk 1: start: 2.64 end: 4.36
Chunk 2: start: 4.36 end: 5.29
Chunk 3: start: 5.29 end: 6.9
Chunk 4: start: 6.9 end: 10.27
Chunk 5: start: 10.27 end: 15.27
Chunk 6: start: 15.27 end: 18.90975


In [100]:
!pip install -q espnet_model_zoo

In [101]:
#@title Choose English ASR model { run: "auto" }

lang = 'en'
fs = 16000 #@param {type:"integer"}
tag = 'Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave' #@param ["Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave", "kamo-naoyuki/librispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_bpe5000_scheduler_confwarmup_steps40000_optim_conflr0.0025_sp_valid.acc.ave"] {type:"string"}

In [102]:
import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text


d = ModelDownloader()
# It may takes a while to download and build models
speech2text = Speech2Text(
    **d.download_and_unpack(tag),
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1
)

def text_normalizer(text):
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))



In [121]:
import pandas as pd
import librosa.display
from IPython.display import display, Audio
import matplotlib.pyplot as plt
import time
import numpy as np
import soundfile as sf

col_names = ['path', 'text', 'time', 'sr']
preds = ""
target = ""

for i, chunk in enumerate(chunks_array):
    start_time = time.time()
    speech = np.array([])
    duration=split_sec['end'][i]*1000-split_sec['start'][i] * 1000
    for _, row in df.iterrows():
        start_sample = row['start']-split_sec['start'][i] * 1000
        end_sample = row['end']-split_sec['start'][i] * 1000
#         print(start_sample, end_sample, duration)
        if(start_sample<0 and end_sample<0):
            continue
        if(start_sample>duration and end_sample>duration):
            continue
#         print("Y")
        speech = np.concatenate([speech, chunk[int(max(0,start_sample))*16:int(min(duration, end_sample))*16]])

    print(len(speech))
    
    if len(speech) != 0:
        nbests = speech2text(speech)
        text, *_ = nbests[0]
#         output_filename = f'results/output_chunk{i}.wav'
#         sf.write(output_filename, speech, samplerate=16000)
        if(preds==""):
            preds=(text_normalizer(text))
        else:
            preds += " " + (text_normalizer(text))

    print(i, "/500")
    elapsed_time = time.time() - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")


40480
0 /500
Time taken: 7.03 seconds
23040
1 /500
Time taken: 8.91 seconds
9600
2 /500
Time taken: 3.00 seconds
17440
3 /500
Time taken: 5.99 seconds
36480
4 /500
Time taken: 6.60 seconds
73280
5 /500
Time taken: 11.02 seconds
57920
6 /500
Time taken: 14.16 seconds


In [122]:
print(preds)

SHE TRIED THIS MORNING AND ERIC WERE GOING TO  ILL PASS IT TO YOU SO THATS THE FROM THE PNL AND STAYING A SIMPLE SONG AND ITS SWEET BUT ITS SLIGHTLY METALLIC VOIDS AND THEN SEEDING YOURSELVES BY THE OPEN WINDOW RED PHILIPS PLATTER


In [123]:
from torchmetrics.text import CharErrorRate, WordErrorRate
target="she tried this morning an air or two upon the piano sang a simple song in a sweet but slightly metallic voice and then seating herself by the open window read philips letter"
cer = CharErrorRate()
print("CER:", cer(preds, target))
wer = WordErrorRate()
print("WER:", wer(preds, target))

CER: tensor(1.1503)
WER: tensor(1.3333)
