In [None]:
import aiohttp
import asyncio
import uuid
import logging
from tqdm import tqdm
import mimetypes
# Map .wav to 'audio/wav'; the 'audio/x-wav' type it replaces is not accepted by the API.
mimetypes.add_type('audio/wav', '.wav')

# Root logger with short HH:MM:SS timestamps.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger()

# Single endpoint used for both the upload and the status polls.
endpoint = 'https://api.idg.vnpt.vn/stt-service/v1/grpc/async/standard'

# Credentials: fill these in before running the cells below.
authorization = ''
token_id = ''
token_key = ''

# Authentication headers attached to every request.
headers_auth = dict([
    ('authorization', authorization),
    ('token-id', token_id),
    ('token-key', token_key),
])

# Seconds to wait between status polls; raise to 10 or 30 to avoid overloading the server.
interval_result_check = 1



def _job_status(result):
    """Return result['object']['status'], or None when the response lacks it."""
    if not isinstance(result, dict):
        return None
    # `or {}` also covers an explicit `"object": null` in the response.
    return (result.get('object') or {}).get('status')


async def call_api_2_step(session: aiohttp.ClientSession, file_path: str):
    """Upload one audio file, then poll the endpoint until the job leaves ACCEPTED.

    Args:
        session: aiohttp session used for the upload and for every status poll.
        file_path: path of the audio file; it is passed through `resample_to_8k`
            before being uploaded.

    Returns:
        The last JSON body received from the API, or ``{'message': <error text>}``
        if any step raised an exception.
    """
    try:
        clientSession = str(uuid.uuid1())
        form_data = aiohttp.FormData({
            'clientSession': clientSession,
        })

        # Resampling is blocking disk/CPU work; run it in a worker thread so the
        # event loop can keep serving the other uploads and polls.
        io_bytes = await asyncio.to_thread(resample_to_8k, file_path)
        try:
            content_type, _ = mimetypes.guess_type(file_path)
            if getattr(io_bytes, 'name', None) is None:
                # Not a file handle: resample_to_8k re-encoded the audio as WAV,
                # so the original extension no longer describes the payload.
                content_type = 'audio/wav'
            form_data.add_field('audioFile', io_bytes, content_type=content_type)

            # step 1: upload
            logger.info(f'calling file {file_path} with clientSession {clientSession}')
            async with session.post(endpoint, headers=headers_auth, data=form_data) as response:
                result = await response.json()
        finally:
            # Release the file handle (if any); closing an already-closed file is a no-op.
            close = getattr(io_bytes, 'close', None)
            if callable(close):
                close()

        # step 2: check status for result every interval
        status = _job_status(result)
        while status == 'ACCEPTED':
            logger.info('status == ACCEPTED, processing')
            await asyncio.sleep(interval_result_check)
            result = await check_result(session, clientSession)
            status = _job_status(result)

        # Report the real outcome instead of assuming success.
        if status == 'OK':
            logger.info('status == OK, result is ready')
        else:
            logger.warning(f'file {file_path} finished with status {status!r}')
    except Exception as e:
        # custom your error handling here
        logger.error(f'file {file_path} failed: {e}')
        result = {'message': str(e)}
    return result

async def check_result(session: aiohttp.ClientSession, clientSession: str):
    """Ask the endpoint for the current state of a previously submitted job.

    Args:
        session: aiohttp session to send the request with.
        clientSession: identifier that was sent along with the original upload.

    Returns:
        The decoded JSON body of the response.
    """
    payload = {'clientSession': clientSession}
    request = session.post(endpoint, headers=headers_auth, data=payload)
    async with request as reply:
        return await reply.json()

async def run_async_many_files(wav_files: list[str]):
    """Transcribe several files concurrently over one shared HTTP session.

    Args:
        wav_files: paths of the audio files to submit.

    Returns:
        A list with one entry per input path, in input order. Because
        `return_exceptions=True` is used, an entry may be an exception object.
    """
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(call_api_2_step(session, path) for path in wav_files),
            return_exceptions=True,
        )

async def run_async_many_files_progress(wav_files: list[str]):
    """Transcribe several files concurrently while showing a tqdm progress bar.

    Args:
        wav_files: paths of the audio files to submit.

    Returns:
        A list with one entry per input path, in input order. Because
        `return_exceptions=True` is used, an entry may be an exception object.
    """
    # The context manager closes the bar even if gathering is interrupted.
    with tqdm(total=len(wav_files)) as progress_bar:

        async def add_update_progress(task):
            # Advance the bar whether the wrapped coroutine succeeds or raises,
            # so the bar always reaches its total.
            try:
                return await task
            finally:
                progress_bar.update()

        async with aiohttp.ClientSession() as session:
            tasks = [
                add_update_progress(call_api_2_step(session, file))
                for file in wav_files
            ]
            results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


import io
import librosa
import soundfile

def resample_to_8k(file_path: str) -> "io.BufferedReader | memoryview":
    """Return the audio at `file_path` as an 8 kHz payload ready for upload.

    The API currently only accepts 8 kHz audio, so files are resampled before
    calling it; future updates will allow flexible sample rates. Downsampling
    also saves network bandwidth. The preferred downsampling rate is typically
    16 kHz, as that is the rate most speech models operate on; a lower rate may
    cost a little accuracy.

    Args:
        file_path: path of the audio file to read.

    Returns:
        If the file is already 8 kHz: the file opened in binary mode, unchanged.
        The caller is responsible for closing it.
        Otherwise: a memoryview over an in-memory WAV encoding of the resampled
        audio.
    """
    target_rate = 8000
    if librosa.get_samplerate(file_path) == target_rate:
        return open(file_path, 'rb')

    # sr=None keeps the native rate; the rate returned here is the one the
    # samples actually have, so use it as the resampling origin.
    samples, native_rate = librosa.load(file_path, sr=None)
    samples = librosa.resample(samples, orig_sr=native_rate, target_sr=target_rate)

    wav_buffer = io.BytesIO()
    # The header must declare the rate of the resampled data (8000 Hz), not the
    # source rate; otherwise the audio is interpreted at the wrong speed.
    soundfile.write(wav_buffer, samples, target_rate, format='wav')
    return wav_buffer.getbuffer()

In [2]:
# Demo: transcribe a single file.
# Top-level `await` relies on IPython/Jupyter autoawait; it is not valid in a plain script.
file = 'audio/5min-8k.mp3'
results = await run_async_many_files([file])
# Bare last expression so the notebook displays the list of API responses.
results

15:55:42 - INFO - calling file audio/5min-8k.mp3 with clientSession 206e3121-f28d-11ef-ac1d-18c04dba91ec
15:55:43 - INFO - status == ACCEPTED, processing
15:55:45 - INFO - status == OK, result is ready


[{'message': 'IDG-00000000',
  'object': {'results': [{'alternatives': [{'transcript': 'Quay lại đây. Ai ngờ nó quay lại thật tất nhiên, không đội mũ bảo hiểm',
       'confidence': -1.5109931}],
     'channelTag': 1.0}],
   'audio_duration': 4.8,
   'status': 'OK'}}]

In [3]:
# Only ERROR and above are emitted from here on -- presumably so the INFO lines
# do not interleave with the progress bar. `logger` is the root logger, so this
# also affects any cell run afterwards.
logger.setLevel(logging.ERROR)

# Demo: submit the same file twice to run two requests concurrently.
file = 'audio/3s.wav'
wav_files = [file, file]

# Top-level `await` relies on IPython/Jupyter autoawait.
results = await run_async_many_files_progress(wav_files)
# Bare last expression so the notebook displays the list of API responses.
results

100%|██████████| 2/2 [00:00<00:00,  2.74it/s]


[{'message': 'IDG-00000000',
  'object': {'results': [{'alternatives': [{'transcript': 'Quay lại đây. Ai ngờ nó quay lại thật tất nhiên, không đội mũ bảo hiểm',
       'confidence': -1.711556}],
     'channelTag': 1.0}],
   'audio_duration': 4.8,
   'status': 'OK'}},
 {'message': 'IDG-00000000',
  'object': {'results': [{'alternatives': [{'transcript': 'Quay lại đây. Ai ngờ nó quay lại thật tất nhiên, không đội mũ bảo hiểm',
       'confidence': -1.711556}],
     'channelTag': 1.0}],
   'audio_duration': 4.8,
   'status': 'OK'}}]