# Capture Voice

In [6]:
# Make sure the package named in `name` is importable, installing it with pip
# when it cannot be found.
# REF: https://stackoverflow.com/questions/1051254/check-if-python-package-is-installed

import importlib
import importlib.util
import subprocess
import sys

# Package this check is performed for.
name = 'sounddevice'

if name in sys.modules:
    print(f"{name!r} already in sys.modules")
elif importlib.util.find_spec(name) is not None:
    # Let the import system do the loading instead of wiring the spec up by
    # hand, so a failing import does not leave a broken entry in sys.modules.
    module = importlib.import_module(name)
    print(f"{name!r} has been imported")
else:
    print(f"can't find the {name!r} module")
    try:
        # Run pip through the interpreter that backs this kernel so the
        # package lands in the same environment; passing an argument list
        # avoids going through a shell.
        subprocess.check_output(
            [sys.executable, '-m', 'pip', 'install', name],
            stderr=subprocess.STDOUT,
        )
    except (subprocess.CalledProcessError, OSError) as err:
        print(f"can't install {name} in pip, plz check the package name.")
        # Surface pip's own output (when there is any) to make the failure
        # diagnosable instead of silently discarding it.
        details = getattr(err, 'output', None)
        if details:
            print(details.decode(errors='replace'))
    else:
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        print(f'module {name} has been installed by pip.')
    

'sounddevice' already in sys.modules


In [75]:
# set the input and output devices
import sounddevice as sd

# Re-initialise the audio backend so that bluetooth devices are detected
# (only devices that are already paired will show up).
# NOTE(review): these are private sounddevice helpers — confirm they are still
# available when upgrading the package.
sd._terminate()
sd._initialize()

# show the available devices
device_list = sd.query_devices()
print(f'the device list is: \n{device_list}.\n')

# Start from a clean slate so a value left over from an earlier run of this
# cell can never be picked up when the device is no longer present.
input_device = None
output_device = None

# Pick devices by a substring of their name; when several devices match, the
# last one in the list wins.
for device in device_list:
    if 'External Microphone' in device['name']:
        input_device = device['name']
        print(f"Input device name is '{input_device}'.")
    elif 'External Headphones' in device['name']:
        output_device = device['name']
        print(f"Output device name is '{output_device}'.")

# Fail with an explicit message instead of a NameError further down.
if input_device is None or output_device is None:
    raise RuntimeError(
        "could not find both an 'External Microphone' input and an "
        "'External Headphones' output in the device list above; "
        f"found input={input_device!r}, output={output_device!r}."
    )

sd.default.device = input_device, output_device

the device list is: 
  0 dnhb's AirPods, Core Audio (1 in, 0 out)
  1 dnhb's AirPods, Core Audio (0 in, 2 out)
> 2 External Microphone, Core Audio (1 in, 0 out)
< 3 External Headphones, Core Audio (0 in, 2 out)
  4 MacBook Pro Microphone, Core Audio (1 in, 0 out)
  5 MacBook Pro Speakers, Core Audio (0 in, 2 out)
  6 ZoomAudioDevice, Core Audio (2 in, 2 out).

Input device name is 'External Microphone'.
Output device name is 'External Headphones'.


In [58]:
# Stream defaults applied to the sounddevice calls that follow.
# REF: https://python-sounddevice.readthedocs.io/en/0.3.15/api/module-defaults.html
fs = 24000  # sample rate shared by recording and playback
sd.default.channels = (1, 2)  # (input, output): one input channel, two output channels
sd.default.samplerate = fs

## Voice to Audio

There are some issues:

+ how to set the time interval?
+ how to set the duration?
  
  I can set a long recording duration and keep converting all the voice captured during it until the VC program is terminated. This can be tricky, since it may require the system to run two models at the same time: one for KWS and another for voice conversion. However, we don't want to convert the keyword itself. Does this mean each voice segment should go through KWS first, and then VC?


In [61]:
# Record the voice

# TODO: how to set the duration?
duration = 15 # seconds
print('begin')
audio = sd.rec(int(duration * fs), dtype = 'float32')
# sd.rec() returns right away while the recording keeps running in the
# background; block here so that later cells only ever see the finished buffer.
sd.wait()

begin


# Process Audio

Two steps:

- preprocess audio
- infer the audio

In [None]:
# TOOLS
import os, random, time, librosa
import torch

from utils import compute_style, load_F0, load_starganv2, load_vocoder, preprocess, speakers

# Build the three model objects that convert() takes as arguments.
# (`speakers` is already a module-level name through the import from utils,
# so no `global` declaration is needed at this scope.)
F0_model = load_F0()
vocoder = load_vocoder()
starganv2 = load_starganv2()

def convert(audio, speaker, F0_model, vocoder, starganv2):
    '''@lw
    Convert `audio` to the voice of `speaker` and return the resulting samples.

    :audio: the source audio; it is handed to `preprocess` unchanged
    :speaker: the speaker name; must be a key of the module-level `speakers`
        mapping (a missing key raises KeyError)
    :F0_model: object providing `get_feature_GAN(...)`
    :vocoder: object providing `inference(...)`
    :starganv2: object providing `generator(...)`
    :return: the flattened vocoder output as returned by `.numpy()`, or None
        when `compute_style` yields no reference embedding

    The preprocessed source is moved to 'cuda:0', so a CUDA device is required.
    The elapsed conversion time (excluding the reference-embedding
    computation) is printed.
    '''

    # @lw: set reference, get the speaker index
    speaker_dicts = {speaker: ('', speakers[speaker])}

    # @lw: compute reference embeddings
    reference_embeddings = compute_style(speaker_dicts)

    start = time.time()

    # conversion
    source = preprocess(audio).to('cuda:0')
    converted_audio = None

    # Only one speaker is requested above, so this loop is expected to run
    # once; should there be several entries, the last result is returned.
    for ref, _ in reference_embeddings.values():
        with torch.no_grad():
            f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
            out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)

            c = out.transpose(-1, -2).squeeze().to('cuda')
            y_out = vocoder.inference(c)
            y_out = y_out.view(-1).cpu()

        converted_audio = y_out.numpy()

    end = time.time()
    # Label the timing with the speaker (the builtin `type` used to be
    # formatted here, which printed "<class 'type'>").
    print('{} total processing time: {:.3f} sec'.format(speaker, end - start))

    return converted_audio

In [62]:
# preprocess audio: scale the recording so its largest absolute sample is 1,
# then run the voice conversion on it.
import numpy as np

peak = np.max(np.abs(audio))
# A recording made of zeros would turn the division below into NaNs, so stop
# early with an explicit message instead.
if peak == 0:
    raise ValueError('the recorded audio is silent (peak amplitude is 0); nothing to convert.')
audio = audio / peak

# NOTE(review): `speaker` is not assigned in any cell shown above — it has to
# be set before this cell runs (convert() uses it as a key of `speakers`).
converted_audio = convert(audio, speaker, F0_model, vocoder, starganv2)

# Play Processed Audio

In [None]:
# Play the converted audio. sd.play() only plays, whereas sd.playrec() would
# also record from the input device at the same time. Playback runs in the
# background, so wait until it has finished.
sd.play(converted_audio, fs)
sd.wait()