<a href="https://colab.research.google.com/github/yvonneleoo/Real-Time-Voice-Swapping/blob/master/voice_swapping_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone git repo
!git clone https://github.com/thegreatwarlo/Real-Time-Voice-Swapping.git

Cloning into 'Real-Time-Voice-Swapping'...
remote: Enumerating objects: 2486, done.[K
remote: Total 2486 (delta 0), reused 0 (delta 0), pack-reused 2486[K
Receiving objects: 100% (2486/2486), 364.66 MiB | 46.99 MiB/s, done.
Resolving deltas: 100% (1348/1348), done.


In [2]:
cd Real-Time-Voice-Swapping/

/content/Real-Time-Voice-Swapping


In [0]:
!pip install -q -r requirements.txt
!apt-get install -qq libportaudio2
!python -m pip install SpeechRecognition

In [0]:
# Download dataset
!gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc
!unzip pretrained.zip

In [0]:
# Code for recording audio from the browser
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import IPython
import uuid
from google.colab import output
import speech_recognition

class InvokeButton(object):
  def __init__(self, title, callback):
    self._title = title
    self._callback = callback

  def _repr_html_(self):
    from google.colab import output
    callback_id = 'button-' + str(uuid.uuid4())
    output.register_callback(callback_id, self._callback)

    template = """<button id="{callback_id}" style="cursor:pointer;background-color:#EEEEEE;border-color:#E0E0E0;padding:5px 15px;font-size:14px">{title}</button>
        <script>
          document.querySelector("#{callback_id}").onclick = (e) => {{
            google.colab.kernel.invokeFunction('{callback_id}', [], {{}})
            e.preventDefault();
          }};
        </script>"""
    html = template.format(title=self._title, callback_id=callback_id)
    return html

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3, opath='audio.wav'):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open(opath,'wb+') as f:
    f.write(b)
  return opath

In [83]:
from IPython.display import Audio
from IPython.utils import io
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import os
from scipy.io import wavfile
import wavio

from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
import argparse
import torch
import sys
import speech_recognition
import time, datetime

encoder_weights = Path("encoder/saved_models/pretrained.pt")
vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
encoder.load_model(encoder_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)

Loaded encoder "pretrained.pt" trained to step 1564501
Found synthesizer "pretrained" trained to step 278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt


In [0]:
class cloneVoice():
  def __init__(self):
    pass

  def preProcess(self, in_fpath_1):
    start_time = time.time()            
    original_wav, sampling_rate = librosa.load(Path(in_fpath_1))               
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)     
    d = round(time.time() - start_time, 4)
    print(f"\nLoaded file and preprocessed succesfully in {d} seconds")
    return preprocessed_wav

  def embedding(self, preprocessed_wav):
    start_time = time.time()
    embed = encoder.embed_utterance(preprocessed_wav)     
    d = round(time.time() - start_time, 4)
    print(f"Created the embedding in {d} seconds")
    return [embed]

  def speech2text(self, in_fpath_2):
    start_time = time.time()
    wav, rate = librosa.load(Path(in_fpath_2))   
    y = (np.iinfo(np.int32).max * (wav/np.abs(wav).max())).astype(np.int32) # Convert `data` to 32 bit integers
    wavfile.write(in_fpath_2, rate, y)
    recognizer = speech_recognition.Recognizer()    
    with speech_recognition.AudioFile(in_fpath_2) as source:
      audio = recognizer.record(source)      
    text = recognizer.recognize_google(audio)
    return [text]

  def spectrogram(self, texts, embeds):
    start_time = time.time()
    d = round(time.time() - start_time, 4)
    print(f"Loaded file and Extracted text from speech in {d} seconds")
    start_time = time.time()
    with io.capture_output() as captured:
      specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    d = round(time.time() - start_time, 4)
    print(f"Created the mel spectrogram in {d} seconds")
    return spec

  def waveform(self, spec):
    print("Synthesizing the waveform:")
    start_time = time.time()
    generated_wav = vocoder.infer_waveform(spec)
    d = round(time.time() - start_time, 4)
    print(f"Generated the waveform in {d} seconds")
    return generated_wav

  def clone(self, in_fpath_1, in_fpath_2, encoder, synthesizer, vocoder):
    # load files and preprocessing
    preprocessed_wav = self.preProcess(in_fpath_1)                         
    # embedding
    embeds = self.embedding(preprocessed_wav)  
    ##Generating text from other speaker's speech
    texts = self.speech2text(in_fpath_2)
    ## Generating the spectrogram
    spec = self.spectrogram(texts, embeds)                 
    ## Generating the waveform
    generated_wav = self.waveform(spec)
    ## Post-generation
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    ## get the speaker names and time to save on the disk
    name_a = in_fpath_1.split('_voice_')[1].split('.wav')[0]
    name_b = in_fpath_2.split('_voice_')[1].split('.wav')[0]
    time = datetime.datetime.now()
    time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(time.year, time.month, time.day, time.hour, time.minute)
    ## Play the audio (non-blocking)
    if in_fpath_1 == 'in_fpath_1':
      print(name_a+"'s voice + "+name_b+"'s text")
      display(Audio(generated_wav, rate=synthesizer.sample_rate))
      fpath = "demo_output_" + '_'.join([name_a, name_b]) + '_' + time + ".wav" 
    else:
      print(name_b+"'s voice + "+name_a+"'s text")
      display(Audio(generated_wav, rate=synthesizer.sample_rate)) 
      fpath = "demo_output_" + '_'.join([name_b, name_a]) + '_' + time + ".wav"                       
    librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
    print("Saved output as %s" % fpath)

class recordAudio():
  def __init__(self):
    pass

  def start(self, name, order):
    opath = 'original_voice_'+name+'.wav'
    print("\nNow recording the "+ order +" voice reference ("+name+") for 8 seconds, and less than 20 words.")
    record(sec=8, opath=opath)
    print("Audio recording and complie complete.")
    if order == 'first':
      global in_fpath_1
      in_fpath_1 = opath
      display(Audio(in_fpath_1))
    else:
      global in_fpath_2
      in_fpath_2 = opath
      display(Audio(in_fpath_2))

class voiceSwap(cloneVoice):
  def __init__(self): 
    cloneVoice.__init__(self)

  def swap(self): 
    #first speaker's voice, second speaker's words
    start_time = time.time()
    self.clone(in_fpath_1, in_fpath_2, encoder, synthesizer, vocoder)
    duration = round(time.time() - start_time, 4)
    print(f"First speaker's voice applied to second speaker's words in {duration} seconds") 
    #second speaker's voice, first speaker's words  
    start_time = time.time()
    self.clone(in_fpath_2, in_fpath_1, encoder, synthesizer, vocoder)
    duration = round(time.time() - start_time, 4)
    print(f"Second speaker's voice applied to first speaker's words in {duration} seconds")

In [125]:
#@title Input the name for the first speaker
name_a = 'yvonne'#@param {type:"string"}
recordAudio().start(name=name_a, order='first')


Now recording the first voice reference (yvonne) for 8 seconds, and less than 20 words.


<IPython.core.display.Javascript object>

Audio recording and complie complete.


In [126]:
#@title Input the name for the second speaker
name_b = 'jane'#@param {type:"string"}
recordAudio().start(name=name_b, order='second')


Now recording the second voice reference (jane) for 8 seconds, and less than 20 words.


<IPython.core.display.Javascript object>

Audio recording and complie complete.


In [127]:
InvokeButton('Start swapping', voiceSwap().swap)


Loaded file and preprocessed succesfully in 1.2454 seconds
Created the embedding in 0.0192 seconds
Loaded file and Extracted text from speech in 0.0 seconds
Created the mel spectrogram in 0.5356 seconds
Synthesizing the waveform:
{| ████████████████ 66500/67200 | Batch Size: 7 | Gen Rate: 8.7kHz | }Generated the waveform in 7.7873 seconds
jane's voice + yvonne's text


Saved output as demo_output_jane_yvonne_2020-01-19_07-25.wav
First speaker's voice applied to second speaker's words in 13.2441 seconds

Loaded file and preprocessed succesfully in 0.7205 seconds
Created the embedding in 0.0199 seconds
Loaded file and Extracted text from speech in 0.0 seconds
Created the mel spectrogram in 0.4598 seconds
Synthesizing the waveform:
{| ████████████████ 76000/76800 | Batch Size: 8 | Gen Rate: 9.6kHz | }Generated the waveform in 7.9913 seconds
yvonne's voice + jane's text


Saved output as demo_output_yvonne_jane_2020-01-19_07-25.wav
Second speaker's voice applied to first speaker's words in 12.8735 seconds
