In [1]:
# TTS and audio imports
from TTS.api import TTS
from pydub import AudioSegment
# import wave
import simpleaudio as sa
import librosa
import pyrubberband
import soundfile as sf

# Progress bar imports
from tqdm.auto import tqdm

# GUI, keyboard and clipboard imports
import pyautogui as pya
import pyperclip
from pynput import keyboard
from pynput import mouse

# System imports
from sys import platform
import sys
import time
import threading
import textwrap
import re

# Notebook imports
from IPython.utils import io
# from IPython.display import clear_output
from IPython import display
from ipywidgets import Output

# GUI imports
from PyQt5.QtWidgets import QApplication, QWidget, QProgressBar, QPushButton, QLabel
from PyQt5.QtGui import QGuiApplication, QScreen, QCursor
from PyQt5.QtCore import Qt, QPoint


In [2]:
def updateProgressBar(value, max_value):
    """Show ``value / max_value`` as a percentage on the global progress bar.

    Parameters:
        value: amount of work completed so far.
        max_value: total amount of work. A total of zero (or less) is treated
            as "nothing to do", i.e. 100 % complete, instead of dividing by zero.

    Side effects:
        Updates the module-level ``progressBar``, resizes it to the width of
        the module-level ``window`` minus a 10 px margin on each side, and
        hides ``window`` once the bar reaches 100 %.
    """
    if max_value <= 0:
        # No work at all counts as finished; avoids ZeroDivisionError.
        pct = 100
    else:
        pct = round((value / max_value) * 100)
        # Keep the value inside the 0-100 range the bar is configured for.
        pct = max(0, min(100, pct))

    progressBar.setValue(pct)
    progressBar.setFixedWidth(window.width() - 10 * 2)
    if pct == 100:
        window.hide()

def initUI(window):
    """Build the progress bar inside ``window`` and configure the window itself.

    Parameters:
        window: the top-level widget that hosts the progress bar.

    Side effects:
        Creates the module-level ``progressBar``, sets the window geometry,
        title and always-on-top flag, then shows the window once through
        ``display_window()`` and immediately hides it again.
    """
    global progressBar

    # Progress bar: 10 px margin from the top-left corner, 200 x 25 px.
    margin, bar_width, bar_height = 10, 200, 25
    progressBar = QProgressBar(window)
    progressBar.setGeometry(margin, margin, bar_width, bar_height)

    # Window position/size and title.
    window.setGeometry(300, 200, 220, 170)
    window.setWindowTitle("TTS")

    # Keep the window above other windows.
    always_on_top = window.windowFlags() | Qt.WindowStaysOnTopHint
    window.setWindowFlags(always_on_top)

    # Show once, then hide until a read is started.
    display_window()
    window.hide()
    

def display_window():
    """Show the global ``window`` at the left edge of the screen under the mouse.

    The window is moved to the left edge of the available area of the screen
    that contains the cursor, roughly 100 px above the cursor, and then shown.

    Robustness:
        * ``QGuiApplication.screenAt`` returns no screen when the cursor
          position is not on any screen; the primary screen is used then.
        * The vertical position is clamped to the top of that screen's
          available area rather than to absolute 0, so the window stays on the
          intended monitor when screens are stacked or have non-zero origins.
    """
    cursor_pos = QCursor.pos()

    screen = QGuiApplication.screenAt(cursor_pos)
    if screen is None:
        # Cursor is outside every known screen; fall back to the primary one.
        screen = QGuiApplication.primaryScreen()

    top_left = screen.availableGeometry().topLeft()

    # Place the window ~100 px above the cursor, but never above the screen.
    y_pos = max(cursor_pos.y() - 100, top_left.y())
    window.move(top_left.x(), y_pos)

    # Show the window
    window.show()
    

def addStatusLabel(text):
    """Display ``text`` (with every '.' removed) in the label under the progress bar.

    Parameters:
        text: the string to show.

    Side effects:
        Updates the module-level ``label`` (text, word wrap, centred
        alignment, position and width) and resizes the module-level
        ``window`` so its height is the label height plus 50 px.
    """
    margin = 10

    # Text content and presentation.
    label.setText(text.replace('.', ''))
    label.setWordWrap(True)
    label.setAlignment(Qt.AlignHCenter)

    # Position the label directly below the progress bar, with side margins.
    label.move(margin, 10 + progressBar.height())
    label.setFixedWidth(window.width() - margin * 2)
    label.adjustSize()

    # Grow/shrink the window to fit the wrapped label.
    window.setFixedHeight(label.height() + 50)

def main():
    """Create the Qt application and status window, then run the event loop.

    Side effects:
        Sets the module-level ``window`` (a ``QWidget``) and ``label`` (a
        ``QLabel`` child of that window), calls ``initUI(window)`` and enters
        the Qt event loop.

    Raises:
        SystemExit: always, carrying the event loop's return code, because the
        loop result is passed to ``sys.exit``.
    """
    # Reuse an existing QApplication if one is already alive (e.g. when this
    # cell is re-run in the same kernel); only one instance may exist at a time.
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    # create the instance of our Window
    global window
    window = QWidget()

    # create label widget
    global label
    label = QLabel(window)

    # build the widgets and configure the window
    initUI(window)

    # start the app
    sys.exit(app.exec_())
      


In [3]:
# Configuration flags read by the cells below.
# remove_urls: when True, split_text() strips http(s) URLs from the text before it is spoken.
remove_urls = True
# use_gpu: when True, the TTS model is constructed with gpu=True; otherwise it is constructed with default arguments.
use_gpu = False

In [4]:
# Load the LJSpeech Tacotron2-DDC_ph model once for the whole notebook.
# The model runs on the GPU only when `use_gpu` is truthy; gpu=False is the
# constructor's default, so this matches calling it without the argument.
tts = TTS('tts_models/en/ljspeech/tacotron2-DDC_ph', gpu=bool(use_gpu))

 > tts_models/en/ljspeech/tacotron2-DDC_ph is already downloaded.
 > vocoder_models/en/ljspeech/univnet is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:/home/nick/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model's reductio

In [5]:
def join_small_strings(strings, threshold1, threshold2):
    i = 0
    while i < len(strings) - 1:
        if len(strings[i]) < threshold1:
            if len(strings[i] + strings[i+1]) <= threshold2:
                strings[i:i+2] = [strings[i] + strings[i+1]]
            else:
                i += 1
        else:
            i += 1
    return strings

In [6]:
def split_text(input_text, max_length):
    """Split ``input_text`` into short, speakable chunks.

    Steps:
        1. If the module-level ``remove_urls`` flag is set, drop http(s) URLs.
        2. Remove every character that is not a word character, whitespace or
           one of ``. , ! ? : ; -``.
        3. Split on ``. , : ; _`` and wrap each piece to ``max_length``.
        4. Terminate each chunk with ``'. '``, or with a single space when the
           chunk already ends in ``!`` or ``?`` so no "?." / "!." is produced.
        5. Merge neighbouring chunks shorter than 50 characters with
           ``join_small_strings`` as long as the result fits ``max_length``.

    Parameters:
        input_text: the raw text to be read.
        max_length: wrap width passed to ``textwrap.wrap`` and upper bound
            used when merging short chunks. Terminators are added after
            wrapping, so a chunk can exceed this by up to two characters.

    Returns:
        A list of chunk strings; empty when the text contains nothing readable.
    """
    # remove urls if flag is set
    if remove_urls:
        input_text = re.sub(r'https?:\/\/[\S]+', '', input_text)

    # strip characters that are neither text nor supported punctuation
    input_text = re.sub(r'[^\w\s.,!?:;-]', '', input_text)

    chunks = []
    for clause in re.split(r'[.,:;_]', input_text):
        # textwrap.wrap yields no lines for empty/whitespace-only clauses,
        # so every piece below contains at least one visible character.
        for piece in textwrap.wrap(clause, max_length,
                                   break_on_hyphens=True,
                                   break_long_words=True):
            piece = piece.lstrip()
            # '.' and ',' can no longer appear here (they were split points);
            # only '!' and '?' can already terminate a chunk.
            terminator = ' ' if piece[-1] in '!?' else '. '
            chunks.append(piece + terminator)

    # merge fragments shorter than 50 characters into their neighbours
    return join_small_strings(chunks, 50, max_length)


In [7]:
def _first_or_none(options):
    """Return the first entry of a non-empty list/tuple, ``options`` itself if
    it is a single value, or None when nothing is available."""
    if not options:
        return None
    if isinstance(options, (list, tuple)):
        return options[0]
    return options


def run_tts(token, filename):
    """Synthesize ``token`` with the global ``tts`` model and write it to ``filename``.

    Parameters:
        token: the text to speak.
        filename: path of the audio file to write.

    The model's ``speakers`` / ``languages`` attributes are collections (or
    None for the single-speaker model loaded in this notebook), while
    ``tts_to_file`` expects one speaker and one language. The first entry of
    each is therefore used, and None is passed through unchanged.
    """
    speaker = _first_or_none(tts.speakers)
    language = _first_or_none(tts.languages)

    # use io.capture_output so the text printed by TTS does not flood the notebook
    with io.capture_output():
        tts.tts_to_file(text=token,
                        speaker=speaker,
                        language=language,
                        file_path=filename)

In [8]:
def speed_up(speed, file_name):
    """Time-stretch the audio in ``file_name`` by ``speed`` and overwrite the file.

    Parameters:
        speed: stretch rate handed to ``pyrubberband.time_stretch``.
        file_name: path of the audio file; it is read and then rewritten as WAV.
    """
    # sr=None: load without requesting a specific target sample rate.
    samples, sample_rate = librosa.load(file_name, sr=None)
    stretched = pyrubberband.time_stretch(samples, sample_rate, speed)
    sf.write(file_name, stretched, sample_rate, format='wav')

In [9]:
def build_audio(token, speed):
    """Synthesize ``token`` and return a playable wave object for it.

    The audio is generated into ``TTS_next.wav`` (overwritten on every call),
    350 ms of trailing audio are trimmed, and the clip is sped up when
    ``speed`` is greater than 1.

    Parameters:
        token: the text to synthesize.
        speed: playback speed factor; values <= 1 leave the clip untouched.

    Returns:
        The ``simpleaudio`` wave object loaded from the finished file.

    Post-processing is best effort: if trimming or speeding up fails, the
    error is reported and whatever is currently in the file is played.
    """
    # Set the file name for the audio file
    file_name = "TTS_next.wav"
    # Amount of trailing audio removed from every clip, in milliseconds.
    trailing_ms = 350

    # Generate an audio file for the current token
    run_tts(token, file_name)

    # Use the AudioSegment class to load the audio file
    audio = AudioSegment.from_file(file_name)

    try:
        # Only trim clips that are longer than the part being removed;
        # slicing a shorter clip would leave little or no audio to play.
        if len(audio) > trailing_ms:
            audio[:-trailing_ms].export(file_name, format='wav')

        if speed > 1:
            speed_up(speed, file_name)
    except Exception as exc:
        # Deliberately narrower than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        print('Audio too small to trim or speed up:', exc)

    # Load the audio file
    return sa.WaveObject.from_wave_file(file_name)

In [10]:
def _wait_for_playback(play_obj, stop_flag):
    """Block until ``play_obj`` stops playing, stopping it early if ``stop_flag`` is set."""
    while play_obj.is_playing():
        time.sleep(0.1)
        if stop_flag.is_set():
            play_obj.stop()


def read_text(input_text, speed, stop_flag):
    """Read ``input_text`` aloud chunk by chunk while updating the status window.

    The audio for the next chunk is built while the previous chunk is still
    playing, so playback is (nearly) gapless.

    Parameters:
        input_text: the text to read.
        speed: playback speed factor forwarded to ``build_audio``.
        stop_flag: a ``threading.Event``; once set, current playback is
            stopped and no further chunks are played.

    Side effects:
        Plays audio and drives the status window through
        ``updateProgressBar`` / ``addStatusLabel``. The final
        ``updateProgressBar(total, total)`` call hides the window.

    NOTE(review): start_reading runs this function on a worker thread while
    these helpers touch Qt widgets; Qt normally expects widget access from
    the GUI thread only -- confirm this is safe on the target platform.
    """
    # Split the input text into chunks of at most 300 characters
    tokens = split_text(input_text, 300)
    total_tokens = len(tokens)

    if total_tokens == 0:
        # Nothing readable (e.g. only URLs or symbols): mark the job as done,
        # which hides the window, instead of dividing by zero below.
        updateProgressBar(1, 1)
        return

    play_obj = None
    token = tokens[0]
    for token_id, token in enumerate(tokens):
        wave_obj = build_audio(token, speed)

        # Wait for the previous chunk to finish before starting this one
        if play_obj is not None:
            _wait_for_playback(play_obj, stop_flag)

        # Stop requested: do not start another chunk
        if stop_flag.is_set():
            break

        # Play the audio and show which chunk is being read
        play_obj = wave_obj.play()
        updateProgressBar(token_id, total_tokens)
        addStatusLabel(token)

    # Let the last chunk finish (or be stopped)
    if play_obj is not None:
        _wait_for_playback(play_obj, stop_flag)

    # Completing the bar hides the window
    updateProgressBar(total_tokens, total_tokens)
    addStatusLabel(token)


In [11]:
# Shared state for the reader.
# stop_flag: Event polled by read_text(); setting it stops the current read.
stop_flag = threading.Event()
# reading_thread: placeholder Thread that is never started, so the first
# reading_thread.is_alive() check in start_reading() is False.
reading_thread = threading.Thread()

In [12]:
# Manual control cell: set the current stop flag to stop a running read_text().
# start_reading() replaces stop_flag with a fresh Event for every new read,
# so this only affects a read that was started with the current flag.
stop_flag.set()

In [13]:
def copy_clipboard():
    """Return the currently selected text, leaving the user's clipboard intact.

    The selection is captured by sending Ctrl+C. If that yields nothing, the
    paragraph above the caret is selected with Ctrl+Shift+Up and copied
    instead (this only works where the text is editable).

    Returns:
        The captured text, or the string ``'No text selected'`` when nothing
        could be copied.

    Side effects:
        Sends synthetic key presses and temporarily overwrites the clipboard.
        The previous clipboard content is restored on every path, including
        when nothing was selected or a key press fails. The status window is
        shown via ``display_window()`` only when text was captured.
    """
    # grab the existing clipboard to return it later
    existing_clip = pyperclip.paste()
    try:
        # empty the clipboard so an unchanged clipboard means "nothing copied"
        pyperclip.copy('')
        pya.hotkey('ctrl', 'c')
        # sleep for a moment to make sure we have the text
        time.sleep(0.1)

        if pyperclip.paste() == '':
            # nothing selected: try selecting the paragraph above the caret
            pya.hotkey('ctrl', 'shift', 'up')
            time.sleep(0.1)
            pya.hotkey('ctrl', 'c')
            time.sleep(0.1)
            pya.hotkey('esc')

        captured_text = pyperclip.paste()
    finally:
        # always return the clipboard to its initial state
        pyperclip.copy(existing_clip)

    if captured_text == '':
        return 'No text selected'

    display_window()
    return captured_text

In [14]:
def start_reading(speed):
    """Toggle reading: stop the running read, or start reading the selection.

    Parameters:
        speed: playback speed factor passed on to ``read_text``.

    Side effects:
        Resets the progress bar to 0. If a reading thread is alive, its stop
        flag is set and nothing else happens. Otherwise the status label is
        set, ``stop_flag`` is replaced by a fresh Event, the selected text is
        captured with ``copy_clipboard()`` and a new ``reading_thread``
        running ``read_text`` is started.
    """
    global stop_flag, reading_thread

    updateProgressBar(0, 1)

    # A read is already in progress: treat this call as a stop request.
    if reading_thread.is_alive():
        stop_flag.set()
        return

    addStatusLabel('Processing....')
    stop_flag = threading.Event()
    selected_text = copy_clipboard()
    reading_thread = threading.Thread(
        target=read_text,
        args=[selected_text, speed, stop_flag],
    )
    reading_thread.start()

In [15]:
def on_press(key):
    """Keyboard-listener callback: toggle reading when the hotkey is pressed.

    Parameters:
        key: the key object delivered by the keyboard listener.

    Any key other than the hotkey is ignored. The hotkey calls
    ``start_reading`` with a speed factor of 2.
    """
    # 269025093 is F14
    hotkey = keyboard.KeyCode(269025093)
    if key == hotkey:
        start_reading(2)
    


In [16]:
# Start a keyboard listener that calls on_press() for key presses.
# NOTE(review): pynput listeners are documented to run on their own thread,
# so start() should return immediately and let the next cell run -- confirm.
listener_key = keyboard.Listener(on_press=on_press)
listener_key.start()

In [None]:
# Build the Qt window and enter the Qt event loop. main() finishes with
# sys.exit(app.exec_()), so this cell does not return normally.
main()