# INSTRUCTIONS
Register a new chatbot with Twitch: https://dev.twitch.tv/console/apps/create

Download models and copy them to your Google Drive into a folder named "tts".
Get current download list of models and current speaker list from here: 
https://drive.google.com/file/d/1v2o0G5uFOmkVIIou5ZMWhTrOClFjKslP/view?usp=sharing

Edit the parameters in Part 3: set your Twitch auth token (https://twitchapps.com/tmi/), your channel, the speaker list, and the default speaker.

Select GPU runtime by menu->runtime->change runtime type->GPU

Run code blocks by pressing play arrow at top left corner.

**Run Part 1.** This will tell you what GPU Colab has given you. The name of the GPU will be on the 4th line of text. Preferably you want a Tesla P100-PCIE. Second best is a Tesla T4. If you get a Tesla K80 (too slow), factory restart the runtime (runtime->factory restart runtime) and run Part 1 again. Repeat until you get a better GPU.

**Run Part 2.** You will be prompted for auth token to connect your google drive to colab. Click the link and select your google account with the drive. Then copy code (do not use the copy button, it is buggy) and paste into box and press enter. This will install dependencies and copy your models to the colab notebook.  The icon will stop spinning after the code has finished executing. You will see a message that the runtime has crashed, this is intentional so that the runtime resets and the newly installed dependencies can be used.

**Run Part 3.** If it's the first run you will see some debug information. You will see "Listening for messages...." in the output window at the bottom of the webpage. When a cheer has been detected the audio will be produced and a player will pop up with the message text. The first synthesis of audio will take longer as the models have to be loaded. Set Autoplay_On to False if you want to click the play button instead of automatically playing. If you want, you can save the audio by right-clicking the player and selecting 'save as'. There is a message queue. If playing automatically, the next message will wait for the previous one to stop playing.

If you get error "Could not load the JavaScript files needed to display output.
This is probably because your Google Account login access has expired or because third-party cookies are not allowed by your browser." you will need to relogin to colab.

The browser window must not be minimized or in a background tab, otherwise audio will not play until you switch to it. The Colab runtime will reset automatically after 12 hours of use, so you will need to rerun all code afterwards.

In [None]:
#PART 1
# Report which GPU this runtime was given; the instructions use the GPU name
# in this output to decide whether to restart for a faster one.
!nvidia-smi 
# Print the version of the installed CUDA compiler.
!nvcc --version

In [None]:
#PART 2
# Mounts Google Drive, clones the Tacotron 2 code, installs pinned packages,
# copies the voice models next to the code, then ends the Python process so
# the runtime restarts with the newly installed packages.

#get models from google drive
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

%cd /content

# Tacotron 2 sources; Part 3 changes into this folder before importing
# hparams, model, layers, train and text.
!git clone https://github.com/NVIDIA/tacotron2.git

# Package versions are pinned.
# NOTE(review): presumably chosen to match this Tacotron 2 code - confirm before changing any of them.
!pip install matplotlib==2.1.0
!pip install tensorflow==1.15.2
!pip install numpy==1.16.4
!pip install inflect==0.2.5
!pip install librosa==0.6.0
!pip install scipy==1.0.0
!pip install Unidecode==1.0.22
!pip install pillow
!pip install numba==0.48
!pip install soundfile
!pip install torch==1.4
!pip install emoji
!pip install pydub

# Fetch the repository's git submodules (Part 3 adds the waveglow/ folder to sys.path).
%cd /content/tacotron2/
!git submodule init
!git submodule update

# Download denoiser.py into the waveglow folder; Part 3 imports Denoiser from it.
%cd /content/tacotron2/waveglow
!wget https://raw.githubusercontent.com/NVIDIA/waveglow/master/denoiser.py

# Copy the model files from the Drive "tts" folder into the working folder.
# synth_line in Part 3 opens "<speaker>_checkpoint" and "<speaker>_waveglow"
# from here, so every name in speaker_list needs its pair of files copied.
%cd /content/tacotron2/
!cp '/content/drive/My Drive/tts/James_waveglow' '/content/tacotron2/James_waveglow'
!cp '/content/drive/My Drive/tts/James_checkpoint' '/content/tacotron2/James_checkpoint'
!cp '/content/drive/My Drive/tts/David_waveglow' '/content/tacotron2/David_waveglow'
!cp '/content/drive/My Drive/tts/David_checkpoint' '/content/tacotron2/David_checkpoint'

# Ends the Python process on purpose: per the instructions, the resulting
# "runtime has crashed" message is expected and makes the runtime reset.
exit()


In [None]:
#PART 3

# Work from the Tacotron 2 checkout: the imports below and the model files
# opened by synth_line are resolved relative to this folder.
%cd /content/tacotron2/

#PARAMETERS
# Host and port of the chat server the socket connects to.
server = 'irc.chat.twitch.tv'
port = 6667
# Name sent in the NICK command when logging in.
nickname = 'TTS_BOT'
#Get auth token from https://twitchapps.com/tmi/
# Sent in the PASS command when logging in.
token = 'enter you auth token here'
# Channel sent in the JOIN command; keep the leading '#'.
channel = '#yourchannelnamehere'
# Passed as the player's autoplay flag: True starts playback automatically,
# False leaves the player waiting for a click on play.
Autoplay_On = True
# Voices that can be selected inside a message with a "Name:" tag. Each name
# needs "<Name>_checkpoint" and "<Name>_waveglow" files in the working folder.
speaker_list = ["David", "James"]
# Voice used for text that carries no speaker tag.
default_speaker = "David"

import librosa
import soundfile as sf
import sys
sys.path.append('waveglow/')
import numpy as np
import torch
import subprocess
import os

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser

import socket
from emoji import demojize
from datetime import datetime
import re
import time
from IPython.display import Audio
import IPython.display as ipd
from IPython.display import clear_output
from IPython.display import HTML
from pydub import AudioSegment
import time
import multiprocessing 

def parse_resp(resp):
  """Pull the sender name and the chat text out of a raw chat line.

  The line is expected to look like ``:<user>!<...> PRIVMSG #<channel> :<text>``.

  Args:
    resp: Raw text received from the chat server.

  Returns:
    A ``(user, message)`` tuple. ``message`` is everything after the
    ``#channel`` token and the two separator characters that follow it, with
    emoji converted to text by ``demojize``. Either value is the string
    "not found" when the corresponding part of the line cannot be located.
  """
  user = message = "not found"

  # Prefix of the line up to and including the "#channel" token.
  match_user_portion = re.search(r"^.+\s#[_a-zA-Z0-9]+", resp)
  if match_user_portion is None:
    # Not a channel message at all: report the defaults instead of failing
    # on positions that were never found.
    return user, message

  # Skip the " :" that separates the channel from the message text.
  message = demojize(resp[match_user_portion.end() + 2:])

  # The sender sits between the leading ':' and the first '!'.
  match_user = re.search(r":[_a-zA-Z0-9]+!", match_user_portion.group(0))
  if match_user is not None:
    user = match_user.group(0)[1:-1]

  return user, message

def synth_line(s, text):
  """Synthesize speech for `text` using the models of speaker `s`.

  Loads "<s>_checkpoint" and "<s>_waveglow" from the current working folder
  on every call, runs the text through both models on the GPU and denoises
  the result.

  Args:
    s: Speaker name; selects which pair of model files is loaded.
    text: Text to speak.

  Returns:
    The denoised audio of the first batch item as a NumPy array.
  """
  checkpoint_path = f"{s}_checkpoint"
  waveglow_path = f"{s}_waveglow"

  # NOTE(review): torch.load unpickles the file, so only load model files
  # from a source you trust.
  model = load_model(hparams)
  model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
  model.cuda().eval().half()

  waveglow = torch.load(waveglow_path)['model']
  waveglow.cuda().eval().half()
  # The convinv layers are switched back to float after the half() cast above.
  for k in waveglow.convinv:
    k.float()
  denoiser = Denoiser(waveglow)

  # Encode the text and add a leading batch axis.
  sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
  sequence = torch.from_numpy(sequence).long().cuda()

  # Nothing here is trained, so run every step without gradient tracking;
  # otherwise a graph is kept alive for each decoder step of a long message.
  with torch.no_grad():
    _, mel_outputs_postnet, _, _ = model.inference(sequence)
    audio = waveglow.infer(mel_outputs_postnet, sigma=1)
    audio_denoised = denoiser(audio, strength=0.02)[:, 0]

  return audio_denoised[0].data.cpu().numpy()

def receive_resp():
  """Read chat traffic from `sock` and queue every message that contains a cheer.

  Runs until the server closes the connection. Incoming bytes are buffered
  and handled one complete line at a time, because a single recv() can hold
  several lines or only part of one. PING lines are answered; PRIVMSG lines
  are parsed with parse_resp and, when the text contains a cheer token, are
  appended to "queue_file" as one "<user> <message>" line.

  NOTE(review): the queue file is appended to here while the playback loop
  rewrites it from another process, with no locking between the two.
  """
  #Create file for queue
  if not os.path.exists("queue_file"):
    with open("queue_file", 'w'):
      pass

  # Cheer tokens that trigger speech; compiled once instead of per message.
  cheer_patterns = [
      re.compile(r"(\s|^)(c|C)heer[0-9]+(\s|$)"),
      re.compile(r"(\s|^)(p|P)og(c|C)hamp[0-9]+(\s|$)"),
      re.compile(r"(\s|^)(s|S)hamrock[0-9]+(\s|$)"),
      re.compile(r"(\s|^)(s|S)lade(c|C)heer[0-9]+(\s|$)"),
  ]

  pending = b""
  while True:
      chunk = sock.recv(2048)
      if not chunk:
        # recv() returns no data only once the peer has closed the
        # connection; looping again could never receive anything.
        print("Chat connection closed, no longer receiving messages.")
        return

      # Keep the unfinished tail for the next read. Decoding only whole lines
      # also avoids cutting a multi-byte character in half.
      pending += chunk
      *raw_lines, pending = pending.split(b"\n")

      for raw_line in raw_lines:
        line = raw_line.rstrip(b"\r").decode('utf-8', errors='replace')

        if line.startswith('PING'):
          # Answer with the same payload the server sent.
          sock.sendall(f"PONG{line[4:]}\r\n".encode('utf-8'))

        elif "PRIVMSG" in line:
          user, message = parse_resp(line)

          #CHECK IF CHEER WAS MADE
          if any(pattern.search(message) for pattern in cheer_patterns):
            with open("queue_file", 'a') as queue:
              # One entry per line; the reader takes the first word as the user.
              queue.write(f"{user} {message.strip()}\n")
      time.sleep(.1) #give some processing time back

# Connect to the chat server and log in: password, nickname, then join the
# channel. sendall() is used because send() may transmit only part of the
# data and its return value was not being checked.
sock = socket.socket()
sock.connect((server, port))
sock.sendall(f"PASS {token}\n".encode('utf-8'))
sock.sendall(f"NICK {nickname}\n".encode('utf-8'))
sock.sendall(f"JOIN {channel}\n".encode('utf-8'))

# Model settings read by synth_line through the global `hparams`.
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 3000 #set higher for longer messages
hparams.p_decoder_dropout = 0
hparams.p_attention_dropout = 0
hparams.gate_threshold = 0.2

#Start up message receiving process
# Chat is read in a separate process so the loop below is free to synthesize
# and play audio; the two sides exchange messages through "queue_file".
p1 = multiprocessing.Process(target=receive_resp, args=())
p1.start()
time.sleep(2) #give some time for process to start up

import html  # used to escape chat text before it is placed inside HTML


def _split_into_phrases(message):
  """Cut a message into phrases, starting a new one at every "Name:" word.

  Each phrase keeps its leading tag word, e.g. "hi James: yo" gives
  ["hi", "James: yo"].
  """
  phrases = []
  current = []
  for word in message.split():
    is_tag = any(word == f"{s}:" for s in speaker_list)
    if is_tag and current:
      phrases.append(" ".join(current))
      current = []
    current.append(word)
  if current:
    phrases.append(" ".join(current))
  return phrases


def _speaker_and_text(phrase):
  """Return (speaker, text) for one phrase; untagged phrases use default_speaker."""
  speaker, text = default_speaker, phrase
  for s in speaker_list:
    tag = f"{s}:"
    if phrase.startswith(tag):
      # Drop only the leading tag; later occurrences are part of the text.
      speaker, text = s, phrase[len(tag):]
  return speaker, text


def _stitch_clips(clips):
  """Join clips with one second of silence after each; return (samples, seconds)."""
  combined = AudioSegment.empty()
  second_of_silence = AudioSegment.silent(duration=1000)
  for e, m in enumerate(clips):
    # Round-trip through a wav file so pydub can do the joining.
    sf.write(f'out{e}.wav', np.float32(m), 22050)
    combined += AudioSegment.from_wav(f"out{e}.wav") + second_of_silence
  return np.array(combined.get_array_of_samples()), combined.duration_seconds


def _show_player(user, message, samples):
  """Replace the cell output with the message caption and an audio player."""
  clear_output()
  # Chat text comes from arbitrary viewers, so escape it before rendering.
  caption = html.escape(f"{user}: {message}")
  display(HTML(f'<span style="font-size:20px; color: #ff0000">{caption}</span>'))
  print()
  ipd.display(ipd.Audio(samples, rate=hparams.sampling_rate, autoplay=Autoplay_On))


display(HTML(f'<span style="font-size:20px; color: #ff0000">Listening for messages....</span>'))

start_play_time = 0
audio_length = 0

while True:

  #Check if audio is still playing
  if (time.time() - start_play_time) <= audio_length:
    time.sleep(.1) #wait instead of spinning at full speed
    continue

  #Check if entry in queue_file
  with open("queue_file", 'r') as queue:
    pending_entries = queue.read().splitlines(True)
  if not pending_entries:
    time.sleep(.1) #nothing queued yet, check again shortly
    continue

  #remove 1st entry from queue file
  # Done before any validation so a bad entry can never be read twice.
  entry = pending_entries[0]
  with open("queue_file", 'w') as queue:
    queue.writelines(pending_entries[1:])

  entry_split = entry.split()
  if len(entry_split) < 2:
    # Blank line, or a user name with no text: nothing to speak.
    print("uh oh empty entry!")
    continue
  user = entry_split[0]
  message = " ".join(entry_split[1:])

  #BREAK MESSAGE INTO VARIOUS SPEAKERS
  # Phrases that consist of a tag and nothing else have no text to speak.
  pieces = [_speaker_and_text(phrase) for phrase in _split_into_phrases(message)]
  pieces = [(speaker, text) for speaker, text in pieces if text.strip()]
  if not pieces:
    continue

  #Process text to audio
  clips = [synth_line(speaker, text) for speaker, text in pieces]

  if len(clips) == 1:
    #Single line, play it as produced
    samples = clips[0]
    audio_length = samples.size / 22050
  else:
    #Multiline, stitch audio lines together into one line
    samples, audio_length = _stitch_clips(clips)

  #Set timer so that queue doesn't overlap
  start_play_time = time.time()
  _show_player(user, message, samples)