<a href="https://colab.research.google.com/github/wtkns/dx451/blob/main/GDriveSpeechClient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Configuration

### You'll need to restart the runtime after running the next cell, so run it first

In [None]:
# Speech-to-text: install (or upgrade) the Google Cloud Speech client library
# that is imported later as `google.cloud.speech`.
!pip install --upgrade google-cloud-speech

# Text-to-speech: install gTTS, imported by the Text-To-Speech section at the
# end of the notebook.
!pip install gTTS

### project settings

In [None]:
#@title ##project settings

#@markdown ###Google Cloud Project ID
#@markdown * Should have 'Cloud Speech API' activated
project_id = 'wtkns-214817' #@param {type:"string"}
bucket_name = 'wtkns-store' #@param {type:"string"}

#@markdown ###Google Cloud Service Account name. 
#@markdown * Should have 'Cloud Speech Client' permissions
#@markdown * will be of _username_@_projectname_.iam.gserviceaccount.com
service_account_user_name = 'speechtt' #@param {type:"string"}

#@markdown ###this should be the folder where you want to store files 
content_name = 'sample' #@param {type:"string"}

# content_name names both the per-project folder on Drive and the files in it.
project_path = "/content/drive/MyDrive/DX451/Rules/" + content_name + "/"
# Source recording that the rest of the notebook processes.
input_file = project_path + content_name + ".mkv"
# Final re-cut video written by the "reassemble with ffmpeg" section.
output_file = project_path + content_name + ".output.mp4"
# Frame size in pixels handed to normalize_file for scaling and padding.
output_width = 800
output_height = 600

## Set up source files

### Connect to Google Drive to store files

In [None]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive; project_path points inside it.
drive.mount('/content/drive')

# Create the project folder on first use, then work from inside it.
os.makedirs(project_path, exist_ok=True)
os.chdir(project_path)
os.getcwd()

### Choose a file to upload or record a new one
_upload requires 3rd-Party Cookies enabled in the browser_

In [None]:
# Sanity check: report whether the expected source video exists at input_file.
status_prefix = "found:" if os.path.isfile(input_file) else "NOT FOUND!: "
print(status_prefix + input_file)

# from google.colab import files
# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

### Generate normalized video file

In [None]:
def change_file_extension(input_file_path, new_extension):
  """Return input_file_path with all of its extensions replaced.

  Everything from the first '.' of the *file name* onward is dropped, so
  "sample.normal.mp4" + ".wav" becomes "sample.wav". The directory part
  (or URI prefix such as "gs://bucket/dir/") is kept exactly as given,
  even when it contains dots itself (e.g. "./clip.mkv", "/data/v1.2/x.mkv").

  Args:
    input_file_path: path or URI of the existing file.
    new_extension: text appended after the bare file name, including its
      leading dot (e.g. ".normal.mp4").

  Returns:
    The new path as a string.
  """
  base_name = os.path.basename(input_file_path)
  # Slice rather than os.path.dirname/join so the prefix is preserved
  # byte-for-byte (important for "gs://..." URIs).
  directory_prefix = input_file_path[:len(input_file_path) - len(base_name)]
  # Only the file name is split on '.', never the directories.
  stem = base_name.split('.')[0]
  return directory_prefix + stem + new_extension


In [None]:
def execute_ffmpeg(command_string):
  """Run a shell command (normally an ffmpeg invocation) and print its log.

  stdout and stderr are merged, decoded to text and printed, so the ffmpeg
  log is readable instead of appearing as an escaped bytes tuple. A non-zero
  exit status is reported on its own line; no exception is raised, so the
  notebook keeps running as before.

  Args:
    command_string: complete command line; it is passed to the shell because
      callers build it with shell-style quoting around filter arguments.
  """
  # Local import so the function does not depend on cell execution order.
  import subprocess

  completed = subprocess.run(
      command_string,
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT,
      shell=True,
  )
  print(completed.stdout.decode("utf-8", errors="replace"))

  if completed.returncode != 0:
    print("WARNING: command exited with status " + str(completed.returncode))

In [None]:
def normalize_file(input_file_path, w, h):
  """Re-encode a video as H.264 at exactly w x h pixels.

  The picture is scaled to fit inside w x h (force_original_aspect_ratio=1)
  and then padded, centred, up to the full frame. The result is written next
  to the input with the extension ".normal.mp4", overwriting any existing
  file (-y).

  Args:
    input_file_path: path of the source video.
    w: output width in pixels.
    h: output height in pixels.

  Returns:
    Path of the normalized video file.
  """
  # Local import: shlex is not imported elsewhere in the notebook.
  import shlex

  print("generating normalized video")

  output_file_path = change_file_extension(input_file_path, ".normal.mp4")

  video_filter = (
      "scale=w={w}:h={h}:force_original_aspect_ratio=1, "
      "pad={w}:{h}:(ow-iw)/2:(oh-ih)/2"
  ).format(w=w, h=h)

  # Paths are shell-quoted so names with spaces or metacharacters survive
  # the trip through the shell.
  ffmpeg_cmd = 'ffmpeg -y -i {src} -c:v libx264 -vf "{vf}" {dst}'.format(
      src=shlex.quote(input_file_path),
      vf=video_filter,
      dst=shlex.quote(output_file_path),
  )

  execute_ffmpeg(ffmpeg_cmd)

  print(output_file_path)

  return output_file_path

In [None]:
# subprocess is what execute_ffmpeg uses to launch ffmpeg
import subprocess

# Produce the size-normalized copy of the source video; its path feeds the
# audio extraction and the final re-cut.
normal_file_path = normalize_file(input_file, output_width, output_height)

### Generate audio file

In [None]:
def generate_audio_file(input_file_path):
  """Extract the audio track of a video into a WAV file.

  The audio is written as mono (-ac 1), 16 kHz (-ar 16k), signed 16-bit PCM
  (pcm_s16le), matching the LINEAR16 / 16000 Hz settings used in the
  speech-to-text request later in the notebook. The output sits next to the
  input with the extension ".wav" and is overwritten if it exists (-y).

  Args:
    input_file_path: path of the source video.

  Returns:
    Path of the generated WAV file.
  """
  # Local import: shlex is not imported elsewhere in the notebook.
  import shlex

  print("generating audio file")

  output_audio_file_path = change_file_extension(input_file_path, ".wav")

  # Paths are shell-quoted so names with spaces or metacharacters are safe.
  ffmpeg_cmd = "ffmpeg -y -i {src} -vn -ac 1 -ar 16k -acodec pcm_s16le {dst}".format(
      src=shlex.quote(input_file_path),
      dst=shlex.quote(output_audio_file_path),
  )

  execute_ffmpeg(ffmpeg_cmd)

  print(output_audio_file_path)

  return output_audio_file_path

In [None]:
# Extract the audio track from the normalized video.
audio_file_path = generate_audio_file(normal_file_path)

### Normalize audio file levels:

https://bytesandbones.wordpress.com/2017/03/16/audio-nomalization-with-ffmpeg-using-loudnorm-ebur128-filter/

In [None]:
def analyze_audio_file(input_file_path):
  """Measure an audio file's loudness with ffmpeg's loudnorm filter (pass 1).

  Runs loudnorm in analysis mode (output discarded with "-f null -"), prints
  the ffmpeg log and reads the measured values out of the printed summary.

  Args:
    input_file_path: path of the audio file to analyse.

  Returns:
    A list of five strings, in the order normalize_audio_file expects:
    [measured_I, measured_TP, measured_LRA, measured_thresh, offset].
    If a value cannot be read from the ffmpeg output (ffmpeg failed, or the
    value is not a finite number), a warning is printed and the value this
    function used to hard-code is returned for that field instead.
  """
  # Local imports: none of these is guaranteed to be imported by earlier cells.
  import re
  import shlex
  import subprocess

  print("analyzing audio from " + input_file_path)

  ffmpeg_analysis = 'ffmpeg -i '
  ffmpeg_analysis += shlex.quote(input_file_path)
  ffmpeg_analysis += ' -af loudnorm=I=-16:TP=-1.5:LRA=11:print_format=summary -f null -'
  print(ffmpeg_analysis)

  # Run ffmpeg directly (not through execute_ffmpeg) because the log text is
  # needed here for parsing, not just for display.
  completed = subprocess.run(
      ffmpeg_analysis,
      stdout=subprocess.PIPE,
      stderr=subprocess.STDOUT,
      shell=True,
  )
  report = completed.stdout.decode("utf-8", errors="replace")
  print(report)
  if completed.returncode != 0:
    print("WARNING: ffmpeg exited with status " + str(completed.returncode))
  print("analysis complete")

  # (label in the loudnorm summary, fallback value) in return order.
  summary_fields = [
      ("Input Integrated", '-44.3'),
      ("Input True Peak", '-24.9'),
      ("Input LRA", '8.7'),
      ("Input Threshold", '-57.2'),
      ("Target Offset", '-0.7'),
  ]

  analysis_list = []
  for label, fallback in summary_fields:
    found = re.search(re.escape(label) + r":\s*([-+]?\d+(?:\.\d+)?)", report)
    if found:
      # Drop an explicit '+' sign so the value can be pasted into a filter string.
      analysis_list.append(found.group(1).lstrip('+'))
    else:
      print("WARNING: could not read '" + label + "' from ffmpeg output; using " + fallback)
      analysis_list.append(fallback)

  print(analysis_list)
  return analysis_list

In [None]:
def normalize_audio_file(input_file_path, analysis_results_list):
  """Apply loudnorm's second pass to an audio file using measured values.

  Writes a loudness-normalized copy next to the input with the extension
  ".normal.wav", overwriting any existing file (-y).

  Args:
    input_file_path: path of the audio file to normalize.
    analysis_results_list: the five measurements returned by
      analyze_audio_file, in the order
      [measured_I, measured_TP, measured_LRA, measured_thresh, offset].

  Returns:
    Path of the normalized audio file.
  """
  # Local import: shlex is not imported elsewhere in the notebook.
  import shlex

  print("generating normalized audio from " + input_file_path)

  output_audio_file_path = change_file_extension(input_file_path, ".normal.wav")
  print(output_audio_file_path)

  measured_i = analysis_results_list[0]
  measured_TP = analysis_results_list[1]
  measured_LRA = analysis_results_list[2]
  measured_thresh = analysis_results_list[3]
  offset = analysis_results_list[4]

  ffmpeg_adjustments = 'ffmpeg -y -i '
  # Use the function's own argument (not the notebook-level audio_file_path),
  # shell-quoted so unusual file names are safe.
  ffmpeg_adjustments += shlex.quote(input_file_path)
  ffmpeg_adjustments += ' -af loudnorm=I=-16:TP=-1.5:LRA=11:'
  ffmpeg_adjustments += 'measured_I=' + str(measured_i)
  # True peak must come from its own measurement, not the integrated loudness.
  ffmpeg_adjustments += ':measured_TP=' + str(measured_TP)
  ffmpeg_adjustments += ':measured_LRA=' + str(measured_LRA)
  ffmpeg_adjustments += ':measured_thresh=' + str(measured_thresh)
  ffmpeg_adjustments += ':offset=' + str(offset)
  ffmpeg_adjustments += ':linear=true:print_format=summary '
  ffmpeg_adjustments += shlex.quote(output_audio_file_path)

  print(ffmpeg_adjustments)
  execute_ffmpeg(ffmpeg_adjustments)

  return output_audio_file_path

In [None]:
# First loudnorm pass: collect the loudness measurements for the extracted audio.
analysis_results_list = analyze_audio_file(audio_file_path)

In [None]:
# Second loudnorm pass: write the level-normalized audio using those measurements.
normal_audio_file_path = normalize_audio_file(audio_file_path, analysis_results_list)

# convert audio to text

## Get a key file for communicating with Speech-to-text and cloud storage

In [None]:
# Service-account key files are kept on Drive, one JSON file per account name.
key_file_path = "/content/drive/MyDrive/gcp-keys/"
service_account_keyfile = key_file_path + service_account_user_name + ".json"

# Only create a new key when none is stored yet.
if not os.path.isfile(service_account_keyfile):
  print ( "generating keyfile (limited number available, delete in console if necessary)" )

  # Authenticate project user
  from google.colab import auth
  auth.authenticate_user()

  # create keyfile for <service_account_user_name>@<project_id>.iam.gserviceaccount.com
  !gcloud config set project {project_id}
  iam_account = service_account_user_name + "@" + project_id + ".iam.gserviceaccount.com"
  !gcloud iam service-accounts keys create {service_account_keyfile} --iam-account={iam_account}

else:
  print("keyfile found")


In [None]:
# Google Cloud Speech-to-Text client library
from google.cloud import speech

# Google Cloud Storage client library
from google.cloud import storage

# Used to turn the speech-to-text response into a plain dict
import json

# Point the gcloud CLI at the configured project.
!gcloud config set project {project_id}

# I DONT THINK I SHOULD HAVE TO DO THIS:
# Authenticate project user
# NOTE(review): the clients below authenticate from the key file; this
# interactive login may only be needed by the gcloud/gsutil commands — confirm.
from google.colab import auth
auth.authenticate_user()


# Instantiate both clients from the service-account key file.
speech_client = speech.SpeechClient.from_service_account_json(service_account_keyfile)
storage_client = storage.Client.from_service_account_json(service_account_keyfile)

## transfer file to gcs

In [None]:
def upload_to_gcs(input_file_path):
  """Copy a local file into the project's Cloud Storage folder.

  The destination is gs://<bucket_name>/<content_name>/, built from the
  notebook-level settings. The copy is done with the gsutil command line tool.

  Args:
    input_file_path: local path of the file to upload.

  Returns:
    The gs:// URI of the uploaded object (destination folder + file name).
  """
  gcs_audio_uri="gs://" + bucket_name + "/" + content_name + "/"
  # The destination ends in "/", so gsutil keeps the source file name.
  !gsutil cp {input_file_path} {gcs_audio_uri}
  gcs_audio_uri += os.path.basename(input_file_path)

  return gcs_audio_uri


In [None]:
# Upload the normalized audio and keep its gs:// URI for the speech request.
gcs_audio_uri = upload_to_gcs(normal_audio_file_path)
print(gcs_audio_uri)

## pass URI to STT, get dict

In [None]:
def convert_to_text(gcs_audio_uri, timeout=120, language_code="en-US"):
  """Transcribe an audio file stored in Cloud Storage and return the result.

  Sends a long-running recognition request through the notebook-level
  speech_client and waits for it to finish. The request also asks the
  service to write the transcript to a ".json" object next to the audio.

  Args:
    gcs_audio_uri: gs:// URI of a 16 kHz LINEAR16 audio file.
    timeout: seconds to wait for the operation before giving up. Defaults to
      the previously hard-coded 120; raise it for longer recordings.
    language_code: language tag of the speech; defaults to "en-US".

  Returns:
    The recognition response converted to a plain dict (via its JSON form),
    including per-word time offsets.
  """
  gcs_text_uri = change_file_extension(gcs_audio_uri, '.json')

  audio = speech.RecognitionAudio(
    uri=gcs_audio_uri
    )

  config = speech.RecognitionConfig(
      encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
      sample_rate_hertz=16000,
      language_code=language_code,
      # Word timings are what the re-cut step later uses to slice the video.
      enable_word_time_offsets=True,
      )

  output_config = speech.TranscriptOutputConfig(
      gcs_uri=gcs_text_uri
      )

  request = speech.LongRunningRecognizeRequest(
      audio=audio, config=config, output_config=output_config
      )

  # Asynchronous request:
  # https://cloud.google.com/speech-to-text/docs/basics?authuser=0#async-responses
  # The call returns a long-running Operation immediately; the transcript is
  # only available once that operation completes.
  operation = speech_client.long_running_recognize(request=request)

  print("Waiting for operation to complete...")
  result = operation.result(timeout=timeout)
  print("Completed.")

  # Round-trip through JSON to get plain dicts/lists/strings.
  result_as_dict =  json.loads(type(result).to_json(result))

  return result_as_dict


## Submit request to Speech-to-Text

In [None]:
# Transcribe the uploaded audio; response_dict holds the results with word timings.
response_dict = convert_to_text(gcs_audio_uri)

# Process text result

## Concatenate results

In [None]:
from decimal import Decimal

def remove_suffix(input_string, suffix):
    """Return input_string without a trailing suffix, if it has one.

    An empty (or otherwise falsy) suffix, or a string that does not end with
    the suffix, leaves input_string unchanged.
    """
    has_suffix = bool(suffix) and input_string.endswith(suffix)
    if not has_suffix:
        return input_string
    kept_length = len(input_string) - len(suffix)
    return input_string[:kept_length]

def convert_word(word):
  """Reduce a speech-to-text word entry to its text, start and duration.

  The entry's 'startTime' and 'endTime' strings (seconds with a trailing
  "s") are parsed as decimals and rounded to one decimal place.

  Returns:
    A dict with keys 'word', 'start' and 'duration'; the two timings are
    strings, and 'duration' is the rounded end minus the rounded start.
  """
  start, end = (
      round(Decimal(remove_suffix(word[time_key], 's')), 1)
      for time_key in ('startTime', 'endTime')
  )

  return {
      'word': word['word'],
      'start': str(start),
      'duration': str(end - start),
  }

In [None]:
word_list = []
transcript_pieces = []

# Each result covers a consecutive portion of the audio. Walk them in order,
# taking the first alternative of each, to cover the whole file.
for result in response_dict['results']:
  first_alternative = result['alternatives'][0]
  transcript_pieces.append(first_alternative['transcript'] + " ")
  for word in first_alternative['words']:
    word_list.append(convert_word(word))

full_transcript = "".join(transcript_pieces)

print ("\ntranscript:")
print(full_transcript)
print(word_list)


In [None]:
def sortfunc(word):
  """Sort key for cleaned word entries: the entry's 'word' text."""
  spoken_text = word['word']
  return spoken_text

# New list ordered by word text; word_list itself keeps its spoken order.
alphabetical_list = sorted(word_list, key=sortfunc)

# reassemble with ffmpeg

In [None]:
def get_ffmpeg_input(file_name, word, easing):
  """Build the ffmpeg input arguments that cut one word out of a file.

  The clip is widened by `easing` seconds on both sides of the word. If that
  would start before the beginning of the file, the start is clamped to 0 and
  the duration shortened accordingly, so the clip still ends `easing` seconds
  after the word.

  Args:
    file_name: path of the media file to cut from.
    word: dict with 'start' and 'duration' given in seconds (numeric strings).
    easing: padding in seconds added before and after the word.

  Returns:
    A string of the form " -ss <start> -t <duration> -i <file>", with both
    times rounded to two decimal places and the file name shell-quoted.
  """
  # Local import: shlex is not imported elsewhere in the notebook.
  import shlex

  # Go through str() so a float such as 0.1 becomes exactly Decimal('0.1').
  easing_amount = Decimal(str(easing))
  word_start = Decimal(word['start'])
  word_end = word_start + Decimal(word['duration'])

  padded_start = max(Decimal(0), word_start - easing_amount)
  padded_duration = word_end + easing_amount - padded_start

  ffmpeg_input = " -ss "
  ffmpeg_input += str(round(padded_start, 2))
  ffmpeg_input += " -t "
  ffmpeg_input += str(round(padded_duration, 2))
  ffmpeg_input += " -i "
  ffmpeg_input += shlex.quote(file_name)

  return (ffmpeg_input)

def get_ffmpeg_filter(filter_list):
  """Build the -filter_complex / -map arguments that join all clips.

  For each of the len(filter_list) inputs, streams 0 and 1 are fed to a
  single concat filter producing one video ([v]) and one audio ([a]) output,
  which are then mapped to the output file.

  Args:
    filter_list: one entry per ffmpeg input; only its length is used.

  Returns:
    The argument string, with a leading and a trailing space.
  """
  clip_count = len(filter_list)
  # assumes stream 0 of every input is video and stream 1 is audio
  stream_labels = "".join(
      "[{0}:0][{0}:1]".format(clip_index) for clip_index in range(clip_count)
  )

  return (
      ' -filter_complex "' + stream_labels
      + ' concat=n=' + str(clip_count) + ':v=1:a=1 [v] [a]"'
      + ' -map "[v]" -map "[a]" '
  )

# One trimmed input per word (0.1 s of padding each side), in alphabetical order.
clip_inputs = "".join(
    get_ffmpeg_input(normal_file_path, entry, 0.1) for entry in alphabetical_list
)

# Full command: all inputs, the concat filter over them, then the output path.
ffmpeg_cmd = "ffmpeg -y" + clip_inputs + get_ffmpeg_filter(alphabetical_list) + output_file

print(ffmpeg_cmd)

# Show the word order the output video will follow.
for word in alphabetical_list:
  print(word['word'])

execute_ffmpeg(ffmpeg_cmd)

# words.sort(key=get_word)

# input_video = "gold-watch.mp4"
# output_video = input_video.replace('.mp4', '-sorted.mp4')

# ffmpeg_filter = " -filter_complex "

# num_clips = 10

# # for word in words:
# for x in range(0, len(words)): 
#   ffmpeg_cmd = ffmpeg_cmd + get_ffmpeg_token(words[x])
#   ffmpeg_filter = ffmpeg_filter + "["+str(x)+":0]["+str(x)+":1]"



# print (ffmpeg_cmd + ffmpeg_filter + '"concat=n='+str(len(words))+':v=1:a=1[outv][outa]" -map [outv] -map [outa] ' + output_video)


In [None]:
# Hard-coded ffmpeg run with the same shape as the command assembled in the
# previous cell: 13 clips cut from sample.normal.mp4, concatenated into
# sample.output.mp4. The paths and timings are fixed to the 'sample' content.
!ffmpeg -y -ss 1.9 -t 0.1 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 7.0 -t 1.1 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 1.7 -t 0.2 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 3.3 -t 0.8 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 2.4 -t 0.9 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 2.0 -t 0.4 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 8.1 -t 0.6 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 5.2 -t 0.6 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 9.3 -t 0.6 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 1.2 -t 0.5 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 4.1 -t 1.1 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 8.7 -t 0.6 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -ss 5.8 -t 1.2 -i /content/drive/MyDrive/DX451/Rules/sample/sample.normal.mp4 -filter_complex "[0:0][0:1][1:0][1:1][2:0][2:1][3:0][3:1][4:0][4:1][5:0][5:1][6:0][6:1][7:0][7:1][8:0][8:1][9:0][9:1][10:0][10:1][11:0][11:1][12:0][12:1] concat=n=13:v=1:a=1 [v] [a]" -map "[v]" -map "[a]" /content/drive/MyDrive/DX451/Rules/sample/sample.output.mp4

# Text-To-Speech

In [None]:
from gtts import gTTS
from IPython.display import Audio

# Build the text from the alphabetically sorted word entries. The names this
# cell used before (`words`, `get_word`) are not defined by any live code in
# the notebook, so it failed on a fresh run.
# NOTE(review): alphabetical order assumed to be the intent; switch to
# word_list for the original spoken order.
text_to_speak = ""

for word in alphabetical_list:
  text_to_speak = text_to_speak + " " + word['word']


tts = gTTS(text_to_speak)
# gTTS produces MP3 data, so the file is named accordingly.
sound_file = '1.mp3'
tts.save(sound_file)
Audio(sound_file, autoplay=True)

In [None]:
# Show the exact text that was sent to the speech synthesizer.
print (text_to_speak)