In [4]:
import pytube as pt
import os
import re
from deep_translator import GoogleTranslator
import assemblyai as aai
import config as cfg

### define functions

In [5]:
def replace_special_chars(text):
    """
    Swap every run of non-alphanumeric characters for a single underscore.

    Anything other than an ASCII letter or digit (spaces, punctuation,
    non-Latin characters) counts as "special". Consecutive special characters
    collapse into one underscore; leading and trailing runs are replaced as
    well, not stripped.
    """
    non_alphanumeric_run = re.compile(r'[^a-zA-Z0-9]+')
    return non_alphanumeric_run.sub('_', text)

In [6]:
# Folder (relative to the working directory) for the files the cells below
# read and write: downloaded audio and transcript text files.
directory = 'files/'
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(directory, exist_ok=True)

### fetch video title from YouTube

In [3]:
# Look up the video's title, translate it to English, then make it
# filename-safe by collapsing spaces/punctuation into underscores.
video_url = "https://www.youtube.com/watch?v=_LYe58b-3HM"
yt = pt.YouTube(video_url)
title_name = yt.title
translated_title = GoogleTranslator(source='auto', target='en').translate(title_name)
translated_title_edited = replace_special_chars(translated_title)

In [11]:
# Fixed, hand-picked base name for the output files. The title-derived name
# (translated_title_edited) is deliberately not used, to keep file names simple.
video_name = 'me_and_others'

### download video

In [12]:
# video_name and video_url repeat the values assigned in earlier cells.
video_name = 'me_and_others'
video_url = "https://www.youtube.com/watch?v=_LYe58b-3HM"

yt = pt.YouTube(video_url)
# Keep only audio streams and take the first one in the returned list.
# NOTE(review): the '.wav' extension below only names the file; nothing here
# converts the stream's original audio format — confirm downstream tools
# detect the format from the file content.
stream = yt.streams.filter(only_audio=True)[0]
# os.path.join avoids the doubled separator ('files//...') that plain string
# concatenation produced, since `directory` already ends with '/'.
stream.download(filename=os.path.join(directory, video_name + '.wav'))

'/Users/vamsiuppala/Documents/translate-russian-video/files//me_and_others.wav'

### transcribe

In [15]:
# The AssemblyAI key is read from the local config module (cfg.keys), so it is
# not written into the notebook itself.
aai.settings.api_key = cfg.keys['aai_api_key']
# Transcriber object used by the transcription cell below.
transcriber = aai.Transcriber()

In [16]:
# Local path of the audio file saved by the download cell (despite the
# variable name, this is a file path, not a URL).
audio_url = "files/me_and_others.wav"

# speaker_labels=True asks for speaker-attributed utterances (read later via
# transcript.utterances); language_code='ru' selects Russian.
config = aai.TranscriptionConfig(speaker_labels=True, language_code='ru')

transcript = transcriber.transcribe(
    audio_url,
    config
)

# Fail here with the service's own message instead of letting the next cell
# crash on missing utterances when transcription did not succeed.
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

In [17]:
# Save the transcript as one "Speaker X: text" line per utterance.
# UTF-8 is set explicitly because the text is Russian and the platform default
# encoding may not be able to represent Cyrillic. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open(os.path.join(directory, 'me_and_others_transcribed' + '.txt'), 'w', encoding='utf-8') as file:
    for utterance in transcript.utterances:
        file.write(f"Speaker {utterance.speaker}: {utterance.text} \n")

### translate

In [30]:
# Read the saved transcript back in. UTF-8 is explicit so Cyrillic text is
# decoded the same way on every platform; the `with` block closes the file.
with open(os.path.join(directory, 'me_and_others_transcribed' + '.txt'), 'r', encoding='utf-8') as file:
    text = file.read()

# One chunk per line of the transcript file.
chunks = text.split('\n')

# Translate line by line with a single, reused translator instance.
# Blank chunks (e.g. the empty string after the final newline) are passed
# through unchanged rather than sent to the translation service, which keeps
# the line structure intact.
google_translator = GoogleTranslator(source='auto', target='en')
translated_chunks = [
    google_translator.translate(chunk) if chunk.strip() else chunk
    for chunk in chunks
]

# Re-assemble the lines and save the English version.
translated_transcript = '\n'.join(translated_chunks)
with open(os.path.join(directory, 'me_and_others_transcribed_and_translated' + '.txt'), 'w', encoding='utf-8') as file:
    file.write(translated_transcript)

### make it more colloquial

In [19]:
import config as cfg

In [20]:
from openai import OpenAI
# Module-level client that colloquialize() calls; the API key is read from the
# local config module (cfg.keys) rather than written into the notebook.
client = OpenAI(
    api_key = cfg.keys['openai_token']
)

In [21]:
def colloquialize(sys_def, model, content):
    """
    Run one chat completion and return the reply text.

    sys_def: system prompt that tells the model how to rewrite the text.
    model:   name of the chat model to call.
    content: text sent as the user message.

    Uses the module-level `client`. Returns the message content of the first
    choice in the response.
    """
    messages = [
        {"role": "system", "content": sys_def},
        {"role": "user", "content": content},
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    first_choice = response.choices[0]
    return first_choice.message.content

In [28]:
# System prompt passed to colloquialize() as `sys_def` for every chunk.
translator_sys = "You are a professional English language expert who can convert old formal English writeups into colloquial semi formal language. You maintain a semi professional tone and retain as much context as possible."

In [31]:
# Load the English translation. UTF-8 is explicit so the file is decoded the
# same way on every platform; the `with` block closes the file on exit.
with open(os.path.join(directory, 'me_and_others_transcribed_and_translated' + '.txt'), 'r', encoding='utf-8') as file:
    text = file.read()

# One chunk per line of the translated transcript.
chunks = text.split('\n')

# Rewrite each line in a more colloquial register. Blank chunks are passed
# through unchanged: sending an empty prompt would make the model invent text
# for a line that has no content.
colloquialized_chunks = [
    colloquialize(translator_sys, "gpt-3.5-turbo", chunk) if chunk.strip() else chunk
    for chunk in chunks
]

# Re-assemble the lines and save the result.
colloquialized_transcript = '\n'.join(colloquialized_chunks)
# NOTE: 'colloquialzied' (sic) is the existing output file name; it is kept
# as-is so the file this cell produces keeps the same name.
with open(os.path.join(directory, 'me_and_others_transcribed_translated_colloquialzied' + '.txt'), 'w', encoding='utf-8') as file:
    file.write(colloquialized_transcript)