In [None]:
# !pip install google-api-python-client
# !pip install google-auth
# !pip install google-auth-oauthlib
# !pip install google-auth-httplib2
# !pip install youtube-transcript-api
# !pip install moviepy librosa
# !pip install pytube
# !pip install --upgrade librosa
# !pip install opencv-python
# !pip install --upgrade numpy
# !pip install isodate
# !pip install moviepy
# !pip install googletrans==4.0.0-rc1

In [None]:
import google.auth
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import openai, base64, os, time, json, sys
from pytube import YouTube
from moviepy.video.io.VideoFileClip import VideoFileClip, AudioFileClip
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import re
from isodate import parse_duration
from pydub import AudioSegment
import speech_recognition as sr
from wrapt_timeout_decorator import timeout
from gtts import gTTS
from googletrans import Translator

In [None]:
# Location of the JSON file that holds the API credentials used by this notebook.
path = "./secret_keys.json"

# Load the credentials into a dict; individual keys are read from it in the next cell.
with open(path, 'r', encoding='utf-8') as file:
    secret_keys = json.load(file)

In [None]:
# Unpack the individual credentials from the loaded secrets mapping.
youtube_api = secret_keys['youtube_api']  # used as developerKey when building the YouTube client
openai_api = secret_keys['openai_api']  # assigned to openai.api_key by the GPT helper functions
google_custom_search_apikey = secret_keys['google_custom_search_apikey']  # not referenced in the visible cells
search_engine_id = secret_keys['search_engine_id']  # not referenced in the visible cells

In [None]:
#Functions

#Functions to extract YouTube video content and subtitles, and to download videos.
def get_video_details(channel_id, max_videos=50):
    '''Return the playlist items (``snippet`` part) of a channel's uploads playlist.

    Parameters
    ----------
    channel_id : str
        ID of the YouTube channel to inspect.
    max_videos : int, optional
        Upper bound on the number of items returned. Defaults to 50, which is
        the cap the function previously hard-coded.

    Returns
    -------
    list of dict
        Playlist items as returned by ``playlistItems().list``. An empty list
        is returned (and a message printed) when the channel cannot be found.
    '''
    # Set up YouTube API service
    youtube = build('youtube', 'v3', developerKey=youtube_api)

    # Get the playlist ID of the uploads playlist for the channel
    response = youtube.channels().list(part="contentDetails", id=channel_id).execute()
    channels = response.get("items")
    if not channels:
        # An unknown channel ID yields no items; report it instead of crashing on [0].
        print(f"Channel '{channel_id}' not found.")
        return []
    playlist_id = channels[0]["contentDetails"]["relatedPlaylists"]["uploads"]

    # Page through the uploads playlist until enough items are collected.
    page_limit = 50  # page size requested per call, as in the original code
    videos = []
    next_page_token = None
    while len(videos) < max_videos:
        playlist_items = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=min(page_limit, max_videos - len(videos)),
            pageToken=next_page_token,
        ).execute()

        videos.extend(playlist_items.get("items", []))
        next_page_token = playlist_items.get("nextPageToken")

        if not next_page_token:
            break
    return videos[:max_videos]

def get_youtube_video_info(api_key, video_id):
    '''Fetch snippet, contentDetails and statistics for one YouTube video.

    Parameters
    ----------
    api_key : str
        YouTube Data API key.
    video_id : str
        ID of the video to look up.

    Returns
    -------
    dict or None
        The first item of the API response, or ``None`` (after printing
        "Video not found.") when the response contains no items.
    '''
    # Set up YouTube API service
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Get video details
    request = youtube.videos().list(part='snippet,contentDetails,statistics', id=video_id)
    response = request.execute()

    # Checking only for the key is not enough: an empty list would make [0] raise IndexError.
    items = response.get('items')
    if not items:
        print("Video not found.")
        return None
    return items[0]
                
def search_videos(api_key, query, max_results=5):
    '''Search YouTube for videos matching ``query``.

    Parameters
    ----------
    api_key : str
        YouTube Data API key.
    query : str
        Keywords to search for.
    max_results : int, optional
        Maximum number of results requested from the API (default 5).

    Returns
    -------
    list of dict
        The search result items. When nothing matches, "No videos found." is
        printed and an empty list is returned, so callers can test the result
        for truthiness.
    '''
    # Set up YouTube API service
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Search for videos based on keywords
    request = youtube.search().list(
        part='snippet',
        q=query,
        type='video',
        maxResults=max_results)

    response = request.execute()

    # Treat a missing key and an empty list alike so the message is printed in both cases.
    videos = response.get('items') or []
    if not videos:
        print("No videos found.")
    return videos
        
def convert_duration_to_seconds(duration):
    '''Convert an ISO 8601 duration string (e.g. "PT10M32S") into whole seconds.'''
    # parse_duration gives an object exposing total_seconds(); int() drops any fraction.
    return int(parse_duration(duration).total_seconds())
    
def get_video_transcript(video_id, languages=('en',)):
    '''Fetch the subtitles of one or more YouTube videos.

    Parameters
    ----------
    video_id : str or list of str
        A single video ID, or a list of video IDs.
    languages : sequence of str, optional
        Language codes in order of preference. Defaults to ``('en',)``.

    Returns
    -------
    list, tuple or None
        * For a single ID string: whatever ``YouTubeTranscriptApi.get_transcript``
          returns (a list of subtitle entries).
        * For a list of IDs: whatever ``YouTubeTranscriptApi.get_transcripts``
          returns; the notebook indexes it as ``result[0][video_id]``.
        * ``None`` if the lookup raised; the error is printed.
    '''
    try:
        if isinstance(video_id, str):
            # A bare string used to fail inside get_transcripts, which expects a list.
            return YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
        return YouTubeTranscriptApi.get_transcripts(list(video_id), languages=languages)
    except Exception as e:
        print(f"Error: {e}")
        return None

def save_transcript_to_file(transcript, output_file):
    '''Write every subtitle entry to ``output_file`` (UTF-8).

    Each entry produces a "start - end" line, the entry text, and a blank line.
    '''
    with open(output_file, 'w', encoding='utf-8') as handle:
        for cue in transcript:
            begin = cue['start']
            finish = begin + cue['duration']
            handle.write(f"{begin} - {finish}\n")
            handle.write(f"{cue['text']}\n\n")

def combine_transcript(transcript):
    '''Concatenate the ``text`` of every subtitle entry into one string.

    Each piece is prefixed with a single space, so a non-empty result starts
    with a space.
    '''
    return ''.join(f" {cue['text']}" for cue in transcript)

def download_youtube_video(video_url, output_path='.'):
    '''Download the video at ``video_url`` into ``output_path``.

    The stream is chosen with ``get_highest_resolution()``. Any exception is
    caught and printed; the function always returns ``None``.
    '''
    try:
        # Resolve the video, pick its highest-resolution stream and save it.
        best_stream = YouTube(video_url).streams.get_highest_resolution()
        best_stream.download(output_path)
    except Exception as err:
        print(f"Error: {err}")
    else:
        print(f"Video downloaded successfully to {output_path}")
        
def download_and_analyze_audio(video_id, output_path='audio_files'):
    '''Download a video's audio track and estimate its tempo in BPM.

    The audio-only mp4 stream is downloaded into ``output_path``, converted to
    MP3, loaded with librosa and passed to ``librosa.beat.beat_track``. Both
    intermediate files are removed afterwards, including when a step fails.

    Parameters
    ----------
    video_id : str
        YouTube video ID.
    output_path : str, optional
        Directory used for the temporary audio files.

    Returns
    -------
    int or None
        The tempo rounded to the nearest integer, or ``None`` if any step
        raised (the error is printed).
    '''
    mp4_path = mp3_path = None
    try:
        # Construct the YouTube video URL
        video_url = f'https://www.youtube.com/watch?v={video_id}'

        # Create a YouTube object
        yt = YouTube(video_url)

        # Get the first audio-only mp4 stream
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
        if audio_stream is None:
            print("Error: no audio-only mp4 stream is available for this video.")
            return None

        # Use the path reported by download() rather than rebuilding it from yt.title:
        # the file name on disk is not guaranteed to equal the raw title.
        mp4_path = audio_stream.download(output_path)
        mp3_path = os.path.splitext(mp4_path)[0] + '.mp3'

        # Convert the downloaded audio to MP3, always releasing the clip afterwards
        audio_clip = AudioFileClip(mp4_path)
        try:
            audio_clip.write_audiofile(mp3_path)
        finally:
            audio_clip.close()

        # Load the converted audio and compute the tempo
        samples, sample_rate = librosa.load(mp3_path)
        tempo, _ = librosa.beat.beat_track(y=samples, sr=sample_rate)

        # beat_track may hand back a scalar or a one-element array; normalise before rounding.
        return round(float(np.ravel(tempo)[0]))
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        # Delete the .mp4 and .mp3 files after use to free up space, even on failure.
        for leftover in (mp4_path, mp3_path):
            if leftover and os.path.exists(leftover):
                try:
                    os.remove(leftover)
                except OSError as cleanup_error:
                    print(f"Error: {cleanup_error}")


#Functions for executing text analysis and processor (classification, summarization, topic modelling).
def _ask_gpt(system_prompt, instruction, information):
    '''Send ``information`` followed by ``instruction`` to gpt-3.5-turbo and return the reply text.

    Shared implementation behind the ``gpt_*`` helpers below. The chat
    consists of a system message, a user message holding the text to analyse,
    and a second user message holding the instruction.

    Parameters
    ----------
    system_prompt : str
        Content of the system message.
    instruction : str
        Content of the final user message (what to do with the text).
    information : object
        Text to analyse; it is formatted into a string before sending.

    Returns
    -------
    str
        ``content`` of the first choice in the chat completion.
    '''
    openai.api_key = openai_api

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{information}"},
        {"role": "user", "content": instruction},
    ]

    # Temperature 0.0 is kept from the original calls, as is the timeout value.
    chat = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.0,
        timeout=5
    )
    return chat.choices[0].message.content


def gpt_punctuator(information):
    '''Ask GPT-3.5 to punctuate ``information`` and return the punctuated text.'''
    return _ask_gpt(
        "you are a text analyst assistant. Your job is to punctuate a given text and output only the resulting punctuated text without omiting a single word.",
        "Please properly punctuate the given text (without omitting a single word) and output only the resulting punctuated text. Please do not omit a single word from the original text.",
        information,
    )


def gpt_categorizer(information):
    '''Ask GPT-3.5 to rate a tutorial transcript as Basic, Medium or Advanced and return its answer.'''
    return _ask_gpt(
        "you are a text analyst assistant. Given a text to analyze, you're to only respond with 'Basic','Medium', or 'Advanced'.",
        "Given the text which is a transcript of a language tutorial video, which category of difficulty (Basic, Medium and Advanced) best describes what is being taught? Output only the category and nothing else.",
        information,
    )


def gpt_summarizer(information):
    '''Ask GPT-3.5 to summarize a tutorial transcript in 5 to 10 sentences and return the summary.'''
    return _ask_gpt(
        "you are a text analyst assistant. Given a text to analyze, you're to summarize the content in a few sentences.",
        "Given the text which is a transcript of a language tutorial video, please summarize the content in 5 to 10 sentences.",
        information,
    )


def gpt_topicmodeller(information):
    '''Ask GPT-3.5 for a single topic describing a tutorial transcript and return it.'''
    return _ask_gpt(
        "you are a text analyst assistant. Given a text to analyze, you're to generate a single topic that best represent the contents within.",
        "Given the text which is a transcript of a language tutorial video, please generate a single topic that describes the content being taught. Output only this topic.",
        information,
    )


def gpt_qualitycheck(information):
    '''Ask GPT-3.5 how well articulated a tutorial transcript is and return its answer.'''
    return _ask_gpt(
        "you are a text analyst assistant. Given a text to analyze, you're to respond with only 'Poorly articulated','Moderately articulated' or 'Very articulated'.",
        "Given the text which is a transcript of a language tutorial video, is the content 'Poorly articulated', 'Moderately articulated', or 'Very articulated'? Output only the category and nothing else.",
        information,
    )


#Functions for extracting the audio from the downloaded video and analyzing this audio.        
def extract_audio(video_path, audio_path):
    '''Extract the audio track of a video file and write it to ``audio_path``.

    Parameters
    ----------
    video_path : str
        Path of the downloaded video.
    audio_path : str
        Destination path of the audio file; written at 44100 Hz.

    Raises
    ------
    ValueError
        If the video clip exposes no audio track (``clip.audio`` is ``None``).
    '''
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"'{video_path}' has no audio track to extract.")
        audio_clip.write_audiofile(audio_path, fps=44100)  # Set the desired sample rate
    finally:
        # Release the clip's resources whether or not the export succeeded.
        video_clip.close()

def analyze_audio(audio_path):
    '''Load an audio file and display its waveform followed by its spectrogram.'''
    samples, sample_rate = librosa.load(audio_path)

    # First figure: the raw waveform.
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(y=samples, sr=sample_rate)
    plt.title('Waveform')
    plt.show()

    # Second figure: STFT magnitude converted to dB (reference: np.max), log-frequency axis.
    magnitude = np.abs(librosa.stft(samples))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(spectrogram_db, sr=sample_rate, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.show()
    
def analyze_audio_speed(audio_path):
    '''Print the tempo that librosa estimates for an audio file, then display its waveform.'''
    samples, sample_rate = librosa.load(audio_path)

    # beat_track returns a pair; only its first element (the tempo) is reported.
    bpm, _beats = librosa.beat.beat_track(y=samples, sr=sample_rate)
    print(f'Tempo: {bpm} BPM')

    # Show the waveform for visual reference.
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(samples, sr=sample_rate)
    plt.title('Waveform')
    plt.show()
   
   
#Functions to analyze the image frames extracted from the downloaded video
def list_files_in_folder(folder_path):
    '''Print and return the names of the entries inside ``folder_path``.

    Returns a list of entry names as strings. If the folder does not exist or
    cannot be read, a message is printed and ``None`` is returned.
    '''
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"The folder '{folder_path}' does not exist.")
        return None
    except PermissionError:
        print(f"Permission denied to access '{folder_path}'.")
        return None

    # Echo every entry while collecting its name.
    print(f"Contents of {folder_path}:")
    names = []
    for item in entries:
        names.append(str(item))
        print(item)
    return names
        
        
def gpt_V_image_analyser(image_name, frames_dir=None):
    '''Ask GPT-4 Vision whether an extracted frame contains infographics.

    The image is read from ``frames_dir``, base64-encoded and sent as a data
    URL together with a Yes/No question.

    Parameters
    ----------
    image_name : str
        File name of the frame inside ``frames_dir``.
    frames_dir : str, optional
        Folder holding the frames. Defaults to ``./output_frames``, built with
        ``os.path.join`` so the path is valid on every platform.

    Returns
    -------
    str
        The model's reply (also printed).
    '''
    import mimetypes

    if frames_dir is None:
        frames_dir = os.path.join(".", "output_frames")
    image_path = os.path.join(frames_dir, image_name)

    # Read and encode the image in base64
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    # Label the data URL with the file's own image type; fall back to JPEG as before.
    mime_type = mimetypes.guess_type(image_path)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Craft the prompt for GPT
    prompt_messages = [{"role": "user",
                    "content": [{"type": "text", "text": "Does this image contain any infographics? Reply with only 'Yes' or 'No' and no added punctuations."},
                                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"}}]
                   }]

    # Send a request to GPT
    params = {
        "model": "gpt-4-vision-preview",
        "messages": prompt_messages,
        "api_key": openai_api,
        "headers": {"Openai-Version": "2020-11-07"},
        "max_tokens": 4096,
    }

    result = openai.ChatCompletion.create(**params)
    answer = result.choices[0].message.content
    print(answer)
    return answer

---

In [None]:
%%time
# Extract video details using its video ID
video_id = 'axYAW7PuSIM'
video_info = get_youtube_video_info(youtube_api, video_id)
video_info['id']

In [None]:
language = video_info['snippet']['defaultAudioLanguage'].split('-')[0]
language

In [None]:
%%time
# Extracts the top 2 videos using a given keyword

# Keywords to search for
keywords = 'English Tutorial'

# Maximum number of results to retrieve
max_results = 2

videos = search_videos(youtube_api, keywords, max_results)

# Per-video results collected by this notebook; later cells add to these entries.
# It was never initialised before, so this cell failed with NameError on a fresh kernel.
overall_dictionary = {}

# search_videos may return None or an empty list when nothing is found.
for index, video in enumerate(videos or []):
    overall_dictionary[f"Video_{index}"] = {
        'ID': video['id']['videoId'],
        'Details': video,
    }

    print("Video Title:", video['snippet']['title'])
    print("Channel:", video['snippet']['channelTitle'])
    print("Video ID:", video['id']['videoId'])
    print("Published At:", video['snippet']['publishedAt'])
    print("\n")

In [None]:
# Example usage:
video_url = "https://www.youtube.com/watch?v=axYAW7PuSIM"  # Replace with the actual YouTube video URL
# video_id = ["E4h-8rw2GlY"]
video_id = ["_F6LG3TYnFQ"]
language_code = ['en']  # Replace with the desired language code (e.g., "en" for English)

# get_video_transcript returns None when the lookup fails; stop with a clear message
# instead of failing on the indexing below.
transcripts = get_video_transcript(video_id, language_code)
if transcripts is None:
    raise RuntimeError(f"Could not retrieve a transcript for {video_id[0]}")

# For a list of IDs the result is indexed as [0][video_id] to reach one video's entries.
video_transcript = transcripts[0][video_id[0]]

In [None]:
youtube_duration = "PT10M32S" 
seconds = convert_duration_to_seconds(youtube_duration)
print(f"Duration in seconds: {seconds} seconds")

In [None]:
# Estimate the speaking rate (words per minute) from the transcript timings.
# Each category maps to a [lower, upper) range of words per minute.
speed_categories = {'Slow Speech':[0,110],'Normal Speech':[110,150],'Fast Speech':[150,200]}

combined_duration = 0.0
number_of_words = 0
for text in video_transcript:
    if text['text'] != '[Music]':
        # Keep fractional seconds: truncating every entry shortens the total
        # and inflates the words-per-minute figure.
        combined_duration += float(text['duration'])
        # split() without arguments ignores repeated spaces and line breaks.
        number_of_words += len(text['text'].split())

# Guard against a transcript with no spoken entries.
if combined_duration > 0:
    words_per_minute = round(number_of_words/(combined_duration/60))
else:
    words_per_minute = 0

# Rates at or above the last upper bound are still fast speech; without this default
# audio_speed would be left undefined for them.
audio_speed = 'Fast Speech'
for categ, (lower, upper) in speed_categories.items():
    if lower <= words_per_minute < upper:
        audio_speed = categ
        break
print(combined_duration, number_of_words, f"{words_per_minute} WPM", audio_speed)

In [None]:
combined_subt = combine_transcript(video_transcript)
combined_subt

In [None]:
len(combined_subt.split(' '))

In [None]:
%%time
# Extracts the subtitles of one of the extracted videos

# ID of the second search result
video_id = overall_dictionary["Video_1"]['ID']

# Replace 'output.txt' with the desired output file name
output_file = 'output.txt'

# Get video transcript. get_video_transcript needs a list of IDs plus the language
# codes, and its result is indexed as [0][video_id] to reach one video's entries.
transcripts = get_video_transcript([video_id], ['en'])
video_transcript = transcripts[0].get(video_id) if transcripts else None

if video_transcript:
    # Combine the subtitle entries into one string and store it with the video
    combined_subt = combine_transcript(video_transcript)
    overall_dictionary["Video_1"]['Subtitle'] = combined_subt
    print(f"Retrieved transcript.")
else:
    print("Failed to retrieve transcript.")

In [None]:
#combined_subt

In [None]:
%%time

# Preprocesses the subtitle, so that GPT can process it without truncating it.
CHUNK_CHAR_LIMIT = 6000    # characters (spaces not counted) per punctuation request
MAX_ANALYSIS_WORDS = 3000  # words kept for the later analysis prompts

split_subtitle = combined_subt.split(' ')
print(f"Number of words: {len(split_subtitle)}")

# Group the words into chunks of roughly CHUNK_CHAR_LIMIT characters.
subtitle_list, punct_subt_list = [], []
chunk_words, count = [], 0
for word in split_subtitle:
    chunk_words.append(word)
    count += len(word)
    if count >= CHUNK_CHAR_LIMIT:
        subtitle_list.append(' ' + ' '.join(chunk_words))
        chunk_words, count = [], 0
# Add the remainder only if it holds text; an empty chunk would waste a GPT request.
if any(chunk_words):
    subtitle_list.append(' ' + ' '.join(chunk_words))

# Punctuate every chunk separately.
for part_sub in subtitle_list:
    print(f"Length of text being analysed: {len(part_sub)}")
    combined_subt_punct = gpt_punctuator(part_sub)
    punct_subt_list.append(combined_subt_punct)

final_combined_punct_subt = " ".join(punct_subt_list)
overall_dictionary["Video_1"]['Punctuated Subtitle'] = final_combined_punct_subt

#Further preprocessing of text in subtitle so as to fit GPT's token limit.
split_info = []
for info in punct_subt_list:
    split_info = split_info + info.split(' ')

split_info = split_info[:MAX_ANALYSIS_WORDS]
trunc_string = " ".join(split_info)
print(len(split_info), len(trunc_string))

In [None]:
print(len(final_combined_punct_subt), len(combined_subt), len(trunc_string))

In [None]:
%%time

# Run the four GPT analyses on the truncated, punctuated subtitle
category = gpt_categorizer(trunc_string)
summary = gpt_summarizer(trunc_string)
topic = gpt_topicmodeller(trunc_string)
quality = gpt_qualitycheck(trunc_string)

# Store the answers alongside the other data collected for this video
overall_dictionary["Video_1"].update(
    {
        'Category': category,
        'Summary': summary,
        'Topic': topic,
        'Quality': quality,
    }
)

print(f'{category}\n\n{summary}\n\n{topic}\n\n{quality}')

In [None]:
#overall_dictionary

In [None]:
%%time
# Downloads the video using the video url and saves to pwd

# Replace 'VIDEO_URL' with the actual YouTube video URL
video_url = f"https://www.youtube.com/watch?v={overall_dictionary['Video_1']['ID']}"

# Replace '.' with the desired output directory
output_directory = '.'

download_youtube_video(video_url, output_directory)

In [None]:
%%time
# Extracts the audio file from the downloaded video

# Replace 'video.mp4' with the path to your downloaded video file
video_path = f"{overall_dictionary['Video_1']['Details']['snippet']['title']}.mp4"

# Replace 'extracted_audio.wav' with the desired audio output path
audio_path = 'extracted_audio.wav'

extract_audio(video_path, audio_path)
overall_dictionary["Video_1"]['Audio'] = audio_path

In [None]:
%%time
# Analyses the audio speed of the downloaded video
audio_path = overall_dictionary["Video_1"]['Audio']

analyze_audio_speed(audio_path)

In [None]:
%%time
# Extracts the list of image frames from the image frames folder

# Folder holding the extracted frames. Built with os.path.join because the previous
# literal '.\output_frames' contained the invalid escape sequence '\o' and was
# Windows-only.
folder_path = os.path.join('.', 'output_frames')

contents_list = list_files_in_folder(folder_path)

In [None]:
%%time
#Analyzes the content of the extracted frames for certain contents
#Note: in future implementation, images will be described and each description will then be fed back into GPT to decide if video is well articulated.

#GPT 4V Image Analysis
overall_dictionary["Video_1"]['Video_Content_Analysis'] = []
# list_files_in_folder returns None when the folder is missing or unreadable;
# treat that as "no frames" instead of failing on iteration.
for image in contents_list or []:
    response = gpt_V_image_analyser(image)
    overall_dictionary["Video_1"]['Video_Content_Analysis'].append(response)
    time.sleep(5)  # pause between consecutive requests

In [None]:
# Serialize everything collected in overall_dictionary to a JSON string
# (4-space indentation for readability).
overall_response_string = json.dumps(overall_dictionary, indent=4)  # Use indent for pretty formatting

# Write the JSON string to overall_response.json in the working directory,
# overwriting any existing file of that name.
with open("overall_response.json", "w") as json_file:
    json_file.write(overall_response_string)

In [None]:
# NOTE(review): mp4_path, mp3_path and wav_path_1..3 are not assigned in any cell visible
# in this notebook; presumably they come from cells that were removed — confirm before running.
# The loop variable also rebinds `path`, which the secrets-loading cell uses for the key file.
paths = [mp4_path, mp3_path, wav_path_1, wav_path_2, wav_path_3]
for path in paths:
    print(path)

In [None]:
def split_and_transcribe_audio(file_path, first_language, second_language, segment_duration_ms=4000):
    '''Transcribe an audio file segment by segment, trying two languages per segment.

    The audio is cut into pieces of ``segment_duration_ms`` milliseconds. Each
    piece is sent to Google speech recognition in ``first_language``; if that
    is not understood, ``second_language`` is tried. Transcriptions are
    printed as they arrive, followed by a summary line with the share of
    segments transcribed and the split between the two languages.

    Parameters
    ----------
    file_path : str
        Audio file to transcribe.
    first_language, second_language : str
        Language names, case-insensitive: 'english', 'italian' or 'french'.
    segment_duration_ms : int, optional
        Segment length in milliseconds (default 4000).

    Raises
    ------
    ValueError
        If either language name is not supported.
    '''
    language_isocode = {'english':'en-US', 'italian':'it-IT', 'french':'fr-FR'}
    language_list = []
    for language in (first_language, second_language):
        key = language.lower()
        if key not in language_isocode:
            raise ValueError(
                f"Unsupported language '{language}'; expected one of {sorted(language_isocode)}"
            )
        language_list.append(language_isocode[key])
    print(language_list)

    recognizer = sr.Recognizer()

    # Load the entire audio file
    audio = AudioSegment.from_file(file_path)

    # Ceiling division: a length that is an exact multiple of the segment size must not
    # produce an extra, empty trailing segment.
    num_segments = -(-len(audio) // segment_duration_ms)
    print(num_segments)

    # Temporary segment files live here; create the folder if it is missing.
    temp_dir = "audio_files"
    os.makedirs(temp_dir, exist_ok=True)

    hits = [0, 0]  # segments transcribed in the first / second language
    for i in range(num_segments):
        # Extract the segment
        start_time = i * segment_duration_ms
        segment = audio[start_time:start_time + segment_duration_ms]

        temp_file_path = f"{temp_dir}/temp_segment_{i}.wav"
        try:
            # Save the segment to a temporary file and read it back for recognition
            segment.export(temp_file_path, format="wav")
            with sr.AudioFile(temp_file_path) as audio_file:
                audio_data = recognizer.record(audio_file)

            # Try the languages in order; the first one that is understood wins.
            text = None
            for index, code in enumerate(language_list):
                try:
                    text = recognizer.recognize_google(audio_data, language=code)
                except sr.UnknownValueError:
                    continue
                hits[index] += 1
                break

            if text is None:
                print(f"Segment {i + 1} - Speech Recognition could not understand audio")
            else:
                print(f"Segment {i + 1} Transcription:", text)
        except sr.RequestError as e:
            # Covers both language attempts; previously a failure on the second
            # attempt escaped and left the temporary file behind.
            print(f"Segment {i + 1} - Could not request results from Google Speech Recognition service; {e}")
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    # Summary; the guards avoid dividing by zero when nothing was transcribed.
    transcribed = sum(hits)
    percentage_transcribed = round((transcribed/num_segments)*100) if num_segments else 0
    if transcribed:
        percentage_first = round((hits[0]/transcribed)*100)
        percentage_second = 100 - percentage_first
    else:
        percentage_first = percentage_second = 0

    # Label the shares with the languages actually requested rather than fixed "en"/"it".
    first_code, second_code = (code.split('-')[0] for code in language_list)
    print(f"Percentage transcribed: {percentage_transcribed}%, {first_code}: {percentage_first}%, {second_code}: {percentage_second}%")

In [None]:
# Example usage:
audio_file_path = "path/to/your/multilingual_audio.wav"  # Replace with the actual file path

# Languages expected in the recording, in the order they are tried for each segment.
# The names must be ones that split_and_transcribe_audio recognises.
first_language, second_language = 'English', 'French'

# The previous call targeted transcribe_multilingual_audio, which is not defined in this
# notebook; split_and_transcribe_audio is the transcription helper that exists here.
split_and_transcribe_audio(audio_file_path, first_language, second_language)

In [None]:
%%time
# Example usage:
# NOTE(review): split_and_transcribe_audio takes first_language and second_language with no
# defaults, so this call raises TypeError as written; wav_path_1 is also not assigned in any
# cell visible in this notebook — confirm both before running.
audio_file_path = wav_path_1  # Replace with the actual file path
split_and_transcribe_audio(audio_file_path)

In [None]:
%%time
# Example usage:
# NOTE(review): split_and_transcribe_audio takes first_language and second_language with no
# defaults, so this call raises TypeError as written; wav_path_2 is also not assigned in any
# cell visible in this notebook — confirm both before running.
audio_file_path = wav_path_2  # Replace with the actual file path
split_and_transcribe_audio(audio_file_path)

In [None]:
%%time
# Example usage:
# NOTE(review): split_and_transcribe_audio takes first_language and second_language with no
# defaults, so this call raises TypeError as written; wav_path_3 is also not assigned in any
# cell visible in this notebook — confirm both before running.
audio_file_path = wav_path_3  # Replace with the actual file path
split_and_transcribe_audio(audio_file_path)

In [None]:
wav_path_3

In [None]:
%%time
# Example usage:
audio_file_path = 'audio_files/Italian Conversation Practice for Beginners  Learn Italian_3.wav'  # Replace with the actual file path
split_and_transcribe_audio(audio_file_path, 'English', 'Italian')

In [None]:
%%time
# Example usage:
audio_file_path = 'audio_files/Learn French with TINTIN 1 (fr sub)_2.wav'  # Replace with the actual file path
split_and_transcribe_audio(audio_file_path, 'French', 'English')

In [None]:
def translate_text(text, source_language):
    '''Translate ``text`` from ``source_language`` into English and return the translated string.'''
    # A fresh Translator is created per call; only the .text of its result is returned.
    result = Translator().translate(text, src=source_language, dest='en')
    return result.text

In [None]:
# Text to translate
text_to_translate = "c'est Lille c'est une île et noir"

# Language code of the source text
source_language_code = 'fr'  # 'fr' is the language code for French

translated_text = translate_text(text_to_translate, source_language_code)
print(f"Original text: {text_to_translate}")
print(f"Translated text: {translated_text}")

In [None]:
def combine_transcript_translate(transcript, source_language):
    '''Translate every subtitle entry into English and join the results into one string.

    Entries whose text is exactly '[Music]' are skipped. Line breaks inside a
    translation are replaced by spaces, each translation is printed, and each
    piece is prefixed with a single space in the returned string.
    '''
    translator = Translator()

    pieces = []
    for cue in transcript:
        if cue['text'] == '[Music]':
            continue
        # Translate the text to English
        translated = translator.translate(cue['text'], src=source_language, dest='en')
        line = translated.text.replace('\n', ' ')
        print(line)
        pieces.append(f" {line}")
    return ''.join(pieces)

In [None]:
video_transcript

In [None]:
%%time
combined_transl_transcript = combine_transcript_translate(video_transcript, 'fr')

In [None]:
combined_transl_transcript

In [None]:
%%time
combined_transcript = combine_transcript(video_transcript)

In [None]:
video_transcript

In [None]:
# Pair every non-music subtitle with its index and source language.
list_of_subts = [(i, entry['text'], 'fr') for i, entry in enumerate(video_transcript) if entry['text'] != '[Music]']

# Batch size is about a quarter of the list. max(1, ...) keeps the slice step valid:
# with two or fewer subtitles the rounded quarter is 0 and range() would raise ValueError.
len_of_sublists = max(1, int(round(len(list_of_subts)/4)))
sublist_of_subts = [list_of_subts[i:i+len_of_sublists] for i in range(0, len(list_of_subts), len_of_sublists)]

In [None]:
#channel_id = "UC_x5XG1OV2P6uZZ5FSM9Ttw"
#channel_id = "UCoUWq2QawqdC3-nRXKk-JUw"
channel_id = "UClEGQZlQURxTiMdwBPqiKDQ"

# Fetch the channel's uploads and print a short summary of each one.
video_details = get_video_details(channel_id)

count = 0
for count, video in enumerate(video_details, start=1):
    snippet = video['snippet']
    print(f"Video Title: {snippet['title']}")
    print(f"Video ID: {snippet['resourceId']['videoId']}")
    print(f"Published At: {snippet['publishedAt']}")
    print("--------")

# Number of videos listed above
print(count)