In [12]:
import re
from urllib.parse import parse_qs, urlparse

from youtube_transcript_api import YouTubeTranscriptApi as yta

In [13]:
def print_time(search_word, time):
    """Print every timestamp at which ``search_word`` was found, as HH:MM:SS.

    Args:
        search_word: The term that was searched for; it is only used in the
            printed header line.
        time: Iterable of offsets in seconds (int or float) measured from the
            start of the video. Fractional seconds are truncated.

    Returns:
        None. All output is written to stdout.
    """
    timestamps = list(time)
    if not timestamps:
        # Avoid printing a "was mentioned at:" header followed by nothing.
        print(f"'{search_word}' was not mentioned.")
        return

    print(f"'{search_word}' was mentioned at:")
    for offset in timestamps:
        # Break whole seconds into h/m/s. The name `minutes` (rather than
        # `min`) keeps the builtin min() usable inside this function.
        minutes, seconds = divmod(int(offset), 60)
        hours, minutes = divmod(minutes, 60)
        print(f"{hours:02d}:{minutes:02d}:{seconds:02d}")

In [14]:
def extract_video_id(youtube_url):
    """Return the video ID contained in a YouTube URL, or None if there is none.

    Supported forms (with or without a scheme):
      * ``youtube.com/watch?v=<id>`` on any youtube.com subdomain, with the
        ``v`` parameter in any position of the query string
      * ``youtu.be/<id>``
      * ``youtube.com/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>``

    The URL is parsed with ``urllib.parse`` rather than searched for the
    substring ``v=``, so other parameters that merely end in ``v=`` are not
    mistaken for the ID, and a trailing ``#fragment`` is never included in
    the result.

    Args:
        youtube_url: The URL to inspect.

    Returns:
        The video ID as a string, or None when the URL is not a recognised
        YouTube video URL or carries no ID. A short explanation is printed
        in the None case.
    """
    if not isinstance(youtube_url, str) or not youtube_url.strip():
        print("Invalid YouTube URL")
        return None

    candidate = youtube_url.strip()
    try:
        parts = urlparse(candidate)
        if not parts.netloc:
            # Without a scheme urlparse puts the host into the path, so retry
            # with one for inputs such as "youtube.com/watch?v=...".
            parts = urlparse("https://" + candidate.lstrip("/"))
        host = (parts.hostname or "").lower()
    except ValueError:
        # Raised by urlparse for malformed hosts (e.g. a broken IPv6 literal).
        print("Invalid YouTube URL")
        return None

    segments = [segment for segment in parts.path.split("/") if segment]
    is_youtube = host == "youtube.com" or host.endswith(".youtube.com")

    if host == "youtu.be":
        video_id = segments[0] if segments else ""
    elif is_youtube and segments[:1] == ["watch"]:
        # parse_qs drops blank values, so "watch?v=" counts as "no ID".
        video_id = parse_qs(parts.query).get("v", [""])[0]
    elif is_youtube and segments[:1] in (["embed"], ["shorts"], ["live"]):
        video_id = segments[1] if len(segments) > 1 else ""
    else:
        print("Invalid YouTube URL")
        return None

    if not video_id:
        print("Video ID not found in the URL")
        return None

    return video_id

In [15]:
youtube_url = "https://www.youtube.com/watch?v=1aA1WGON49E"
video_id = extract_video_id(youtube_url)
if video_id is None:
    # extract_video_id has already printed the reason; stop here instead of
    # handing None to the transcript API and failing with an unrelated error.
    raise ValueError(f"Could not extract a video ID from {youtube_url!r}")

# `languages` is a priority list of transcript language codes. 'us' is a
# country code, not a language code, so it could never match; 'en' stays the
# first choice and 'en-US' is accepted as a fallback.
transcript = yta.get_transcript(video_id, languages=('en', 'en-US'))

In [16]:
# Inspect the fetched transcript: a list of dicts with 'text', 'start' and
# 'duration' keys, one dict per caption entry.
transcript

[{'text': 'Transcriber: Victor Borges\nReviewer: David DeRuwe',
  'start': 0.0,
  'duration': 7.0},
 {'text': 'Wow,', 'start': 11.791, 'duration': 1.008},
 {'text': 'what an audience.', 'start': 13.03, 'duration': 1.19},
 {'text': "But if I'm being honest,\nI don't care what you think of my talk.",
  'start': 14.515,
  'duration': 3.016},
 {'text': "I don't.", 'start': 18.097, 'duration': 1.004},
 {'text': 'I care what the internet\nthinks of my talk.',
  'start': 19.101,
  'duration': 1.999},
 {'text': '(Laughter)', 'start': 21.1, 'duration': 1.001},
 {'text': 'Because they are the ones\nwho get it seen and shared.',
  'start': 22.101,
  'duration': 2.478},
 {'text': "And I think that's where\nmost people get it wrong.",
  'start': 24.579,
  'duration': 2.328},
 {'text': "They're talking to you, here,", 'start': 26.907, 'duration': 1.667},
 {'text': 'instead of talking to you,\nrandom person scrolling Facebook.',
  'start': 28.66,
  'duration': 4.63},
 {'text': 'Thanks for the click.'

In [17]:
# data1: raw caption text, one string per transcript entry.
data1 = [t['text'] for t in transcript]

# data2: the same captions with punctuation removed, index-aligned with
# `transcript` so a match at position i maps to transcript[i]['start'].
# Whitespace runs (captions contain "\n" line breaks) are collapsed to a
# single space *before* stripping, so the words on either side of a line
# break are not fused together ("internet\nthinks" -> "internet thinks").
# Kept characters: ASCII letters and digits, Turkish letters (including the
# dotless "ı"), spaces and hyphens.
data2 = [
    re.sub(r"[^a-zA-Z0-9şğöüçıŞĞÖÜÇİ -]", "", re.sub(r"\s+", " ", line))
    for line in data1
]
print(data1)
print(data2)

['Transcriber: Victor Borges\nReviewer: David DeRuwe', 'Wow,', 'what an audience.', "But if I'm being honest,\nI don't care what you think of my talk.", "I don't.", 'I care what the internet\nthinks of my talk.', '(Laughter)', 'Because they are the ones\nwho get it seen and shared.', "And I think that's where\nmost people get it wrong.", "They're talking to you, here,", 'instead of talking to you,\nrandom person scrolling Facebook.', 'Thanks for the click.', 'You see, back in 2009,', 'we all had these weird little things\ncalled attention spans.', '(Laughter)', "Yeah, they're gone. They're gone.\nWe killed them. They're dead.", "I'm trying to think of the last time\nI watched an 18-minute TED talk.", "It's been years, literally years.", "So if you're giving\na TED talk, keep it quick.", "I'm doing mine in under a minute.", "I'm at 44 seconds right now;", "that means we've got\ntime for one final joke.", 'Why are balloons so expensive?', '(Audience) "Why?"', 'Woody Roseland: Inflation.'

In [8]:
def give_time_stamps(transcript, data2, search_word, case_sensitive=True):
    """Return the start times of every caption line containing ``search_word``.

    Args:
        transcript: Sequence of dicts, each with a ``'start'`` key holding the
            caption's start offset.
        data2: Sequence of cleaned caption strings, index-aligned with
            ``transcript`` (``data2[i]`` is the text of ``transcript[i]``).
        search_word: Text to look for. Matching is by substring, so ``"you"``
            also matches ``"your"``.
        case_sensitive: When True (the default) the match is exact-case;
            pass False to ignore case.

    Returns:
        A list of ``'start'`` values, in transcript order, for the matching
        lines. Empty if nothing matches or ``search_word`` is empty.
    """
    # "" is a substring of every string; treat it as "nothing to search for"
    # instead of reporting every caption as a hit.
    if not search_word:
        return []

    needle = search_word if case_sensitive else search_word.lower()
    found = []
    # zip pairs each caption with its text and stops at the shorter input,
    # so mismatched lengths cannot raise IndexError.
    for entry, line in zip(transcript, data2):
        haystack = line if case_sensitive else line.lower()
        if needle in haystack:
            found.append(entry['start'])
    return found

In [11]:
# Find every caption line that contains the search term and print when each
# one starts (matching is a case-sensitive substring test).
search_word = "you"
time = give_time_stamps(transcript=transcript, data2=data2, search_word=search_word)
print_time(search_word=search_word, time=time)

'you' was mentioned at:
00:00:14
00:00:26
00:00:28
00:00:52
