In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the imports in the next cell resolve against the packages installed here.
%pip install yt-dlp pydub youtube-transcript-api

In [100]:
import yt_dlp
from pydub import AudioSegment
from youtube_transcript_api import YouTubeTranscriptApi
import json
import os

def download_audio(video_url, output_path):
    """Download the best available audio stream of a video with yt-dlp.

    Args:
        video_url: URL handed to ``YoutubeDL.download``.
        output_path: Value used for yt-dlp's ``outtmpl`` option.

    NOTE(review): no postprocessor is configured, so presumably the stream is
    stored as delivered regardless of the extension in ``output_path`` --
    confirm that downstream code does not rely on that extension.
    """
    options = dict(format='bestaudio/best', outtmpl=output_path)
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([video_url])

def get_transcript(video_id):
    """Fetch the caption entries of a video.

    Works with both generations of ``youtube-transcript-api``: the legacy
    static ``get_transcript`` is used when the installed release still
    provides it; otherwise the instance API (``fetch`` + ``to_raw_data``) is
    used, which yields the same list-of-dicts shape.

    Args:
        video_id: YouTube video id.

    Returns:
        Whatever the library returns for the transcript; the rest of this
        notebook reads the ``"text"`` and ``"duration"`` keys of each entry.
    """
    legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
    if legacy_fetch is not None:
        return legacy_fetch(video_id)
    # Newer releases dropped the static helper in favour of an instance API.
    return YouTubeTranscriptApi().fetch(video_id).to_raw_data()

def split_durations(txt, name, duration):
    """Divide an entry's duration at the position of a speaker label.

    The share assigned to the text in front of the label is
    ``idx / (len(txt) - len(name) - 1)``, where ``idx`` is the label's
    position in ``txt`` (the extra ``1`` presumably stands for the separator
    that follows the label). The remainder goes to the label and everything
    after it.

    Inputs for which that ratio is undefined or larger than 1 are resolved
    explicitly, instead of raising ``ZeroDivisionError`` or producing a
    negative second duration:

    * label at the very end of ``txt``: the whole duration is lead time;
    * ``txt`` holds nothing but the label: the whole duration follows it.

    Args:
        txt: Entry text containing ``name``.
        name: Speaker label to split at.
        duration: Duration of the whole entry.

    Returns:
        Tuple ``(duration_before_label, duration_from_label_on)``; the two
        values always add up to ``duration``.

    Raises:
        ValueError: If ``name`` does not occur in ``txt``.
    """
    idx = txt.index(name)
    length = len(txt) - len(name) - 1
    if 0 < length and idx <= length:
        duration1 = duration * idx / length
    elif idx > 0:
        # Nothing follows the label, so all of the time belongs to the lead.
        duration1 = float(duration)
    else:
        # The label opens the text and nothing meaningful follows it.
        duration1 = 0.0
    return duration1, duration - duration1

def convert_transcript_to_json(transcript, name, host="Tim Ferriss:"):
    """Group caption entries into speaker-attributed segments.

    Speaker changes are detected by searching each entry's text for the two
    speaker labels (``name`` and ``host``). Text without a label extends the
    running segment; a label closes the running segment and opens a new one.
    When a label sits in the middle of an entry, the text in front of it is
    kept with the previous speaker and the entry's duration is divided with
    ``split_durations``.

    Args:
        transcript: Sequence of dicts providing ``"text"`` and ``"duration"``.
        name: Guest label exactly as written in the captions, including the
            trailing colon (e.g. ``"Soman Chainani:"``).
        host: Host label in the same format. Defaults to ``"Tim Ferriss:"``.

    Returns:
        List of dicts with ``speaker`` (label without the colon),
        ``start_time``, ``end_time`` and ``text``. An empty transcript gives
        an empty list.

    Notes:
        * Times are accumulated from entry durations starting at 0; the
          entries' own ``"start"`` values are never read.
          NOTE(review): confirm this matches the audio timeline.
        * When the first entry begins with a label, the first returned
          segment is empty (no text, zero length).
        * Only the first occurrence of each label inside one entry is used.
    """
    if not transcript:
        return []

    def make_segment(speaker, start, duration, text):
        # Labels carry a trailing colon; the output keeps only the name part.
        return {
            "speaker": speaker.split(":")[0],
            "start_time": start,
            "end_time": start + duration,
            "text": text,
        }

    transcript_json = []
    last_speaker = name if name in transcript[0]["text"] else host
    last_text = ""
    last_start = 0
    last_duration = 0
    for entry in transcript:
        txt = entry["text"].replace("\u00a0", "").replace("\n", " ")
        has_name = name in txt
        has_host = host in txt

        if not has_name and not has_host:
            # No speaker change: the entry extends the running segment.
            last_text += " " + txt
            last_duration += entry["duration"]
        elif has_name and has_host:
            # Both speakers talk within this entry; order them by position.
            name_idx = txt.index(name)
            host_idx = txt.index(host)
            if name_idx < host_idx:
                first_label, other_label = name, host
            else:
                first_label, other_label = host, name
            first_idx = min(name_idx, host_idx)

            remaining = entry["duration"]
            if first_idx > 0:
                # Words in front of the first label still belong to the
                # running speaker, together with their share of the time.
                lead_duration, remaining = split_durations(txt, first_label, remaining)
                last_text += " " + txt[:first_idx].rstrip()
                last_duration += lead_duration
                txt = txt[first_idx:]

            other_idx = txt.index(other_label)
            first_duration, other_duration = split_durations(txt, other_label, remaining)

            transcript_json.append(
                make_segment(last_speaker, last_start, last_duration, last_text)
            )

            # The first speaker's turn starts and ends inside this entry:
            # skip "<label> " at the front and the space before the next label.
            first_start = last_start + last_duration
            transcript_json.append(make_segment(
                first_label,
                first_start,
                first_duration,
                txt[len(first_label) + 1: other_idx - 1],
            ))

            last_speaker = other_label
            last_start = first_start + first_duration
            last_duration = other_duration
            last_text = txt[other_idx + len(other_label) + 1:]
        else:
            # Exactly one label: the running segment ends where it appears.
            found_name = name if has_name else host
            found_idx = txt.index(found_name)

            duration1, duration2 = split_durations(txt, found_name, entry["duration"])
            if found_idx > 0:
                # Keep the words spoken before the label with the old speaker.
                last_text += " " + txt[:found_idx].rstrip()
            last_duration += duration1
            transcript_json.append(
                make_segment(last_speaker, last_start, last_duration, last_text)
            )

            last_speaker = found_name
            last_start = last_start + last_duration
            last_duration = duration2
            last_text = txt[found_idx + len(found_name) + 1:]

    # Flush whatever the last speaker was still saying.
    transcript_json.append(
        make_segment(last_speaker, last_start, last_duration, last_text)
    )
    return transcript_json

def save_json(data, output_file):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces.

    The parent directory is created when it is missing, so the call also
    succeeds in a fresh working directory.

    Args:
        data: JSON-serialisable object.
        output_file: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=4)

def _video_id_from_url(video_url):
    """Return the video id of a YouTube watch URL or a youtu.be short link."""
    from urllib.parse import parse_qs, urlparse

    parts = urlparse(video_url)
    # Read the ``v`` parameter itself so extra parameters (``&t=42s``,
    # ``&list=...``) do not end up inside the id.
    ids = parse_qs(parts.query).get("v")
    if ids:
        return ids[0]
    short_id = parts.path.strip("/")
    if parts.netloc.lower().endswith("youtu.be") and short_id:
        return short_id
    raise ValueError(f"Cannot find a video id in {video_url!r}")


def main(name, video_url):
    """Download one video's audio and save its speaker-segmented transcript.

    Args:
        name: Guest speaker label as it appears in the captions (passed on to
            ``convert_transcript_to_json``).
        video_url: YouTube URL of the video.

    Side effects:
        Downloads the audio to ``audio/<video_id>.wav``, writes
        ``transcripts/<video_id>.json`` and prints the transcript path.

    Raises:
        ValueError: If no video id can be extracted from ``video_url``.
    """
    video_id = _video_id_from_url(video_url)

    output_path = f"audio/{video_id}.wav"
    download_audio(video_url, output_path)
    transcript = get_transcript(video_id)
    transcript_json = convert_transcript_to_json(transcript, name)

    output_file = f"transcripts/{video_id}.json"
    save_json(transcript_json, output_file)
    print(f"Transcript saved to {output_file}")

In [53]:
# Episodes to process. Keys are the guest's speaker label exactly as it is
# searched for in the captions (trailing colon included); values are the video
# URLs. The processing loop passes each pair to main(name, url).
#
# NOTE(review): "Kevin Rose:" appears twice. A dict literal keeps only the last
# value for a repeated key, so the first Kevin Rose URL (bYXa5RCGLiM) is
# silently discarded and never processed. Keeping both requires a structure
# that allows repeated labels (e.g. a list of pairs) plus a matching loop.
video_urls = {
    "Soman Chainani:": "https://www.youtube.com/watch?v=B2r3A2AIm_Y",
    "Claire Hughes Johnson:": "https://www.youtube.com/watch?v=ZHCtb80SUHQ",
    "Dr. David Spiegel:": "https://www.youtube.com/watch?v=wk89rJpaj6w",
    "Martha Beck:": "https://www.youtube.com/watch?v=Ieu68CfTR4g",
    "Craig Foster:": "https://www.youtube.com/watch?v=_tBrxckIwJw",
    "Matt Pottinger:": "https://www.youtube.com/watch?v=hOMpnA-iMRg",
    "Noah Kagan:": "https://www.youtube.com/watch?v=_htIvi4JzOs",
    "Chris Beresford-Hill:": "https://www.youtube.com/watch?v=lwZuRbGvKYs",
    "Dr. Nolan Williams:": "https://www.youtube.com/watch?v=UO7IgQ_x-Qg",
    "Matt Mullenweg:": "https://www.youtube.com/watch?v=pWpaCLIUnXo",
    # Overridden by the second "Kevin Rose:" entry further down.
    "Kevin Rose:": "https://www.youtube.com/watch?v=bYXa5RCGLiM",
    "Andrew Rosener:": "https://www.youtube.com/watch?v=ApCGeST90RA",
    "Willoughby Britton:": "https://www.youtube.com/watch?v=WdmvoX1RZWA",
    "Sheila Heen:": "https://www.youtube.com/watch?v=r6ZxOK2Ub-0",
    "Morgan Housel:": "https://www.youtube.com/watch?v=InQb76J9-HY",
    "Apollo Robbins:": "https://www.youtube.com/watch?v=2Qiv7oCVq5E",
    "Rich Paul:": "https://www.youtube.com/watch?v=t1yWC-HufEA",
    "Arnold Schwarzenegger:": "https://www.youtube.com/watch?v=uoabKgDX9GE",
    "Shane Parrish:": "https://www.youtube.com/watch?v=-T30rY6Nqrc",
    "Sam Corcos:": "https://www.youtube.com/watch?v=MtrkDoQFArU",
    "Arthur Brooks:": "https://www.youtube.com/watch?v=LFKY1scJepM",
    # Duplicate key: this value replaces the earlier "Kevin Rose:" entry.
    "Kevin Rose:": "https://www.youtube.com/watch?v=_fklX2OonR0",
    "Daniil Liberman:": "https://www.youtube.com/watch?v=5zP75PCZ_P0",
    "Justin Gary:": "https://www.youtube.com/watch?v=vf6TYIHwB50",
    "Dr. Shirley Sahrmann:": "https://www.youtube.com/watch?v=KPhd-Qua72E"
}

In [None]:
# Kaggle CLI call: presumably publishes the contents of /kaggle/working as a
# new dataset version, zipping sub-directories, with the version message "YT".
# NOTE(review): this cell sits before the processing loop; on a top-to-bottom
# run it executes before the loop has written anything -- confirm it is meant
# to be run manually afterwards.
!kaggle d version -p "/kaggle/working" --dir-mode zip -m "YT"

In [103]:
# Run the full pipeline (audio download + transcript JSON) for every episode.
for name in video_urls:
    url = video_urls[name]
    main(name, url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=B2r3A2AIm_Y
[youtube] B2r3A2AIm_Y: Downloading webpage




[youtube] B2r3A2AIm_Y: Downloading ios player API JSON
[youtube] B2r3A2AIm_Y: Downloading player d60b0ef9
[youtube] B2r3A2AIm_Y: Downloading web player API JSON




[youtube] B2r3A2AIm_Y: Downloading web player API JSON
[youtube] B2r3A2AIm_Y: Downloading m3u8 information
[info] B2r3A2AIm_Y: Downloading 1 format(s): 251
[download] audio/B2r3A2AIm_Y.wav has already been downloaded
[download] 100% of   92.54MiB
Transcript saved to transcripts/B2r3A2AIm_Y.json
[youtube] Extracting URL: https://www.youtube.com/watch?v=ZHCtb80SUHQ
[youtube] ZHCtb80SUHQ: Downloading webpage
[youtube] ZHCtb80SUHQ: Downloading ios player API JSON
[youtube] ZHCtb80SUHQ: Downloading m3u8 information
[info] ZHCtb80SUHQ: Downloading 1 format(s): 251
[download] audio/ZHCtb80SUHQ.wav has already been downloaded
[download] 100% of  110.05MiB
Transcript saved to transcripts/ZHCtb80SUHQ.json
[youtube] Extracting URL: https://www.youtube.com/watch?v=wk89rJpaj6w
[youtube] wk89rJpaj6w: Downloading webpage
[youtube] wk89rJpaj6w: Downloading ios player API JSON
[youtube] wk89rJpaj6w: Downloading m3u8 information
[info] wk89rJpaj6w: Downloading 1 format(s): 251
[download] audio/wk89rJpaj