# Libraries and import

In [1]:
import pydub
import numpy as np
import youtube_dl
import pickle as pkl
from pydub import AudioSegment
from pathlib import Path
import time
import os
from spectralcluster import SpectralClusterer
import sys

sys.path.insert(1, 'C:\\Users\\ybenc\\Documents\\Lex_Fridman_Podcasts\\Resemblyzer-master')  ## The resemblyzer 
## module is publicly available at https://github.com/resemble-ai/Resemblyzer
# I am not importing it as I do with other modules because I have cloned the repo and modified it to match the needs of this 
# project. Mainly, I have modified the maximum silence duration allowed to avoid removing silences and modifying podcast time. 
# For more info about how I used this module and proceeded with the diarization, check out this excellent tutorial:
# https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279

from resemblyzer import *
import re
from youtube_transcript_api import YouTubeTranscriptApi
import warnings
warnings.filterwarnings("ignore")
import datetime

# Utils function

In [2]:
## This function turns the length of the youtube video, declared in special
## format into a time in seconds
def Convert_Time_To_Seconds(Time):
    Decomposed_Time = np.int_(re.findall(r'\d+', Time)) 
    if len(Decomposed_Time) == 2: ##Podcast shorter than 1 hour
        Minutes = Decomposed_Time[0]
        Seconds = Decomposed_Time[1]
        Total_Time = Minutes*60+Seconds
    elif len(Decomposed_Time) == 3: ##Podcast longer than 1 hour
        Hours = Decomposed_Time[0]
        Minutes = Decomposed_Time[1]
        Seconds = Decomposed_Time[2]
        Total_Time = Hours*3600 + Minutes*60 + Seconds
    else:
        print('Something fishy happened with this:', Time)
    return Total_Time

def Convert_Seconds_To_Human_Time(Time):

    Int_Time = np.int_(Time) # get rid of any miliseconds and such 
    
    if Time > 3600: 
        Hour = np.int_(Int_Time/60/60)
        Minute = Int_Time%60
        Second = Int_Time%3600
        return str(str(Hour) + ':', str(Second) + ':', str(Second))
    elif Time < 3600:
        Minute = np.round((Int_Time/60, 0))
        Second = np.round((Int_Time/60 - Minute)*60, 0)
        return str(str(Minute) + ':' + str(Second))

def find_nearest_index (Array, value):
    #"Element in nd 'Array' closest to the scalar 'value'"
    idx = np.abs(np.array(Array) - value).argmin()
    return Array.index(np.array(Array).flat[idx])

def Get_Youtube_Download_Parameters(filename):
    filepath = str('Podcasts_Audio_Files/' + filename + '.wav')
    ydl_opts = {
        'format': 'bestaudio/best',
        'quiet': True,
        'outtmpl': filepath,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    return ydl_opts, filepath

def Audio_File_Processing(filename):
    ## I have modified the source code of the resemblyzer module to avoid having deleted silences, which is what is 
    ## normally done in audio processing etc.. to save up space. Here, we keep even long moments of silences to avoid 
    ## losing track of the right time and not being able to correctly assign text to diarized speech
    audio_file_path = 'Podcasts_Audio_Files\\' + filename + '.wav'
    wav = preprocess_wav(audio_file_path)
    encoder = VoiceEncoder("cpu", verbose = False)
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True)
    del wav, _, wav_splits #Deleting unused variable to save memory
    
    return cont_embeds

def Spectral_Clustering(embedding):
    clusterer = SpectralClusterer(
        min_clusters=2,
        max_clusters=3,
        p_percentile=0.95,
        gaussian_blur_sigma=1) #Sometimes, Lex's voice will be recognized as different between the introduction and the conv
            # we hence put between 2 and 3 clusters
    labels = clusterer.predict(embedding)
    return labels


## This functions turns the predictions from the clustering algorithm to a [[start_time, speaker]] array, with each row
## representing a change in speaker
def From_Clustering_To_Time_And_Speaker_Segmenting(predictions, video_duration_in_seconds):
    
    n_samples = predictions.shape[0]
    sampling_rate = n_samples/video_duration_in_seconds #how many sample per second
    
    time_and_speaker_segment = []
    time_and_speaker_segment.append([0, predictions[0]]) # Append beginning of the audio file and first Speaker
    
    for i in range(predictions.shape[0]):
        second =  i/sampling_rate 
        if i>0 and local_prediction[i] != local_prediction[i-1]: #If change in speaker, write new line. 
            time_and_speaker_segment.append([second, predictions[i]])
            
    return time_and_speaker_segment

def Get_Podcast_Details(url, duration, title):
    local_guest = title.split(':')[0]
    local_podcast_number = title.split('#')[1].split(' ')[0]
    local_theme = title.split(': ')[1].split(' |')[0]
    local_filename = str(local_guest + ' - ' + local_theme + ' - ' + local_podcast_number)

    return url, duration, local_guest, local_podcast_number, local_theme, local_filename

def Save_As_Pickle(Variable, Folder, File):
    with open(str(Folder + File + '.pickle'), 'wb') as f:
        pkl.dump(Variable, f)
        
def Diarize_Transcript(segmented_time_speaker, URL, guest, folder, filename):
    
    #This function is about fusing the clustering predictions we have with the youtube transcripts. 
    #Youtube send the transcripts as segments of speechs of usually 3-6 seconds. Issues will often occur with beginning
    # and ends of speech segments which will not be attributed to the correct speaker due to the mismatch between youtube
    # segment times and clustering times by our diarization algorithm
    
    youtube_transcript = YouTubeTranscriptApi.get_transcript(URL.split('v=')[1]) #getting transcript
    
    speech_segment_starts = [] #This is start time of each segment of speech (usually 3-6 seconds segments)
    speech_segment_texts = [] #This is the speech segment
    
    for speech_segment in youtube_transcript:
        speech_segment_starts.append(speech_segment['start'])
        speech_segment_texts.append(speech_segment['text'])
        
    # These are text only transcripts   
    with open(str('Diarization/Transcripts/' + filename + '.txt'), 'a') as f1:
        
        for i in range(len(segmented_time_speaker) - 1):
            local_speaker = str('Speaker ' + str(segmented_time_speaker[i][1]) + ' ')

            intervention_start_time = segmented_time_speaker[i][0]
            intervention_end_time = segmented_time_speaker[i+1][0]

            transcript_start_index = find_nearest_index(speech_segment_starts, intervention_start_time)
            transcript_end_index = find_nearest_index(speech_segment_starts, intervention_end_time)

            intervention_text = ''
            for k in range(transcript_end_index-transcript_start_index):
                if k%3 == 0: ## This is to have new lines every once in a while to avoid having long lines on text files
                    intervention_text += str(speech_segment_texts[transcript_start_index+k] + '\n')
                else:
                    intervention_text += str(' ' + speech_segment_texts[transcript_start_index+k])
                    
            f1.write(local_speaker + ':' + intervention_text)
            f1.write('\n\n')
    f1.close()
    
    # These are text only transcripts   
    with open(str('Diarization/Transcripts_With_Time/' + filename + '.txt'), 'a') as f2:
        
        for i in range(len(segmented_time_speaker) - 1):
            local_speaker = str('Speaker ' + str(segmented_time_speaker[i][1]) + ' ')

            intervention_start_time = segmented_time_speaker[i][0]
            intervention_end_time = segmented_time_speaker[i+1][0]
            
            transcript_start_index = find_nearest_index(speech_segment_starts, intervention_start_time)
            transcript_end_index = find_nearest_index(speech_segment_starts, intervention_end_time)
            
            human_start_time = str(datetime.timedelta(seconds= int(intervention_start_time)))

            intervention_text = ''
            for k in range(transcript_end_index-transcript_start_index):
                if k%3 == 0: ## This is to have new lines every once in a while to avoid having long lines on text files
                    intervention_text += str(speech_segment_texts[transcript_start_index+k] + '\n')
                else:
                    intervention_text += str(' ' + speech_segment_texts[transcript_start_index+k])
                    
            f2.write(human_start_time + '\n')
            f2.write(local_speaker + ':' + intervention_text)
            f2.write('\n\n')
    f2.close()

# Load URLs, podcast titles and duration

In [3]:
with open('URLs_Lex_Fridman.pkl', 'rb') as f:
    URLs = pkl.load(f)
    
with open('Titles_Lex_Fridman.pkl', 'rb') as f:
    Titles = pkl.load(f)
    
with open('Durations_Lex_Fridman.pkl', 'rb') as f:
    Verbose_Times = pkl.load(f)
    
Times_In_Seconds = []
for verbose_ti in Verbose_Times:
    Times_In_Seconds.append(Convert_Time_To_Seconds(verbose_ti))
    
print('There are', len(Titles), 'podcasts')

There are 140 podcasts


# Diarization

For each podcast, we download the audio from youtube into a wav file and run the diarization process on it. The diarization process is done in two main phases: 1) audio data preprocessing and 2) spectral clustering to separate between two distinct speakers. 
We then save the predictions into distinct variables named after the podcast, with speaker information (speaker 0 or speaker 1) and start of each speaker intervention. 

The diarization is then fused with the youtube transcripts to generate the full conversation - this is done in the next section of this notebook





In [4]:
Buggy_Podcasts = []
for i in range(len(URLs)):
    
    #Getting podcast details
    local_url, local_duration_in_seconds, local_guest, local_podcast_number, local_theme, \
                        local_filename = Get_Podcast_Details(URLs[i], Times_In_Seconds[i], Titles[i])
    
    if not os.path.exists(str('Diarization/Transcripts/' + str(local_filename + '.txt'))): ## If transcript has not already been generated:
    
        try:

            YouTubeTranscriptApi.get_transcript(local_url.split('v=')[1]) ## Check if Youtube Transcript exists

            print(i, '- Diarizing podcast', local_podcast_number, 'with guest:', local_guest, 'and theme:', local_theme)

            print('Download Youtube audio into a wav file...')
            youtube_download_options, local_file_path = Get_Youtube_Download_Parameters(local_filename) #This get the download parameters
            with youtube_dl.YoutubeDL(youtube_download_options) as ydl:
                ydl.download([local_url])

            print('Audio preprocessing...')
            local_embedding = Audio_File_Processing(local_filename) 
            Save_As_Pickle(local_embedding, 'Diarization/Embeddings/', local_filename)

            print('Spectral clustering...')
            local_prediction = Spectral_Clustering(local_embedding)
            Save_As_Pickle(local_prediction, 'Diarization/Clustering_Predictions/', local_filename)
            del local_embedding ##Deleting variables to free up memory

            print('Getting the predictions in a segmented [[start_time, speaker]] format...')
            segmented_time_and_speaker = From_Clustering_To_Time_And_Speaker_Segmenting(local_prediction, local_duration_in_seconds)
            Save_As_Pickle(segmented_time_and_speaker, 'Diarization/Segmented/', local_filename)

            print('Generating diarized transcript...')
            Diarize_Transcript(segmented_time_and_speaker, local_url, local_guest, 'Diarization/Transcripts/', local_filename)

            print('Completed this transcription.')

        except:
            print('----Could not generate transcript of podcast:', local_podcast_number, 'with guest:', local_guest)
            print('----This is likely because there was no available youtube transcript of this podcast')
            Buggy_Podcasts.append(Titles[i])
        print('\n')
    

----Could not generate transcript of podcast: 140 with guest: Lisa Feldman Barrett
----This is likely because there was no available youtube transcript of this podcast


----Could not generate transcript of podcast: 139 with guest: Andrew Huberman
----This is likely because there was no available youtube transcript of this podcast


----Could not generate transcript of podcast: 138 with guest: Yaron Brook
----This is likely because there was no available youtube transcript of this podcast


----Could not generate transcript of podcast: 137 with guest: Alex Filippenko
----This is likely because there was no available youtube transcript of this podcast


----Could not generate transcript of podcast: 136 with guest: Dan Carlin
----This is likely because there was no available youtube transcript of this podcast


----Could not generate transcript of podcast: 135 with guest: Charles Isbell
----This is likely because there was no available youtube transcript of this podcast


----Could not g

129 - Diarizing podcast 11 with guest: Juergen Schmidhuber and theme: Godel Machines, Meta-Learning, and LSTMs
Download Youtube audio into a wav file...
Audio preprocessing...
Spectral clustering...
Getting the predictions in a segmented [[start_time, speaker]] format...
Generating diarized transcript...
Completed this transcription.


130 - Diarizing podcast 10 with guest: Pieter Abbeel and theme: Deep Reinforcement Learning
Download Youtube audio into a wav file...
Audio preprocessing...
Spectral clustering...
Getting the predictions in a segmented [[start_time, speaker]] format...
Generating diarized transcript...
Completed this transcription.


131 - Diarizing podcast 9 with guest: Stuart Russell and theme: Long-Term Future of Artificial Intelligence
Download Youtube audio into a wav file...
Audio preprocessing...
Spectral clustering...
Getting the predictions in a segmented [[start_time, speaker]] format...
Generating diarized transcript...
Completed this transcription.


132 - Di

In [5]:
print(Buggy_Podcasts)

['Lisa Feldman Barrett: Love, Evolution, and the Human Brain | Lex Fridman Podcast #140 ', 'Andrew Huberman: Neuroscience of Optimal Performance | Lex Fridman Podcast #139 ', 'Yaron Brook: Ayn Rand and the Philosophy of Objectivism | Lex Fridman Podcast #138 ', 'Alex Filippenko: Supernovae, Dark Energy, Aliens &amp; the Expanding Universe | Lex Fridman Podcast #137 ', 'Dan Carlin: Hardcore History | Lex Fridman Podcast #136 ', 'Charles Isbell: Computing, Interactive AI, and Race in America | Lex Fridman Podcast #135 ', 'Eric Weinstein: On the Nature of Good and Evil, Genius and Madness | Lex Fridman Podcast #134 ', 'Chris Lattner: The Future of Computing and Programming Languages | Lex Fridman Podcast #131 ', 'Stephen Wolfram: Fundamental Theory of Physics, Life, and the Universe | Lex Fridman Podcast #124 ', 'Ben Goertzel: Artificial General Intelligence | Lex Fridman Podcast #103 ', 'Gary Marcus: Toward a Hybrid of Deep Learning and Symbolic AI | Lex Fridman Podcast #43 ', 'Gavin Mil