In [1]:
# !pip install youtube-transcript-api

In [None]:
import googleapiclient.discovery
from youtube_transcript_api import YouTubeTranscriptApi
import spacy
from transformers import pipeline

In [None]:
API_KEY = "AIzaSyC1wCUoKeNvw3t6ptpuZYogRdvcjtc7Rms"  # Replace with your valid YouTube Data API key

# Define the channel ID
channel_id = "UCfSqNB0yh99yuG4p4nzjPOA"  # Replace with the desired channel ID

In [2]:
def get_video_info(channel_id):
    """
    Gets a list of video IDs and corresponding video titles from a YouTube channel.

    Args:
        channel_id: The ID of the YouTube channel.

    Returns:
        A list of tuples containing video IDs and corresponding video titles.
    """
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

    # Get the channel's upload playlist ID
    request = youtube.channels().list(
        part="contentDetails",
        id=channel_id,
    )
    response = request.execute()
#     print(response)
    playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    # Get videos from the upload playlist
    videos = {}
    next_page_token = None
    while True:
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token,
        )
        response = request.execute()

        for item in response["items"]:
            video_id = item["snippet"]["resourceId"]["videoId"]
            video_title = item["snippet"]["title"]
            videos[video_title] = video_id
#             video_info_list.append((video_id, video_title))

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return videos

In [3]:
# Get the video IDs and titles
videos = get_video_info(channel_id)

# Print the video IDs and titles
print(f"Video IDs and Titles for channel {channel_id}:")
for video_title, video_id in videos.items():
    print(f"- Title: {video_title}, Video ID: {video_id}")

Video IDs and Titles for channel UCfSqNB0yh99yuG4p4nzjPOA:
- Title: CS505 Lecture 22 Slide View, Video ID: eR6r_aL6DzE
- Title: CS505 Lecture 23 Full View, Video ID: QNuLgMCuM7U
- Title: CS505 Lecture 24 Slide View, Video ID: 7P-NJn4mQZk
- Title: CS505 Lecture 24 Full View, Video ID: PXCRKRhxxDE
- Title: CS505 Lecture 23 Slide View, Video ID: Fcqz9q4KJVU
- Title: CS505 Lecture 22 Full View, Video ID: iBD4QwiD6ZU
- Title: CS505 Guest Lecture  Full View, Video ID: ANTtRbKfDMg
- Title: CS505 Guest Lecture 21 Slide View, Video ID: O5h2aRdSNMU
- Title: CS505 Lecture 21 Slide View, Video ID: Sfq99UKs8PM
- Title: CS505 Lecture 21 Full View, Video ID: PduRmOrQ-Bw
- Title: CS505 Lecture 20 Full View, Video ID: xN0HTuIMibE
- Title: CS505 Lecture 20 Slide View, Video ID: JCDcGQr0EI0
- Title: CS505 Lecture 19 Full View, Video ID: _BP3p_N3uWA
- Title: CS505 Lecture 19 Slide View, Video ID: GTdBpWc6rNQ
- Title: CS505 Lecture 18 Slide View, Video ID: 6n8_HgVID1E
- Title: CS505 Lecture 18 Full View, V

In [4]:
miscellaneous = {}
slide_view = {}
full_view = {}
singles = {}
for title, v_id in videos.items():
    title = title.lower()
    if "lecture" in title:
        if 'slide view' in title:
            slide_view[title] = v_id
        elif 'full view' in title:
            full_view[title] = v_id
        else:
            singles[title] = v_id
    else:
        miscellaneous[title] = v_id

In [5]:
transcripts = {}

for title, video_id in videos.items():

    text = ""

    # Experiment: last video
    try:

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Append all the entries to a single text. Add newlines between different
        # parts to make post-processing easier later.
        for entry in transcript:

            # Remove certain unnecessary words and get the text we need to retain.
            txt = entry['text'].replace("uh", "")
            txt = txt.replace("um", "")

            text += (" " + txt)

        print(len(text))

        # Append to our list of transcripts.
        transcripts[title] = text

    # Ignore failures.
    except Exception as e:
        print("Some error happened. Skipping this transcript", title, video_id)

44067
42577
35996
35997
42569
42557
50329
50290
51878
51874
57294
57304
49238
49252
53709
53676
36024
55612
55597
49637
49618
48961
48973
52607
52620
53022
53022
11184
25025
55316
80883
59943
59962
52905
30892
51689
51682
57857
52766
52747
56423
56413
51326
51289
58099
60413
39731
43187
35646
15936
24552
Some error happened. Skipping this transcript Johnny B. Goode performed by Jay Samons 5PF-mO00igs


In [6]:
# The text comes in quite an unprocessed way.
# It is really difficult to be accurate in separating it into sentences,
# so we can use this library to get somewhat of an approximation!

def process_transcript(i):

    text = transcripts[i]

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Get the list of sentences.
    sentences = [sentence.text + "." for sentence in doc.sents]
    print (len(sentences))

    # Now we need to pair up shorter sentences in order to minimize the number of
    # times we'll need to call our model.
    sentences_post = []

    i = 0
    while i < len(sentences):

        s = ""

        while i < len(sentences):

            if len(s) + len(sentences[i]) >= 400:

                if s == "":
                    i = i + 1

                break

            s += (sentences[i] + " ")
            i += 1

        # Only keep large sentences.
        if s != "" and len(s) >= 250:
            sentences_post.append(s)

    return sentences_post

  # for sent in sentences_post:
  #   print(f"{len(sent)} -> {sent}")


In [10]:
def get_summary(i):

  # IDEA 1: try to append current summary with next sentence to summarize the entire lecture.
  # VERDICT: This doesn't work very well. The model just keeps previous information and forgets
  #          about new one.

  # IDEA 2: Summarize in layers. First summarize every sentence with small sentences,
  #         then summarize batches of those and so on.

    sentences_post = process_transcript(i)

    summaries = []

    i = 0
    for sent in sentences_post:

        if i % 5 == 0:
            print(f"Progress: {i/float(len(sentences_post))*100}%")

        summaries.append(generatorStageOne("summarize: " + sent)[0]["summary_text"])

        i += 1

    print("Stage 1: DONE")

    summaries2 = []

    j = 0
    while j < len(summaries):

        if j + 2 < len(summaries):
            sent = summaries[j] + " " + summaries[j + 1] + summaries[j + 2]
        elif j + 1 < len(summaries):
            sent = summaries[j] + " " + summaries[j + 1]
        else:
            sent = summaries[j]

        summaries2.append(generatorStageOne("summarize: " + sent)[0]["summary_text"])

        j += 3

    print("Stage 2: DONE")


    summary = ""

    j = 0
    while j < len(summaries2):

        if j + 2 < len(summaries2):
            sent = summaries2[j] + " " + summaries2[j + 1] + summaries2[j + 2]
        elif j + 1 < len(summaries2):
            sent = summaries2[j] + " " + summaries2[j + 1]
        else:
            sent = summaries2[j]

        summary += (generatorStageTwo("summarize: " + sent)[0]["summary_text"] + "")

        j += 3

    print("DONE")
    return summary


In [11]:
summaries = []

generatorStageOne = pipeline('summarization',
                        model='t5-small',
                        max_length=50)

generatorStageTwo = pipeline('summarization',
                        model='t5-small',
                        max_length=30)

# for i in range(len(transcripts)):
for title in transcripts.keys():
    print(f"Transcript: {title}")
    summary = get_summary(title)
    print(summary)
    # Append summary to file.
    with open("./summaries.txt", "a") as file:
        file.write(f"Lecture {title}\n" + summary + "\n\n")

Transcript: CS505 Lecture 22 Slide View
210
Progress: 0.0%
Progress: 9.803921568627452%
Progress: 19.607843137254903%
Progress: 29.411764705882355%
Progress: 39.21568627450981%
Progress: 49.01960784313725%
Progress: 58.82352941176471%
Progress: 68.62745098039215%
Progress: 78.43137254901961%
Progress: 88.23529411764706%
Progress: 98.0392156862745%
Stage 1: DONE


KeyboardInterrupt: 