In [1]:
import os
import re
from getpass import getpass
from pprint import pprint
from typing import Optional

from dotenv import load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_core.utils.utils import secret_from_env
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from pydantic import Field, SecretStr
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# Prompt for the OpenRouter key without echoing it, and publish it to the
# process environment in the same step so later cells can read it from there.
os.environ["OPENROUTER_API_KEY"] = api_key = getpass("Enter your OpenRouter API key here: ")


In [3]:
def extract_video_id(url):
    """
    Extract YouTube video ID from URL.

    Recognised forms, tried in this order:
      * a ``v=<id>`` query parameter (``watch?v=<id>``),
      * ``youtu.be/<id>`` short links,
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths,
      * as a last resort, any path segment that is exactly 11 ID characters.

    Parameters:
        url (str): The YouTube URL to extract the video ID from.
    Returns:
        str: The extracted video ID, or None if no valid ID is found
            (including when ``url`` is not a string).
    """
    if not isinstance(url, str):
        return None

    # An ID is exactly 11 characters from [0-9A-Za-z_-]. The negative lookahead
    # rejects the first 11 characters of a longer token, so a path such as
    # "/feed/subscriptions" is no longer mistaken for a video ID.
    video_id = r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'

    # Specific forms come first; the generic "/<id>" fallback is last so it can
    # no longer shadow them.
    patterns = [
        r'(?<![0-9A-Za-z_-])v=' + video_id,
        r'youtu\.be/' + video_id,
        r'/(?:embed|shorts|live|v)/' + video_id,
        r'/' + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_youtube_transcript(url):
    """
    Get the transcript of a YouTube video.
    Parameters:
        url (str): The YouTube URL to get the transcript from.
    Returns:
        str: The transcript text (caption entries joined with single spaces),
            or a string starting with "Error: " if the URL is invalid or the
            transcript could not be fetched.
    """
    try:
        video_id = extract_video_id(url)
        if not video_id:
            raise ValueError("Invalid YouTube URL")

        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in transcript_list)

    except Exception as e:
        # Failures are reported in the return value rather than raised. Return
        # a plain string (not a tuple) so both paths have the documented type.
        return f"Error: {e}"

In [4]:
# Source video to transcribe and summarize (a standard watch URL; the video ID
# is carried in the `v=` query parameter).
youtube_url = 'https://www.youtube.com/watch?v=q4DQaMtHvsI&ab_channel=InstituteofPolicyStudies%28IPS%29%2CSingapore'

In [5]:
text = get_youtube_transcript(youtube_url)

# get_youtube_transcript reports failures through its return value instead of
# raising, so stop here rather than embedding and summarizing an error message.
if not isinstance(text, str) or text.startswith("Error:"):
    raise RuntimeError(f"Could not fetch transcript for {youtube_url}: {text}")

# Show a bounded preview; the full transcript is long enough to bury the rest
# of the notebook.
print(f"Transcript length: {len(text):,} characters")
print(text[:1000])

so it is my pleasure to welcome you to the IPS NAD lecture series by Mr Philip yo our 16 Sr NAD fellow for the study of Singapore today Mr y will be delivering his first lecture titled charting Singapore's economic transformation following his lecture Mr yo will take questions from the audience in the Q&A session the Q&A session will be checked by Mr Chun Kong permanent secretary information development at the ministry of digital development and information before we begin please allow me to go over some housekeeping rules for the event thank you for joining us today at the auditorium please be reminded to switch your mobile phones to silent mode the lectur is being streamed live on Facebook it will also be recorded and uploaded onto our IPS website and our social media platforms later please subm me your comments and questions at anytime during the lecture through the Facebook comments for audience members here at auditorium today please step up to the mic during the Q&A session to as

In [6]:
#https://huggingface.co/NovaSearch/stella_en_400M_v5/blob/main/README.md
# Define the model name and configuration
model_name = "dunzhang/stella_en_400M_v5"
model_kwargs = {
    # The checkpoint ships custom modelling code, so remote code must be trusted.
    'trust_remote_code': True,
    'device': 'cpu',
    # Disable the xformers-dependent code paths so the model runs on CPU.
    'config_kwargs': {
        'use_memory_efficient_attention': False,
        'unpad_inputs': False
    }
}
encode_kwargs = {
    'normalize_embeddings': False
}

# Initialize the HuggingFaceEmbeddings with the Stella model
stella_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# embed_documents expects a list of texts. Passing the bare string would be
# iterated character by character and yield one embedding per character.
# NOTE(review): the whole transcript is embedded as a single document here and
# may exceed the model's maximum input length - confirm this is intended.
document_embeddings = stella_embeddings.embed_documents([text])

    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.6.0+cpu)
    Python  3.12.8 (you have 3.12.4)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: {'new.pooler.dense.bias', 'new.pooler.dense.weight'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
#https://python.langchain.com/api_reference/experimental/text_splitter/langchain_experimental.text_splitter.SemanticChunker.html
# Build a splitter that uses the Stella embeddings to decide where to break.
chunker = SemanticChunker(embeddings=stella_embeddings)

# Cut the transcript into chunks and wrap each one in a Document.
chunks = chunker.split_text(text)
documents = list(map(lambda piece: Document(page_content=piece), chunks))

# Print every chunk under a 1-based heading, separated by a dashed rule.
separator = "-" * 50
for position, document in enumerate(documents, start=1):
    print(f"Document {position}:")
    pprint(document.page_content)
    print(separator)

Document 1:
so it is my pleasure to welcome you to the IPS NAD lecture series by Mr Philip yo our 16 Sr NAD fellow for the study of Singapore today Mr y will be delivering his first lecture titled charting Singapore's economic transformation following his lecture Mr yo will take questions from the audience in the Q&A session the Q&A session will be checked by Mr Chun Kong permanent secretary information development at the ministry of digital development and information before we begin please allow me to go over some housekeeping rules for the event thank you for joining us today at the auditorium please be reminded to switch your mobile phones to silent mode the lectur is being streamed live on Facebook it will also be recorded and uploaded onto our IPS website and our social media platforms later please subm me your comments and questions at anytime during the lecture through the Facebook comments for audience members here at auditorium today please step up to the mic during the Q&A s

In [11]:
# Load variables from a local .env file, if one exists, into the process
# environment before the model wrapper below reads OPENROUTER_API_KEY.
# NOTE(review): with default arguments this should not override a key already
# set earlier in the session - confirm against python-dotenv's `override` flag.
load_dotenv()

# https://openrouter.ai/google/gemini-2.0-flash-exp:free/api
class ChatOpenRouter(ChatOpenAI):
    """ChatOpenAI variant that talks to OpenRouter's endpoint.

    The API key is taken from the ``openai_api_key`` argument when given,
    otherwise from the ``OPENROUTER_API_KEY`` environment variable. The base
    URL defaults to ``https://openrouter.ai/api/v1`` and can be overridden by
    passing ``base_url``.
    """

    # Declared with the "api_key" alias; when no value is supplied the default
    # factory reads OPENROUTER_API_KEY from the environment (None if unset).
    openai_api_key: Optional[SecretStr] = Field(
        alias="api_key",
        default_factory=secret_from_env("OPENROUTER_API_KEY", default=None),
    )

    @property
    def lc_secrets(self) -> dict[str, str]:
        """Map the secret field name to the environment variable that holds it."""
        return {"openai_api_key": "OPENROUTER_API_KEY"}

    def __init__(self,
                 openai_api_key: Optional[str] = None,
                 **kwargs):
        """
        Parameters:
            openai_api_key (Optional[str]): OpenRouter API key. Falls back to
                the OPENROUTER_API_KEY environment variable when omitted.
            **kwargs: Forwarded unchanged to ChatOpenAI.
        """
        openai_api_key = (
            openai_api_key or os.environ.get("OPENROUTER_API_KEY")
        )
        # Use setdefault rather than a hard-coded keyword so that a caller who
        # passes base_url explicitly does not trigger
        # "got multiple values for keyword argument 'base_url'".
        kwargs.setdefault("base_url", "https://openrouter.ai/api/v1")
        super().__init__(
            openai_api_key=openai_api_key,
            **kwargs
        )

# Model identifier as listed on OpenRouter; kept in one place so it is easy to swap.
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
llm = ChatOpenRouter(model_name=OPENROUTER_MODEL)

In [None]:
# Reference https://medium.com/the-data-perspectives/custom-prompts-for-langchain-chains-a780b490c199
# Prompt for the first chunk: produces the initial summary from {text}.
question_template = PromptTemplate.from_template("""
Write a concise summary of the following youtube transcript with key takeaways for the audience:
"{text}"
CONCISE SUMMARY:""")

# Prompt for every later chunk: receives the running summary as
# {existing_answer} and the next chunk as {text}.
refine_template = PromptTemplate.from_template("""Your job is to produce a final key takeaways summary.
We have provided an existing summary up to a certain point: {existing_answer}

We have the opportunity to refine the existing summary (only if needed) with some more context below.
------------
{text}
------------
Given the new context, refine the original summary with new key takeaways.
If the context isn't useful, return the original summary.\
""")

# Load the refine summarization chain
chain = load_summarize_chain(
    llm,
    chain_type="refine",
    question_prompt=question_template,
    refine_prompt = refine_template,
    # verbose=True echoes every formatted prompt (i.e. each chunk) to the output.
    verbose=True,
    document_variable_name="text", 
    initial_response_name="existing_answer"
)

# Pass the documents under the chain's input key explicitly.
output_comb = chain.invoke({"input_documents": documents})

# The result dict also carries the input documents back; print only the
# summary so the cell output is not a second copy of the transcript.
print(output_comb["output_text"])



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Write a concise summary of the following youtube transcript with key takeaways for the audience:
"so it is my pleasure to welcome you to the IPS NAD lecture series by Mr Philip yo our 16 Sr NAD fellow for the study of Singapore today Mr y will be delivering his first lecture titled charting Singapore's economic transformation following his lecture Mr yo will take questions from the audience in the Q&A session the Q&A session will be checked by Mr Chun Kong permanent secretary information development at the ministry of digital development and information before we begin please allow me to go over some housekeeping rules for the event thank you for joining us today at the auditorium please be reminded to switch your mobile phones to silent mode the lectur is being streamed live on Facebook it will also be recorded and uploaded onto our IPS website and o