<a href="https://colab.research.google.com/github/t3manish/Build-chat-applications-with-openai-and-langchain/blob/main/Langchain_YT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import userdata
import os

# Expose the key stored under 'OPENAI_API_KEY' in Colab's userdata store as an
# environment variable of the same name, so later cells can read it from there.
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))

In [3]:
# 1. Install all required libraries
!pip install langchain langchain-openai langchain-community youtube-transcript-api chromadb tiktoken -q

# 2. Import necessary libraries
import os
import getpass
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
google-adk 1.17.0 requires opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.38.0 which is incompatible.
google-adk 1.17.0 requires opentelemetry-sdk<=1.37.0,>=1.37.0, but you have opentelemetry-sdk 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-proto==1.37.0, but you have opentelemetry-proto 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-sdk~=1.37.0, but you have opentelemetry-sdk 1.38.0 which is incompati

In [None]:
# 1. Install all required libraries
print("--- Installing libraries ---")
!pip install langchain langchain-openai langchain-community youtube-transcript-api chromadb tiktoken -q

# 2. Import all necessary classes
import os
import getpass
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# 3. Securely get your OpenAI API Key
# If the environment does not already hold a usable key, running this cell
# shows a hidden text box: paste your API key there and press Enter.
# An empty or whitespace-only value is treated the same as a missing one, so a
# blank variable left over from an earlier step no longer skips the prompt.
if not os.environ.get('OPENAI_API_KEY', '').strip():
    # Strip stray whitespace/newlines that often come along with a pasted key.
    os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter your OpenAI API Key: ").strip()
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 1
# -----------------------------------------------------------------
# Load the video's transcript, strip two unwanted characters, and report the
# resulting character count.
print("--- Starting Question 1 ---")
video_url = "https://youtu.be/6Fa91SY9Gnw?feature=shared"

# Use YoutubeLoader as specified in the answer key
loader_youtube = YoutubeLoader.from_youtube_url(video_url)

# Guard against an empty result: indexing [0] on an empty list would otherwise
# fail with an IndexError that says nothing about the actual problem.
loaded_docs = loader_youtube.load()
if not loaded_docs:
    raise RuntimeError(f"No transcript could be loaded for {video_url}")
transcript_doc = loaded_docs[0]
transcript = transcript_doc.page_content

# Clean the text as specified: drop non-breaking spaces (U+00A0) and the
# private-use character U+F0A7. Both are removed outright, not replaced.
transcript_stripped = transcript.replace(u'\xa0', u'').replace(u'\uf0a7', u'')

# Find the number of characters
char_count = len(transcript_stripped)
print(f"Q1 Result: Number of characters in transcript_stripped: {char_count}")
print("(Note: The live video transcript may have changed. The quiz answer was based on 5,853.)")
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 2
# -----------------------------------------------------------------
# Split the cleaned transcript into chunks and report how many were produced.
print("--- Starting Question 2 ---")

# Splitter settings from the answer key: split on ".", chunks of up to 500
# characters, no overlap between neighbouring chunks.
char_splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=0)

# split_text() takes the raw string and yields the chunks as a list of strings.
transcript_split = char_splitter.split_text(transcript_stripped)

# The quiz asks for the number of resulting chunks ("documents").
doc_count = len(transcript_split)
print(f"Q2 Result: Number of documents in transcript_split: {doc_count}")
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 3
# -----------------------------------------------------------------
# Embed the transcript chunks into a persisted Chroma store, build an MMR
# retriever over it, and run one test query.
print("--- Starting Question 3 ---")
# Create embeddings
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

# Give every chunk a deterministic id. The store is persisted on disk, so
# without explicit ids each re-run of this cell would insert the same chunks
# again under fresh ids and the retriever would start returning duplicates.
# NOTE(review): relies on Chroma upserting when an id already exists — confirm
# for the installed langchain-community version.
chunk_ids = [f"transcript-chunk-{i}" for i in range(len(transcript_split))]

# Create Chroma vector store using from_texts()
vectorstore = Chroma.from_texts(
    texts=transcript_split,  # Use the list of strings from Q2
    embedding=embedding,
    ids=chunk_ids,
    persist_directory="./python-projects"
)

# Define the retriever as specified: MMR search returning 3 documents, with
# lambda_mult=0.3 as given in the answer key.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 3, 'lambda_mult': 0.3}
)

# Invoke the retriever to test
query = "Could you tell me all essential beginner Python projects according to the video?"
retrieved_docs = retriever.invoke(query)

print(f"Q3 Result: Retrieved {len(retrieved_docs)} documents.")
print("--- Content of Retrieved Docs (for context) ---")
# Show only the first 150 characters of each hit to keep the output short.
for i, doc in enumerate(retrieved_docs):
    print(f"[DOCUMENT {i+1}]:\n{doc.page_content[:150]}...\n")
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 4
# -----------------------------------------------------------------
# Build the two-message chat prompt: a system message carrying the retrieved
# context and a human message carrying the user's question.
print("--- Starting Question 4 ---")

# Raw template strings, as specified in the answer key.
TEMPLATE_1 = (
    "You are a helpful chatbot that answers questions on YouTube videos.\n"
    "Answer the questions using only the following context:\n"
    "{context}"
)
TEMPLATE_2 = "{question}"

# Wrap each string in the message template matching its role.
message_template_1 = SystemMessagePromptTemplate.from_template(TEMPLATE_1)
message_template_2 = HumanMessagePromptTemplate.from_template(TEMPLATE_2)

# Combine into a single chat template (system message first, then human).
prompt_messages = [message_template_1, message_template_2]
chat_template = ChatPromptTemplate.from_messages(prompt_messages)
print("Q4 Result: chat_template created successfully.")
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 5
# -----------------------------------------------------------------
# Assemble the RAG chain: retrieve context -> fill prompt -> chat model -> str.
print("--- Starting Question 5 ---")
# Define the chat model
# `seed` is passed as its own argument rather than through `model_kwargs`:
# it is a declared ChatOpenAI parameter, and routing declared parameters via
# `model_kwargs` is flagged by langchain-openai (a warning or an error,
# depending on the installed version).
chat = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    max_tokens=250,
    seed=365
)

# Define the output parser
str_parser = StrOutputParser()

# Define the first part of the chain (the valid 'RunnableParallel' map)
# This dictionary shorthand is the most common valid syntax:
# the chain input goes to the retriever to fill "context" and is passed
# through unchanged as "question".
retriever_step = {
    "context": retriever,
    "question": RunnablePassthrough()
}

# Construct the full chain
chain = retriever_step | chat_template | chat | str_parser
print("Q5 Result: RAG chain created successfully.")
print("----------------------------\n")

# -----------------------------------------------------------------
# START OF SOLUTION FOR QUESTION 6
# -----------------------------------------------------------------
# Run the full chain on the same query used to test the retriever and print
# the model's answer between two banner lines.
print("--- Starting Question 6 ---")
response = chain.invoke(query)

# One print call with a newline separator emits the banner, the answer and the
# closing rule exactly as three consecutive print calls would.
print(
    "\n--- FINAL ANSWER FROM CHAIN ---\n",
    response,
    "\n-------------------------------\n",
    sep="\n",
)

# Commentary from the answer key. It is printed unconditionally and is not
# derived from the actual response above.
print("Q6 Result: As the answer key explains, the response should reference only 2 projects.")
print("The LLM correctly identified that the 3rd retrieved document was irrelevant and ignored it.")