# Imports

In [1]:
!pip install --upgrade pip



In [2]:
!pip install langchain langchain-community openai tiktoken pinecone-client langchain_pinecone unstructured 
!pip install pdfminer==20191125 pdfminer.six==20221105 pillow_heif unstructured_inference youtube-transcript-api pytube 
!pip install sentence-transformers python-dotenv retry datasets
!pip install -qU langchain-text-splitters
!pip install -U langchain-huggingface langchain-pinecone
!pip install -U sentence-transformers
!pip install --upgrade --quiet  praw
!pip install -qU langchain-pinecone pinecone-notebooks



In [7]:
import os
import ssl
import numpy as np
import pandas as pd
import requests
import torch

from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search


from langchain.text_splitter import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore

from langchain.document_loaders import YoutubeLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Env. Variables

In [8]:
load_dotenv()
pinecone_key = os.getenv("PINECONE_API_KEY")
openrouter_key = os.getenv("OPEN_ROUTER_API_KEY")
model_id = os.getenv("HF_MODEL_ID")
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{os.getenv("HF_MODEL_ID")}"

api_url
headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}

In [9]:
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True
)

NameError: name 'vectorstore' is not defined

In [10]:
def perform_rag():
    result = qa_chain({"query": query})
    return result["result"]


query = perform_rag()
print(query)

NameError: name 'qa_chain' is not defined

In [None]:
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

In [None]:
urls = [
    "https://www.youtube.com/watch?v=8EYW2v4G9bw",
    "https://youtu.be/fOnUAAUXC1E?si=aQueTvxQ02BWq6se",
    "https://youtu.be/SD0PprfHFd0?si=jWGehEKC84k4uZiQ",
    "https://youtu.be/NUhDP30IRKk?si=i3PNGHL4rOVTCcWB",
    "https://youtu.be/Tt08KmFfIYQ?si=U2a7mdAThgrWlW5x",
    "https://youtu.be/31EWjB_9Jig?si=RMP2YfFyTxp0qTJA",
    "https://youtu.be/Ka0JgkZTHwY?si=cFV8Chv3Hscgl7DA",
    "https://youtu.be/pjqi_M3SPwY?si=UWcrvfx7FFPaZ9dk",
    "https://youtu.be/0SARbwvhupQ?si=SblTzsXZ5Ueja2qY",
    "https://www.youtube.com/live/InHF1nl4pu0?si=aFSGO4ovz4IlHzgv"
    
]

In [None]:
def load_videos(url):
    loader = YoutubeLoader.from_youtube_url(
    url, add_video_info=True
    )
    data = loader.load()
    return data


text = []
def get_texts(urls):
    for url in urls:
        data = load_videos(url)
        text.append(text_splitter.split_documents(data))
        return text
    
texts = get_texts(urls)
    

In [None]:
def get_embeddings(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options": {"wait_for_model": True}})
    return response.json()

embeddings = HuggingFaceEmbeddings(api_url=api_url, headers=headers)


In [None]:
output = query(texts[0].page_content)
video_embeddings = pd.DataFrame(output)
video_embeddings

In [None]:
video_embeds = torch.from_numpy(video_embeddings.to_numpy()).to(torch.float)

# Check if video_embeds needs to be reshaped
if video_embeds.shape[0] == 384 and video_embeds.shape[1] == 1:
    video_embeds = video_embeds.T  # Transpose to get (num_videos, 384)

# For testing purposes, unsqueeze if it is a 1D tensor to make it 2D
if video_embeds.dim() == 1:
    video_embeds = video_embeds.unsqueeze(0)

In [None]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = os.getenv("PINECONE_INDEX_NAME")
index = pc.Index(os.getenv("PINECONE_INDEX_NAME"))
namespace = os.getenv("PINECONE_NAMESPACE")

vectorstore = PineconeVectorStore.from_texts(
    [f"Source: {t.metadata['source']}\n\nContent: {t.page_content}" for t in texts],
    embeddings,
    index_name=index_name,
    namespace=namespace
)

In [None]:

question = perform_rag()
output = query(question)

query_embeddings = torch.FloatTensor(output)


In [None]:
hits = semantic_search(query_embeddings, video_embeds, top_k=5)

In [None]:

matches = ([texts[hits[0][i]['corpus_id']] for i in range(len(hits[0]))])

In [None]:
def query_pinecone(query):
    query_embedding = get_embeddings([query])[0]
    results = index.query(
        vector=query_embedding,
        top_k=5,
        include_metadata=True,
        namespace=namespace
    )
    return results


results = query_pinecone("Your question here")
for result in results['matches']:
    print(f"Score: {result['score']}, Metadata: {result['metadata']}")