<a href="https://colab.research.google.com/github/vkrisvasan/llamaKV/blob/main/llamaindexYouTubekvNaiveRAGTracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""This code sets up a question-answering system using a Youtube transcript about Neuralink.
It leverages a large language model ("llama-3.1-8b-instant") and
an embedding model ("sentence-transformers/all-MiniLM-L6-v2")
to understand and respond to user queries about the transcript.
The code also includes functionality to track
the number of API calls made to the language model."""
# Request access to gated model https://huggingface.co/meta-llama/Meta-Llama-3-8B and check if we have access thru https://huggingface.co/settings/gated-repos
# Install required packages
!pip install llama-index llama-index-llms-groq groq llama-index-embeddings-huggingface llama-index-readers-youtube-transcript -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.2/180.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from llama_index.llms.groq import Groq
from llama_index.core import (Settings,StorageContext,load_index_from_storage)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [3]:

# Import os and getpass for handling credentials
import os
import getpass
# Prompt for credentials if not found in environment variables
credential_names = ["GROQ_API_KEY"]
for credential in credential_names:
  if credential not in os.environ:
    os.environ[credential]=getpass.getpass("Provide your..." + credential)

Provide your...GROQ_API_KEY··········


In [30]:
from transformers import AutoTokenizer

class TrackedLLM:
    def __init__(self, llm):
        print("TrackedLLM initialized")
        self.llm = llm
        self.call_count = 0
        self.total_tokens_in = 0
        self.total_tokens_out = 0
        # Use meta-llama/Meta-Llama-3-8B model's tokenizer after getting access to model
        # Request access to gated model https://huggingface.co/meta-llama/Meta-Llama-3-8B and check if we have access thru https://huggingface.co/settings/gated-repos
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

    def __call__(self, input_text, query_engine_object):
      try:
        print(f"Input text: {input_text}")
        print("TrackedLLM called")
        self.call_count += 1


        # Count input tokens
        input_tokens = self.tokenizer.encode(input_text)
        print(f"Input tokens: {input_tokens}")
        self.total_tokens_in += len(input_tokens)
        print(f"Total input tokens: {self.total_tokens_in}")

        # Use the correct method on the LLM instance
        response = query_engine_object.query(input_text)
        print(f"Response: {response}")

        # Convert response to a string
        if isinstance(response, dict):
            output_text = response.get('text', '')
        else:
            output_text = str(response)

        # Ensure output_text is a string before encoding
        if not isinstance(output_text, str):
            raise ValueError("Output text must be a string.")

        # Count output tokens
        output_tokens = self.tokenizer.encode(output_text)
        print(f"Output tokens: {output_tokens}")
        self.total_tokens_out += len(output_tokens)
        print(f"Total output tokens: {self.total_tokens_out}")

        return response
      except Exception as e:
        print(f"Error calling LLM: {e}")
        return None


    def __getattr__(self, name):
        print(f"Fetching attribute for LLM: {name}")
        return getattr(self.llm, name)

class TrackedStorageContext:
    def __init__(self, storage_context):
        print("TrackedStorageContext initialized")
        self.storage_context = storage_context
        self.persist_call_count = 0

    def persist(self, *args, **kwargs):
        print("TrackedStorageContext persist called")
        self.persist_call_count += 1
        return self.storage_context.persist(*args, **kwargs)

    def __getattr__(self, name):
        print(f"Fetching attribute for storage: {name}")
        return getattr(self.storage_context, name)


In [31]:
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage

# Initialize the original LLM
llm = Groq(model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])

# Wrap the LLM with tracking
tracked_llm = TrackedLLM(llm)
Settings.llm = tracked_llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

TrackedLLM initialized


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:


# Initialize the YouTube transcript loader and load documents
links = ["https://www.youtube.com/watch?v=Kbk9BiPhm7o"]
loader = YoutubeTranscriptReader()
documents = loader.load_data(ytlinks=links)
print("Documents loaded")

# Create an index
index = VectorStoreIndex.from_documents(documents)

# Initialize the original storage context
storage_context = StorageContext.from_defaults()

# Wrap the storage context with tracking
tracked_storage_context = TrackedStorageContext(storage_context)

# Persist the index with tracking
index.storage_context.persist(persist_dir="./my_index_storage")
print("Index persisted")

# Load the index from storage with tracking
storage_context = StorageContext.from_defaults(persist_dir="./my_index_storage")
tracked_storage_context = TrackedStorageContext(storage_context)
index = load_index_from_storage(storage_context=tracked_storage_context)
print("Index loaded from storage")

# Create a query engine from the loaded index
query_engine = index.as_query_engine()
print("Query engine created")


Documents loaded
TrackedStorageContext initialized
Index persisted
TrackedStorageContext initialized
Fetching attribute for storage: index_store
Fetching attribute for storage: docstore
Fetching attribute for storage: vector_store
Fetching attribute for storage: graph_store
Fetching attribute for storage: index_store
Index loaded from storage
Fetching attribute for LLM: metadata
Query engine created


In [34]:
# Test direct LLM call
response = tracked_llm("What is the content of the video?", query_engine)
print("Direct LLM response: ")
print(response)
# Print out the tracking results
print(f"LLM Call Count: {tracked_llm.call_count}")
print(f"Total Input Tokens: {tracked_llm.total_tokens_in}")
print(f"Total Output Tokens: {tracked_llm.total_tokens_out}")
print(f"Persist Call Count: {tracked_storage_context.persist_call_count}")

Input text: What is the content of the video?
TrackedLLM called
Input tokens: [128000, 3923, 374, 279, 2262, 315, 279, 2835, 30]
Total input tokens: 18
Fetching attribute for LLM: predict
Fetching attribute for LLM: __pydantic_validator__
Response: The video appears to be a discussion about Brain-Computer Interface (BCI) technology, specifically focusing on the development and improvement of neural decoding and control systems. It involves a conversation about the capabilities and limitations of current BCI systems, including the use of neural implants and the importance of user interface design. The discussion also touches on the potential benefits of increasing the number of channels in a BCI system, including improved control quality and reliability.
Output tokens: [128000, 791, 2835, 8111, 311, 387, 264, 10430, 922, 31417, 12, 38432, 20620, 320, 5002, 40, 8, 5557, 11, 11951, 21760, 389, 279, 4500, 323, 16048, 315, 30828, 48216, 323, 2585, 6067, 13, 1102, 18065, 264, 10652, 922, 279

In [35]:

# Start a simple chat loop
while True:
    query = input("Ask a question: ")
    if query.lower() == "exit":
        break
    # Test LLM and persistence tracking in a full script context
    response = query_engine.query(query)
    print("Query engine response: ")
    print(response)
    # Print out the tracking results
    print(f"LLM Call Count: {tracked_llm.call_count}")
    print(f"Total Input Tokens: {tracked_llm.total_tokens_in}")
    print(f"Total Output Tokens: {tracked_llm.total_tokens_out}")
    print(f"Persist Call Count: {tracked_storage_context.persist_call_count}")
    # Persist the index after a query to check tracking
    index.storage_context.persist(persist_dir="./my_index_storage")
    print("Index persisted")

Ask a question: detail BCI
Fetching attribute for LLM: predict
Fetching attribute for LLM: __pydantic_validator__
Query engine response: 
Brain-Computer Interfaces (BCIs) are systems that enable people to control devices or communicate with others using only their brain signals. The process of developing a BCI involves several key components:

1. **Signal Acquisition**: This is the process of capturing brain signals from the user. This can be done using various techniques, such as electroencephalography (EEG), which measures electrical activity in the brain, or electrocorticography (ECoG), which measures electrical activity directly from the surface of the brain.

2. **Signal Processing**: Once the brain signals are acquired, they need to be processed to extract meaningful information. This involves filtering out noise, amplifying the signals, and transforming them into a format that can be understood by the computer.

3. **Decoding**: Decoding is the process of translating the brain s