In [None]:
# Step 1: Install Required Libraries
# !pip install -qU transformers torch accelerate bitsandbytes
# !pip install -qU langchain langchain_community langchain_huggingface
# !pip install -qU sentence-transformers faiss-cpu

In [None]:
# Step 2: Create our Knowledge Base 
# create example txt file as our external knowledge source
#
%%writefile my_knowledge.txt
The Aethelred Operating System, first released in 2023, is a lightweight, security-focused OS designed for embedded systems.
Its core feature is the "Chrono-Lock" sandbox, which isolates every application in a time-sensitive, encrypted container. This prevents unauthorized data access and rollback attacks.
Aethelred OS does not support traditional graphical user interfaces (GUIs); it is managed entirely through a command-line interface (CLI) called the "A-Shell".
The primary programming language for Aethelred development is "Veridian," a memory-safe language similar to Rust, but with built-in primitives for asynchronous hardware communication.
The latest version, Aethelred 3.1 "Dragon-Tooth," introduced support for multi-core processing, a feature that was absent in earlier versions.

Writing my_knowledge.txt


In [1]:
# Step 3: Phase 1 - Indexing our Document 📚

# Now, we'll load the document, split it into manageable chunks, create numerical embeddings for each chunk, 
# and store them in a searchable FAISS vector database.

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter

# 1. Load the document
loader = TextLoader("./my_knowledge.txt")
documents = loader.load()

# 2. Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)
print(f"Split document into {len(chunks)} chunks.")

# 3. Load a sentence-transformer model for embeddings
# This model is small, fast, and runs locally.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# 4. Create the FAISS vector store
# This command creates embeddings for the chunks and stores them.
vector_store = FAISS.from_documents(chunks, embedding_model)
print("Vector store created successfully.")

Split document into 1 chunks.


  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


Vector store created successfully.


In [4]:
!pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
from huggingface_hub import login
import os

# Replace "YOUR_HF_TOKEN" with your actual token
# login("YOUR_HF_TOKEN")
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")

In [3]:
# Step 4: Phase 2 - Setting up the RAG Chain

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Hugging Face Hub Login (if not already done in terminal)
# from huggingface_hub import login
# login("YOUR_HF_TOKEN")

# Define the model ID for Llama 3.1 8B Instruct
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "unsloth/llama-3.1-8b-bnb-4bit"

# Use quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Create a text-generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    top_p=0.9,
    temperature=0.1, # Use low temperature for factual answers
)

# Wrap the pipeline in a LangChain object
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Define the RAG prompt template
prompt_template = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant. Please answer the user's question based only on the following context. If the answer is not in the context, say you don't know. Do not use any prior knowledge.

CONTEXT:
{context}<|eot_id|><|start_header_id|>user<|end_header_id|>

QUESTION:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create the retriever from our vector store
retriever = vector_store.as_retriever(search_kwargs={"k": 2}) # Retrieve top 2 chunks

# Create the RAG chain using LangChain Expression Language (LCEL)
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("RAG chain created and ready to use.")

Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.28s/it]
Device set to use cuda:0


RAG chain created and ready to use.


In [4]:
def clean_answer(raw_output: str) -> str:
    if "assistant<|end_header_id|>" in raw_output:
        return raw_output.split("assistant<|end_header_id|>")[-1].strip()
    return raw_output.strip()

In [5]:
# Question 1: The answer is in the document.
question1 = "What is the primary programming language for Aethelred OS?"
print(f"Question: {question1}")

# Invoke the chain
# answer1 = rag_chain.invoke(question1)
answer1 = clean_answer(rag_chain.invoke(question1))
print(f"Answer: {answer1}")

print("\n" + "="*50 + "\n")

# Question 2: The answer is NOT in the document.
question2 = "Who is the CEO of the company that makes Aethelred OS?"
print(f"Question: {question2}")

# Invoke the chain
answer2 = clean_answer(rag_chain.invoke(question2))
print(f"Answer: {answer2}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the primary programming language for Aethelred OS?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Answer: The primary programming language for Aethelred OS is "Veridian", a memory-safe language similar to Rust, but with built-in primitives for asynchronous hardware communication.


Question: Who is the CEO of the company that makes Aethelred OS?
Answer: I don't know.
