### Cell 10: Import Libraries for RAG and Define Configurations


In [1]:
# Cell 10: Imports and Core Configurations
import os
import torch
import gc # For garbage collection
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# LangChain components
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
# from langchain_core.messages import SystemMessage, HumanMessage, AIMessage # Not directly used in final chain

print("--- Phase 3: RAG Chain - TinyLlama & Embeddings on CUDA (Optimized) ---")

# --- Core Configurations ---
# Every tunable that the later cells read is defined here.
LLM_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EMBEDDING_MODEL_NAME_FOR_FAISS = "sentence-transformers/all-mpnet-base-v2"
DB_FAISS_PATH = "vectorstore/db_faiss_eu_ai_act"  # Relative path; ensure it is correct for the current working directory
# Number of chunks (k) the retriever returns per query. All retrieved chunks are
# placed in the prompt, so lower this (e.g. to 1) if VRAM becomes a problem.
NUM_CHUNKS_TO_RETRIEVE = 5
# Upper bound on generated tokens per answer; kept short to save KV cache VRAM.
MAX_NEW_TOKENS_LLM = 190

# Device Configuration
# This notebook has no CPU fallback: fail immediately when no GPU is visible.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script is configured to run all components on CUDA.")
DEVICE = "cuda"
CUDA_DEVICE_ID = 0  # Assuming GPU 0
torch.cuda.set_device(CUDA_DEVICE_ID)  # Explicitly set default CUDA device for PyTorch

print(f"Targeting device: {DEVICE}:{CUDA_DEVICE_ID}")
print(f"LLM: {LLM_MODEL_ID}")
print(f"Embedding Model for FAISS: {EMBEDDING_MODEL_NAME_FOR_FAISS}")
print(f"FAISS DB path: {DB_FAISS_PATH}")
print(f"Chunks to retrieve (k): {NUM_CHUNKS_TO_RETRIEVE}")
print(f"Max new tokens for LLM: {MAX_NEW_TOKENS_LLM}")


# Clear cache at the beginning of the session.
# Collect Python garbage first so that tensors kept alive only by unreachable
# objects are released before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()
print("Initial CUDA cache cleared.")

--- Phase 3: RAG Chain - TinyLlama & Embeddings on CUDA (Optimized) ---
Targeting device: cuda:0
LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Embedding Model for FAISS: sentence-transformers/all-mpnet-base-v2
FAISS DB path: vectorstore/db_faiss_eu_ai_act
Chunks to retrieve (k): 5
Max new tokens for LLM: 190
Initial CUDA cache cleared.


### Cell 11: Configure and Load TinyLlama Model & Tokenizer 




In [2]:
# Cell 11: Configure and Load TinyLlama Model & Tokenizer (Optimized for CUDA)

# Module-level handles filled in below. They start as None so that later cells
# can test whether loading succeeded.
llm_model = None
llm_tokenizer = None
bnb_config_llm = None
# NOTE(review): dtype_to_use_llm is not read anywhere in the visible cells; confirm it is still needed.
dtype_to_use_llm = torch.float16 # Defaulting to float16 for CUDA with BnB

# Step 1: build the 4-bit (NF4) quantization config that is passed to from_pretrained below.
print("\nConfiguring BitsAndBytes for 4-bit LLM quantization on CUDA...")
try:
    bnb_config_llm = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16, # float16 for compute with TinyLlama
        bnb_4bit_use_double_quant=False,
    )
    print("BitsAndBytesConfig for LLM created successfully.")
except Exception as e:
    print(f"FATAL ERROR: Could not configure BitsAndBytesConfig for LLM: {e}")
    print("This is essential for running on limited VRAM. Halting.")
    # Stop the cell: without the quantization config the GPU load below is not attempted.
    raise SystemExit("BitsAndBytesConfig failed, cannot proceed with GPU LLM loading.")


# Step 2: load the tokenizer, then the quantized model, onto the configured CUDA device.
print(f"\nAttempting to load LLM: {LLM_MODEL_ID} on {DEVICE}:{CUDA_DEVICE_ID}")
try:
    # NOTE(review): trust_remote_code=True permits running code shipped in the model
    # repository; confirm it is actually required for LLM_MODEL_ID.
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if llm_tokenizer.pad_token is None:
        llm_tokenizer.pad_token = llm_tokenizer.eos_token
        print(f"LLM Tokenizer pad_token set to eos_token ({llm_tokenizer.eos_token_id}).")
    print("LLM tokenizer loaded successfully.")

    model_load_kwargs = {
        "quantization_config": bnb_config_llm,
        "device_map": {"": CUDA_DEVICE_ID}, # Pin model to the specified CUDA device
        "trust_remote_code": True,
        "attn_implementation": "eager", # Most stable attention
        # No explicit torch_dtype here, bnb_config's compute_dtype and device_map handle it
    }

    llm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, **model_load_kwargs)
    print("LLM model loaded successfully!")

    # Keep the model config in sync with the tokenizer: when pad == eos and the
    # model config has no pad_token_id yet, copy the EOS id into it.
    if llm_tokenizer.pad_token == llm_tokenizer.eos_token and \
       getattr(llm_model.config, 'pad_token_id', None) is None and \
       llm_tokenizer.eos_token_id is not None:
        llm_model.config.pad_token_id = llm_tokenizer.eos_token_id
        print(f"Model config.pad_token_id updated to {llm_tokenizer.eos_token_id}.")

    # Set model to evaluation mode
    llm_model.eval()

except Exception as e:
    # Any failure above is fatal: print the traceback, reset the handles and stop the cell.
    print(f"---------------------------------------------")
    print(f"FATAL ERROR loading LLM or tokenizer: {e}")
    print(f"---------------------------------------------")
    import traceback; traceback.print_exc()
    llm_model = None; llm_tokenizer = None
    raise SystemExit("LLM loading failed.")


# Post-load check: confirm both handles exist, then report device and VRAM usage.
# Explicit `is not None` tests avoid relying on object truthiness (for the
# tokenizer that would be decided by its __len__).
if llm_model is not None and llm_tokenizer is not None:
    print(f"\nLLM {LLM_MODEL_ID} loaded. Device: {llm_model.device}.")
    # Quick VRAM check after model load
    if DEVICE == "cuda":
        # Collect garbage before emptying the cache so the figures below do not
        # include memory held only by unreachable Python objects.
        gc.collect()
        torch.cuda.empty_cache()
        print(f"CUDA memory allocated: {torch.cuda.memory_allocated(CUDA_DEVICE_ID)/1024**2:.2f} MB")
        print(f"CUDA memory reserved: {torch.cuda.memory_reserved(CUDA_DEVICE_ID)/1024**2:.2f} MB")
else:
    print(f"\nFailed to load LLM {LLM_MODEL_ID} or its tokenizer. Halting.")
    raise SystemExit("LLM or Tokenizer failed to load.")


Configuring BitsAndBytes for 4-bit LLM quantization on CUDA...
BitsAndBytesConfig for LLM created successfully.

Attempting to load LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 on cuda:0




LLM tokenizer loaded successfully.


W0619 14:23:39.670000 12668 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


LLM model loaded successfully!
Model config.pad_token_id updated to 2.

LLM TinyLlama/TinyLlama-1.1B-Chat-v1.0 loaded. Device: cuda:0.
CUDA memory allocated: 789.51 MB
CUDA memory reserved: 836.00 MB


### Cell 12: Load FAISS Vector Store and Initialize Retriever 



In [3]:
# Cell 12: Load FAISS Vector Store and Initialize Retriever (Embeddings on CUDA)

# Module-level handles filled in below. They start as None so that later cells
# can test whether loading succeeded.
db_retriever = None
embedding_model_for_retrieval = None

print(f"\nLoading FAISS vector store from: {DB_FAISS_PATH}")
# Stop early with a clear message when the index directory is missing.
if not os.path.exists(DB_FAISS_PATH):
    print(f"FATAL ERROR: FAISS index not found at {DB_FAISS_PATH}. Please ensure Phase 1 was completed.")
    raise SystemExit("FAISS index not found.")

try:
    print(f"Attempting to load embedding model ({EMBEDDING_MODEL_NAME_FOR_FAISS}) for FAISS on device: {DEVICE}")
    # NOTE(review): the model name and normalize_embeddings setting presumably have to
    # match what was used when the index was built; confirm against the indexing notebook.
    embedding_model_for_retrieval = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME_FOR_FAISS,
        model_kwargs={'device': DEVICE}, # <<< EMBEDDINGS ON CUDA
        encode_kwargs={'normalize_embeddings': True}
    )
    print(f"Embedding model for FAISS retrieval loaded successfully on {DEVICE}.")

    # Set model to evaluation mode (good practice for sentence-transformers models too).
    # The hasattr guards mean this is silently skipped when the wrapper exposes no
    # `client` attribute with an `eval` method.
    if hasattr(embedding_model_for_retrieval, 'client') and hasattr(embedding_model_for_retrieval.client, 'eval'):
        embedding_model_for_retrieval.client.eval()


    # NOTE(review): allow_dangerous_deserialization=True opts in to loading serialized
    # (pickle-based) data from DB_FAISS_PATH; only do this for index files you created yourself.
    faiss_db = FAISS.load_local(
        DB_FAISS_PATH,
        embedding_model_for_retrieval,
        allow_dangerous_deserialization=True
    )
    print("FAISS vector store loaded successfully.")

    # Expose the store as a retriever that returns the top-k chunks per query.
    db_retriever = faiss_db.as_retriever(search_kwargs={"k": NUM_CHUNKS_TO_RETRIEVE})
    print(f"Retriever created. Will retrieve top {db_retriever.search_kwargs['k']} document chunk(s).")

except Exception as e:
    # Any failure above is fatal: print the traceback, reset the handle and stop the cell.
    print(f"--------------------------------------------------")
    print(f"FATAL ERROR loading FAISS DB or initializing retriever: {e}")
    print(f"--------------------------------------------------")
    import traceback; traceback.print_exc()
    db_retriever = None
    raise SystemExit("FAISS/Retriever loading failed.")

# Post-load check: confirm the retriever exists, then report VRAM usage.
# An explicit `is not None` test avoids relying on the retriever object's truthiness.
if db_retriever is not None:
    print("\nFAISS DB and Retriever are ready.")
    if DEVICE == "cuda":
        # Collect garbage before emptying the cache so the figures below do not
        # include memory held only by unreachable Python objects.
        gc.collect()
        torch.cuda.empty_cache()
        print(f"After FAISS load - CUDA memory allocated: {torch.cuda.memory_allocated(CUDA_DEVICE_ID)/1024**2:.2f} MB")
        print(f"After FAISS load - CUDA memory reserved: {torch.cuda.memory_reserved(CUDA_DEVICE_ID)/1024**2:.2f} MB")
else:
    print("Retriever not initialized. Halting.")
    raise SystemExit("Retriever initialization failed.")


Loading FAISS vector store from: vectorstore/db_faiss_eu_ai_act
Attempting to load embedding model (sentence-transformers/all-mpnet-base-v2) for FAISS on device: cuda




Embedding model for FAISS retrieval loaded successfully on cuda.
FAISS vector store loaded successfully.
Retriever created. Will retrieve top 5 document chunk(s).

FAISS DB and Retriever are ready.
After FAISS load - CUDA memory allocated: 1209.73 MB
After FAISS load - CUDA memory reserved: 1286.00 MB


### Cell 13: Define RAG Prompt and Custom LLM Interaction Logic


In [4]:
# Cell 13: Define RAG Prompt, Message Preparer, and Custom LLM Invoker (Simplified & Enhanced)

# Initialize to None so both names exist (and can be inspected by the status
# prints at the end of this cell) even when the prerequisite check below fails.
prepare_messages_runnable = None
custom_llm_invoker = None
# rag_prompt_for_tinyllama is not directly used in the chain if prepare_messages_runnable handles all formatting.

print("\n--- Attempting to Define RAG Components in Cell 13 (Simplified & Enhanced) ---")

# Crucial Check: Ensure LLM, Tokenizer, MAX_NEW_TOKENS_LLM, and DEVICE from earlier cells are loaded.
# The `in locals()` tests let this cell report a missing prerequisite instead of raising NameError.
if 'llm_model' in locals() and llm_model is not None and \
   'llm_tokenizer' in locals() and llm_tokenizer is not None and \
   'MAX_NEW_TOKENS_LLM' in locals() and isinstance(MAX_NEW_TOKENS_LLM, int) and \
   'DEVICE' in locals() and DEVICE is not None:

    print("LLM Model, Tokenizer, MAX_NEW_TOKENS_LLM, and DEVICE found. Proceeding...")

    # 1. Define the Enhanced System Prompt (Content only).
    # It restricts answers to the supplied context and fixes the exact refusal
    # sentence to use when the context is insufficient.
    enhanced_system_prompt_content_str = """You are an AI assistant. Your sole task is to answer the user's question based *only* on the provided 'Context from EU AI Act'.
Be concise and factual. Synthesize the information from the context into a coherent answer.
If the context does not contain enough information to answer the question, you MUST state: 'The provided context does not contain sufficient information to answer this question.'
Do not use any external knowledge or make assumptions beyond the context. Output only the answer."""
    # No <|system|> / </s> markers in the text above: apply_chat_template adds the
    # role formatting from the message dicts.
    # 2. Define the Enhanced Message Preparation Function
    def prepare_rag_messages_enhanced(input_dict: dict) -> list:
        """Build the system + user chat messages for one RAG query.

        Args:
            input_dict: Mapping with a "context" entry (the formatted retrieved
                text) and a "question" entry (the user's question).

        Returns:
            A two-element list of {"role", "content"} dicts: the fixed system
            prompt followed by a user turn that embeds the context and question.

        Raises:
            KeyError: If "context" or "question" is missing from input_dict.
        """
        retrieved_context = input_dict["context"]
        question = input_dict["question"]

        divider = "--------------------"
        # Assemble the user turn line by line; joined with newlines this is the
        # same text the prompt template has always produced.
        user_turn_lines = [
            "Here is some relevant context from the EU AI Act:",
            divider,
            f"{retrieved_context}",
            divider,
            f"Considering *only* this context, please answer the following question: {question}",
            "Your answer should be a synthesized, natural language response based on the context.",
        ]

        return [
            {"role": "system", "content": enhanced_system_prompt_content_str},
            {"role": "user", "content": "\n".join(user_turn_lines)},
        ]

    # Wrap the plain function in a RunnableLambda so it can be used as a chain step.
    prepare_messages_runnable = RunnableLambda(prepare_rag_messages_enhanced)
    print("Enhanced RAG message preparation runnable defined successfully.")


    # 3. Define the LLM Invocation with Adjusted Parameters
    # This function will use llm_model, llm_tokenizer, MAX_NEW_TOKENS_LLM, DEVICE from the outer scope (Cell 10 & 11)
    def invoke_llm_and_decode_simple(messages_for_llm: list) -> str:
        """Generate an answer for a chat message list and return only the new text.

        Uses llm_model, llm_tokenizer and MAX_NEW_TOKENS_LLM from the enclosing
        notebook scope.

        Args:
            messages_for_llm: List of {"role", "content"} dicts, as accepted by
                the tokenizer's chat template.

        Returns:
            The decoded completion (prompt tokens and special tokens removed),
            a fixed placeholder sentence when the completion is empty, or an
            "ERROR in generation: ..." string for non-memory failures.

        Raises:
            Exception: Out-of-memory errors are re-raised instead of being
                turned into an answer string, so the caller's own
                "out of memory" handling can stop further generation.
        """
        try:
            # return_dict=True yields the attention mask alongside the input ids.
            # Passing the mask explicitly matters here because pad_token_id equals
            # eos_token_id, so generate() cannot infer the mask from the ids.
            encoded_prompt = llm_tokenizer.apply_chat_template(
                messages_for_llm,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(llm_model.device)  # Use the device the model is actually on
            prompt_ids = encoded_prompt["input_ids"]

            with torch.no_grad():
                output_ids = llm_model.generate(
                    input_ids=prompt_ids,
                    attention_mask=encoded_prompt["attention_mask"],
                    max_new_tokens=MAX_NEW_TOKENS_LLM,  # From Cell 10
                    temperature=0.35,
                    top_p=0.9,
                    do_sample=True,
                    pad_token_id=llm_tokenizer.eos_token_id,
                    repetition_penalty=1.15,
                )

            # generate() returns prompt + completion; keep only the completion.
            response_start_index = prompt_ids.shape[1]
            generated_token_ids = output_ids[0][response_start_index:]
            response_text = llm_tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()

            if not response_text:
                return "The model generated an empty response."
            return response_text
        except Exception as e_gen:
            # Let memory exhaustion propagate: returning it as text would hide
            # it from the caller, which checks for exactly this condition.
            if "out of memory" in str(e_gen).lower():
                raise
            current_error_msg = f"ERROR during LLM generation: {e_gen}"
            print(current_error_msg)
            # import traceback; traceback.print_exc() # Uncomment for full traceback
            return f"ERROR in generation: {str(e_gen)[:200]}"

    # Wrap the generation function in a RunnableLambda so it can be used as a chain step.
    custom_llm_invoker = RunnableLambda(invoke_llm_and_decode_simple)
    print("Custom LLM invoker runnable with enhanced parameters defined successfully.")

else:
    # Prerequisite check failed: leave both runnables as None and explain why.
    print("Prerequisite variables (llm_model, llm_tokenizer, MAX_NEW_TOKENS_LLM, or DEVICE) from earlier cells not found, not loaded, or of incorrect type.")
    print("RAG components in Cell 13 were NOT created.")
    # Print status of each prerequisite to help debug
    print(f"  llm_model is loaded: {'llm_model' in locals() and llm_model is not None}")
    print(f"  llm_tokenizer is loaded: {'llm_tokenizer' in locals() and llm_tokenizer is not None}")
    print(f"  MAX_NEW_TOKENS_LLM is defined as int: {'MAX_NEW_TOKENS_LLM' in locals() and isinstance(MAX_NEW_TOKENS_LLM, int)}")
    print(f"  DEVICE is defined: {'DEVICE' in locals() and DEVICE is not None}")


# Final sanity check: report whether each runnable was created (runs on both branches).
print(f"\nStatus after Cell 13 execution attempts:")
print(f"  prepare_messages_runnable is defined: {prepare_messages_runnable is not None}")
print(f"  custom_llm_invoker is defined: {custom_llm_invoker is not None}")


--- Attempting to Define RAG Components in Cell 13 (Simplified & Enhanced) ---
LLM Model, Tokenizer, MAX_NEW_TOKENS_LLM, and DEVICE found. Proceeding...
Enhanced RAG message preparation runnable defined successfully.
Custom LLM invoker runnable with enhanced parameters defined successfully.

Status after Cell 13 execution attempts:
  prepare_messages_runnable is defined: True
  custom_llm_invoker is defined: True


### Cell 14: Construct and Test the RAG Chain with TinyLlama 


In [5]:
# Cell 14: Construct and Test the Final RAG Chain (Simplified)

# Helper function (ensure this is defined if not already in scope)
def format_retrieved_docs(docs: list) -> str:
    """Render retrieved documents as one newline-separated context string.

    Args:
        docs: Objects exposing a `metadata` mapping and a `page_content` string.

    Returns:
        One "Snippet from Page <page>:" header plus content per document, joined
        with newlines. The page falls back to 'N/A' when the metadata has no
        'page' entry; an empty input yields an empty string.
    """
    snippets = []
    for doc in docs:
        page_label = doc.metadata.get('page', 'N/A')
        snippets.append(f"Snippet from Page {page_label}:\n{doc.page_content}")
    return "\n".join(snippets)

rag_chain_final = None

# Check that all components from previous cells exist and are not None.
# The `in locals()` tests let this cell report a missing component instead of
# raising NameError, and the explicit `is not None` tests match what the
# diagnostics in the else-branch print.
rag_components_ready = (
    'db_retriever' in locals() and db_retriever is not None
    and 'prepare_messages_runnable' in locals() and prepare_messages_runnable is not None
    and 'custom_llm_invoker' in locals() and custom_llm_invoker is not None
)

if rag_components_ready:

    print("\nConstructing the final RAG chain with TinyLlama (Simplified)...")
    try:
        rag_chain_final = (
            { # Step 1: Prepare input for message formatting
                "context": RunnablePassthrough() | db_retriever | format_retrieved_docs,
                "question": RunnablePassthrough()
            }
            | prepare_messages_runnable # Step 2: Output: list of messages
            | custom_llm_invoker        # Step 3: Output: string (LLM response)
            | StrOutputParser()         # Step 4: Ensure output is a string
        )
    except Exception as e_chain_const:
        print(f"Error constructing the final RAG chain: {e_chain_const}")
        # import traceback; traceback.print_exc()
    else:
        # Only reached when construction succeeded, so the message above is
        # never printed for a failure that happened while testing.
        print("Final RAG chain constructed successfully!")

        # --- Testing Loop ---
        print("\n--- Testing the Final RAG Chain (Simplified) ---")
        test_questions_final_rag = [
            "What are prohibited AI practices according to the EU AI Act?",
            "How is 'AI system' defined in the act?",
            "What does the act say about transparency for high-risk systems?",
            "What is the airspeed velocity of an unladen swallow?"  # Off-topic probe for the refusal path
        ]
        for i, user_query in enumerate(test_questions_final_rag):
            print(f"\n--- RAG Test Question {i+1}: {user_query} ---")
            if DEVICE == "cuda" and i > 0:
                # Release memory left over from the previous question: collect
                # garbage first, then return the freed blocks to the driver.
                gc.collect()
                torch.cuda.empty_cache()
            try:
                llm_response = rag_chain_final.invoke(user_query)
                print(f"TinyLlama RAG Chain Response:\n'{llm_response}'")
            except Exception as e_invoke:
                print(f"Error during RAG chain invocation: {e_invoke}")
                # import traceback; traceback.print_exc() # Uncomment for full trace
                if "out of memory" in str(e_invoke).lower():
                    # Later questions would fail the same way, so stop here.
                    print("CUDA Out of Memory. Halting.")
                    break

        # TODO: final cleanup was left as a placeholder in this cell; add it here if needed.
else:
    print("\nCannot construct final RAG chain due to missing components. Check status from Cell 13 output:")
    print(f"  db_retriever is OK: {'db_retriever' in locals() and db_retriever is not None}")
    print(f"  prepare_messages_runnable is OK: {'prepare_messages_runnable' in locals() and prepare_messages_runnable is not None}")
    print(f"  custom_llm_invoker is OK: {'custom_llm_invoker' in locals() and custom_llm_invoker is not None}")


Constructing the final RAG chain with TinyLlama (Simplified)...
Final RAG chain constructed successfully!

--- Testing the Final RAG Chain (Simplified) ---

--- RAG Test Question 1: What are prohibited AI practices according to the EU AI Act? ---
TinyLlama RAG Chain Response:
'According to the EU AI Act, the prohibited AI practices include the placing on the market, the putting into service or the use of an artificial intelligence (AI) system that uses subliminal techniques or manipulates behavior in a way that materially distorts the abilities of individuals or groups. These practices aim to manipulate or deceive people or groups for the sake of achieving specific goals, such as material gains or political power. Therefore, these practices violate the principles of fairness, transparency, and consent required by the act.'

--- RAG Test Question 2: How is 'AI system' defined in the act? ---
TinyLlama RAG Chain Response:
'In the European Union AI Act, "AI system" is defined as a machin