In [None]:
# RAG_System.ipynb

# ============================
# 1. Install Required Packages
# ============================
# You might already have some or all of these. If so, you can skip or comment them out.
# %pip install langchain transformers chromadb sentence-transformers accelerate bitsandbytes  # etc.

import os
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

import shutil
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import gc



gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ============================
# 2. Configuration
# ============================
# Path to data folder
DATA_PATH = "../data" 

# Choose an embedding model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Choose a local LLM model.
LLM_MODEL_ID = "tiiuae/falcon-7b-instruct"

retriever_top_k  = 4
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100



In [None]:
# Classify files in the folder

files_txt_path = []
files_csv_path = []

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith('.txt'):
            files_txt_path.append(os.path.join(root, file))
        elif file.endswith('.csv'):
            files_csv_path.append(os.path.join(root, file))




In [None]:

# ============================
# 2. Load Files with Different Strategies
# ============================
all_documents = []

# Load all files in the directory
for file_path in files_txt_path:
    loader = TextLoader(file_path, encoding="utf-8")
    doc = loader.load()  # Load entire file as one document
    all_documents.append(Document(page_content=doc[0].page_content, metadata={"source": file_path}))

for file_path in files_csv_path:
    df = pd.read_csv(file_path)
    filename = os.path.basename(file_path)
    for index, row in df.iterrows():
        row_text = f"{filename} | " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        metadata = {"source": filename, "row_id": index}
        all_documents.append(Document(page_content=row_text, metadata=metadata))


## OPTIOANL function for processing files row by row
#     # ✅ Load row by row (structured data)
#     if filename in ROW_BY_ROW_FILES:
#         with open(file_path, "r", encoding="utf-8") as file:
#             for row_id, line in enumerate(file):
#                 line = line.strip()
#                 if line:  # Ignore empty lines
#                     all_documents.append(Document(page_content=line, metadata={"source": filename, "row_id": row_id}))


print(f"Loaded {len(all_documents)} raw documents from {len(os.listdir(DATA_PATH))} files.")

# ============================
# 3. Split Longer Documents for Better Retrieval
# ============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)

split_documents = []
for doc in all_documents:
    chunks = text_splitter.split_text(doc.page_content)  # Split if needed
    for chunk in chunks:
        split_documents.append(Document(page_content=chunk, metadata=doc.metadata))

print(f"Total {len(split_documents)} final chunks prepared for vector storage.")


Loaded 1231 raw documents from 1 files.
Total 2827 final chunks prepared for vector storage.


In [None]:

# ============================
# 4. Create Embeddings
# ============================
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embeddings loaded successfully.")

# ============================
# 5. Manage Vector Store
# ============================
persist_directory = "chroma_db"

# Check if the vector store exists and delete it if necessary
if os.path.exists(persist_directory):
    print("Vector store exists. Deleting existing database...")
    shutil.rmtree(persist_directory)  # Deletes the existing database folder

# Recreate the vector store
vectorstore = Chroma.from_documents(
    documents=split_documents,
    embedding=embeddings,
    persist_directory=persist_directory
)

vectorstore.persist()
print("Vector store recreated and persisted.")



  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


Embeddings loaded successfully.
Vector store recreated and persisted.


  vectorstore.persist()


In [8]:

# ============================
# 6. Set Up the LLM (Falcon 7B Instruct)
# ============================
# Load the tokenizer and model
print("Loading Falcon 7B Instruct model; this may take some time...")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="mps",           # automatically place model layers on available GPU
    trust_remote_code=True
)


Loading Falcon 7B Instruct model; this may take some time...




Loading checkpoint shards: 100%|██████████| 2/2 [00:25<00:00, 12.82s/it]


In [None]:
# Create a text-generation pipeline
pipeline_llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.2,       # Lower temperature for more factual answers
    top_p=0.9,
    repetition_penalty=1.2,
)

# Wrap the pipeline in a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipeline_llm)


# Customized Prompt

QA_Prompt = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU). 
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer. 
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after? 
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980? 
Answer: ICML
Question: What musical artist is performing at PPG Arena on October 13? 
Answer: Billie Eilish

- Please answer the following question based on the provided context, the information in the example above might not be relevant to the current context.

Context: \n\n {context} \n\n
Question: {question} \n\n
Answer:
"""

custom_prompt = PromptTemplate(template=QA_Prompt, input_variables=["context", "question"])


# ============================
# 7. Create the RetrievalQA Chain
# ============================
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_top_k})


def ask_question(query: str):
    """
    Run a query through the RAG pipeline and return the generated answer along with the source documents.
    
    Args:
        query (str): The user’s question.

    Returns:
        answer (str): The generated answer.
        sources (list): List of retrieved documents used to generate the answer.
    """
    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)
    
    # Extract text from retrieved documents
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])

    # Format the input using the QA_Prompt
    formatted_prompt = QA_Prompt.format(context=context, question=query)

    # Generate response using the LLM
    result = llm(formatted_prompt)  # Pass the fully formatted input

    # Extract answer and sources
    answer = result.strip()  # Ensure clean output
    return answer, retrieved_docs  # Return both answer and retrieved documents


Device set to use mps
  llm = HuggingFacePipeline(pipeline=pipeline_llm)


In [None]:
# Example:
user_question = "What time will Kimberly Akimbo take place?"
user_question = "Which events are taking place at the Carnegie of Homestead Music Hall?"
user_question = "When is The Way Of Tea: Ceremony And Recital taking place?"

answer, sources = ask_question(user_question)

# print("Question:", user_question)
print(answer)
print("\nSources used:")
for i, doc in enumerate(sources):
    print(f"[Source {i+1}] {doc.metadata.get('source', 'Unknown source')}")


  retrieved_docs = retriever.get_relevant_documents(query)
  result = llm(formatted_prompt)  # Pass the fully formatted input
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Question: When is The Way Of Tea: Ceremony And Recital taking place?
Answer: You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU). 
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer. 
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after? 
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980? 
Answer: ICM