In [1]:
# RAG_System.ipynb

# ============================
# 1. Install Required Packages
# ============================
# You might already have some or all of these. If so, you can skip or comment them out.
# %pip install langchain transformers chromadb sentence-transformers accelerate bitsandbytes  # etc.

import os
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

import shutil
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import gc
from tqdm import tqdm, trange


gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "mps" 

In [None]:
# ============================
# 2. Configuration
# ============================
# Path to data folder
TEXT_DATA_PATH = ["../data/zianp", "../data/dunhanj"] 
ROW_EVENT_PATH = ['../data/nicolaw']
STATIC_WEB_CSV_PATH = '../data/texts_urls_filtered.csv'
custom_cache_dir = "/mnt/new_volume"

# Choose an embedding model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_INPUT = "qwq"
# Choose a local LLM model.
# LLM_MODEL_ID = "tiiuae/falcon-7b-instruct"
# LLM_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
LLM_MODEL_MAP = {"falcon": "tiiuae/falcon-7b-instruct"
                ,"llama3": "meta-llama/Llama-3.1-8B-Instruct"
                ,"deepseek-r1": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
                ,"phi-4": "unsloth/phi-4-bnb-4bit"
                ,"qwen2": "Qwen/Qwen2.5-7B-Instruct"
                ,"qwq":"Qwen/QwQ-32B"}

LLM_MODEL_ID = LLM_MODEL_MAP[LLM_INPUT]
# LLM_MODEL_ID = "unsloth/phi-4-bnb-4bit"

LLM_NAME = LLM_MODEL_ID.split("/")[-1]
data_file = "qa400"
test_data_path ="../annotations/{}.csv".format(data_file)

retriever_top_k = 4
CHUNK_SIZE = 512  
CHUNK_OVERLAP = 100
RELOAD_VECTORS_DB = False



In [4]:
# Classify files in the folder

files_txt_path = []
files_csv_path = []
files_event_path = []

for DATA_PATH in TEXT_DATA_PATH:
    for root, dirs, files in os.walk(DATA_PATH):
        for file in files:
            if file.endswith('.txt'):
                files_txt_path.append(os.path.join(root, file))
            elif file.endswith('.csv'):
                files_csv_path.append(os.path.join(root, file))

for DATA_PATH in ROW_EVENT_PATH:
    for root, dirs, files in os.walk(DATA_PATH):
        for file in files:
            if file.endswith('.txt'):
                files_event_path.append(os.path.join(root, file))




In [5]:

# ============================
# 2. Load Files with Different Strategies
# ============================
all_documents = []

# Load Dunhan CSV
test_df = pd.read_csv(STATIC_WEB_CSV_PATH)
for index, row in test_df.iterrows():
    
    all_documents.append(Document(page_content=row['TEXT'], metadata={"source": row['URL']}))

# Load all files in the directory
for file_path in files_txt_path:
    loader = TextLoader(file_path, encoding="utf-8")
    doc = loader.load()  # Load entire file as one document
    all_documents.append(Document(page_content=doc[0].page_content, metadata={"source": file_path}))

for file_path in files_csv_path:
    df = pd.read_csv(file_path)
    filename = os.path.basename(file_path)
    for index, row in df.iterrows():
        row_text = f"{filename} | " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        metadata = {"source": filename, "row_id": index}
        all_documents.append(Document(page_content=row_text, metadata=metadata))


# OPTIOANL function for processing files row by row
    # ✅ Load row by row (structured data)
for file_path in files_event_path:
    with open(file_path, "r", encoding="utf-8") as file:
        for row_id, line in enumerate(file):
            line = line.strip()
            if line:  # Ignore empty lines
                all_documents.append(Document(page_content=line, metadata={"source": filename, "row_id": row_id}))


print(f"Loaded {len(all_documents)} raw documents from {len(os.listdir(DATA_PATH))} files.")

# ============================
# 3. Split Longer Documents for Better Retrieval
# ============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)

split_documents = []
for doc in all_documents:
    chunks = text_splitter.split_text(doc.page_content)  # Split if needed
    for chunk in chunks:
        split_documents.append(Document(page_content=chunk, metadata=doc.metadata))

print(f"Total {len(split_documents)} final chunks prepared for vector storage.")


Loaded 8540 raw documents from 13 files.
Total 99184 final chunks prepared for vector storage.


In [6]:

# ============================
# 4. Create Embeddings
# ============================
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, cache_folder = custom_cache_dir)
print("Embeddings loaded successfully.")

# ============================
# 5. Manage Vector Store
# ============================
persist_directory = "chroma_db"

# Check if the vector store exists and delete it if necessary
if RELOAD_VECTORS_DB:

    if os.path.exists(persist_directory):
        print("Vector store exists. Deleting existing database...")
        shutil.rmtree(persist_directory)  # Deletes the existing database folder

    # Recreate the vector store
    vectorstore = Chroma.from_documents(
        documents=split_documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
else:
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Local Vector store loaded successfully.")

vectorstore.persist()
print("Vector store recreated and persisted.")



  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, cache_folder = custom_cache_dir)


Embeddings loaded successfully.


  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


Local Vector store loaded successfully.
Vector store recreated and persisted.


  vectorstore.persist()


In [7]:

# ============================
# 6. Set Up the LLM (Falcon 7B Instruct)
# ============================
# Load the tokenizer and model
print(f"Loading {LLM_MODEL_ID}; this may take some time...")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True, cache_dir=custom_cache_dir)
tokenizer.pad_token = tokenizer.eos_token  
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    torch_dtype=torch.float16,
    device_map= device,           # automatically place model layers on available GPU
    trust_remote_code=True,
    cache_dir=custom_cache_dir
)


Loading meta-llama/Llama-3.1-8B-Instruct; this may take some time...


Loading checkpoint shards: 100%|██████████| 4/4 [02:01<00:00, 30.39s/it]


In [8]:
# Create a text-generation pipeline
pipeline_llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature= 0.1,       # Lower temperature for more factual answers
    top_p=0.9,
    repetition_penalty=1.2,
    do_sample=True,
)

# Wrap the pipeline in a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipeline_llm)


# Customized Prompt

QA_Prompt = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU). 
Use the retrieved context to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer. 

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Retrieved Context:
---
{context}
---

Examples:

Question: In less than 5 words, Who is Pittsburgh named after? 
Answer: William Pitt \n
Question: In less than 5 words, What famous machine learning venue had its first conference in Pittsburgh in 1980? 
Answer: ICML \n
Question: In less than 5 words, What musical artist is performing at PPG Arena on October 13? 
Answer: Billie Eilish \n

Now it's your turn. Please answer the following question based on the above context. Remember to answer as short as possible. 

Question: In less than 5 words, {question} \n\n
Answer:
"""

custom_prompt = PromptTemplate(template=QA_Prompt, input_variables=["context", "question"])


# ============================
# 7. Create the RetrievalQA Chain
# ============================
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_top_k})


def ask_question(query: str):
    """
    Run a query through the RAG pipeline and return the generated answer along with the source documents.
    
    Args:
        query (str): The user’s question.

    Returns:
        answer (str): The generated answer.
        sources (list): List of retrieved documents used to generate the answer.
    """
    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)
    # print(f"Retrieved {len(retrieved_docs)} documents.")
    
    # Extract text from retrieved documents
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])
    # print(f"Context length: {len(context)} characters.")
    # print('------ START CONTEXT ------')
    # print(context)
    # print('------ END CONTEXT ------')

    # Format the input using the QA_Prompt
    formatted_prompt = QA_Prompt.format(context=context, question=query)
    
    # Generate response using the LLM
    result = llm(formatted_prompt)  # Pass the fully formatted input
    answer = result.replace(formatted_prompt, "").strip()
    # Extract answer and sources
    answer = answer.strip()  # Ensure clean output
    return answer, retrieved_docs  # Return both answer and retrieved documents


Device set to use cuda
  llm = HuggingFacePipeline(pipeline=pipeline_llm)


In [9]:
# QA_Prompt.format(context='d', question='2')

In [10]:
df = pd.read_csv(test_data_path)

In [11]:
answers = []
sources = []

full = df.shape[0]
subset = 20
for i in trange(subset):
    row = df.iloc[i]
    answer, retrieved_docs = ask_question(row['question'])
    print(answer)
    answer = answer.split('\n')[0]
    answers.append(answer)
    sources.append(retrieved_docs)

df_ans = pd.DataFrame({'question': df['question'][:subset], 'answer': answers, 'reference_answer': df['reference_answer'][:subset], 'source': sources})


  retrieved_docs = retriever.get_relevant_documents(query)
  result = llm(formatted_prompt)  # Pass the fully formatted input
  5%|▌         | 1/20 [00:03<01:02,  3.29s/it]

William Pitt.  Source: 1758 letter to Pitt.  Named by General John Forbes.


 10%|█         | 2/20 [00:04<00:34,  1.92s/it]

1900 | 1967 (merger) | 1913 (partially related institute)


 15%|█▌        | 3/20 [00:04<00:21,  1.24s/it]

Smithfield Street Bridge.


 20%|██        | 4/20 [00:05<00:18,  1.15s/it]

Luis von Ahn  | No mention of this information in the text. However, I can tell


 25%|██▌       | 5/20 [00:06<00:13,  1.14it/s]

Matt Light.


 30%|███       | 6/20 [00:06<00:10,  1.40it/s]

Downtown Cultural District.


 35%|███▌      | 7/20 [00:06<00:07,  1.71it/s]

Kevin McMahon.


 40%|████      | 8/20 [00:07<00:08,  1.38it/s]

Individual Giving Team  --- OR ---   Planned Giving Team  --- OR ---   Legacy Society Team


 45%|████▌     | 9/20 [00:08<00:06,  1.61it/s]

Please contact us.


 50%|█████     | 10/20 [00:09<00:07,  1.35it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Gift Illustrator  -OR-  Planner Library  -OR-  Gift Illustrator Your Planning Library


 55%|█████▌    | 11/20 [00:09<00:06,  1.49it/s]

Steelers Hall of Honor Museum.


 60%|██████    | 12/20 [00:10<00:04,  1.66it/s]

Acrisure Stadium.


 65%|██████▌   | 13/20 [00:11<00:04,  1.46it/s]

Roberto Clemente  --- end of response --- 





--- end of conversation ---


 70%|███████   | 14/20 [00:11<00:03,  1.75it/s]

Penn Brewery.


 75%|███████▌  | 15/20 [00:11<00:02,  1.84it/s]

Pittsburgh Neighborhood Trail.


 80%|████████  | 16/20 [00:12<00:02,  1.91it/s]

Congregation Beth Shalom.


 85%|████████▌ | 17/20 [00:13<00:01,  1.53it/s]

None mentioned.  However, I can tell you that there isn't any information regarding the hotel partners


 90%|█████████ | 18/20 [00:14<00:01,  1.32it/s]

No information available in this context. However, according to my training data, the founder of the Pittsburgh


 95%|█████████▌| 19/20 [00:14<00:00,  1.37it/s]

Big Nosh 

Let me know if I'm right!


100%|██████████| 20/20 [00:15<00:00,  1.26it/s]

Jewish Federation of Greater Pittsburgh.  The Steel Tree Fund.  OR   Both options are acceptable





In [19]:
df_ans.to_csv(f'../results/test_{data_file}_{LLM_NAME}_ck{CHUNK_SIZE}_ckolap{CHUNK_OVERLAP}_retop{retriever_top_k}.csv', index=False)

In [None]:
# df_ans.to_csv('../results/test_30.csv', index=False)

In [None]:
# # Example:
# user_question = "In few words, what time will Kimberly Akimbo take place?"
# user_question = "Which bridge should drivers use as an alternate route to avoid congestion at I-279 Northbound Exit 1B on event days?"

# #"question": "What is the total expenditure forecast for the City of Pittsburgh in 2024?",
# # "answer": "$684,553,037"
# question_list = [("What is the total expenditure forecast for the City of Pittsburgh in 2024?","684,553,037")
#                  ,("Which department has the highest budget allocation in 2024?","Finance, with a budget of $190,821,098.")]
# # question_list = [('When was Carnegie Technical Schools founded?', '1900')]

# for question, ref_ans in question_list:
#     user_question = question
#     print("Question:", user_question)
#     answer, sources = ask_question(user_question)
#     print("Generated Answer:", answer)
#     print("Reference Answer:", ref_ans)

#     for i, doc in enumerate(sources):
#         print(f"[Source {i+1}] {doc.metadata.get('source', 'Unknown source')}")
    
#     print("\n\n")

# # print("Question:", user_question)
# # print(answer)
# # print("\nSources used:")
# # for i, doc in enumerate(sources):
# #     print(f"[Source {i+1}] {doc.metadata.get('source', 'Unknown source')}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Question: What is the total expenditure forecast for the City of Pittsburgh in 2024?
Generated Answer: $ 1,097,536,446
Reference Answer: 684,553,037
[Source 1] ../data/dunhanj/2024_operating_budget.txt
[Source 2] ../data/dunhanj/2024_operating_budget.txt
[Source 3] ../data/dunhanj/2024_operating_budget.txt
[Source 4] ../data/dunhanj/2024_operating_budget.txt



Question: Which department has the highest budget allocation in 2024?
Generated Answer: Department of Law
Reference Answer: Finance, with a budget of $190,821,098.
[Source 1] ../data/dunhanj/2024_operating_budget.txt
[Source 2] ../data/dunhanj/2024_operating_budget.txt
[Source 3] ../data/dunhanj/2024_operating_budget.txt
[Source 4] ../data/dunhanj/2024_operating_budget.txt





In [None]:
# df2 = pd.read_csv('../results/test_1000_new.csv')

In [None]:
# df2

Unnamed: 0,question,answer,reference_answer,source
0,Who is Pittsburgh named after?,William Pitt,William Pitt,[Document(metadata={'source': 'https://web.arc...
1,What year was Carnegie Mellon University founded?,<|repo_name|>jamesr66/qa-pittsburgh-cmu<|file,1900,[Document(metadata={'source': '../data/zianp/w...
2,Which bridge in Pittsburgh is famously yellow?,Fort Duquesne Bridge,Roberto Clemente Bridge,[Document(metadata={'source': 'https://trustar...
3,Which famous AI professor at CMU co-founded Du...,Luis von Ahn,Luis von Ahn,[Document(metadata={'source': 'https://www.cmu...
4,Who hosts the Burgh Bus comedy tour in Pittsbu...,Matt Light,Matt Light.,[Document(metadata={'source': 'https://downtow...
...,...,...,...,...
430,What is the name of the city that hosts the Ci...,Western Pennsylvania## Question ##,Pittsburgh,[Document(metadata={'source': '../data/zianp/w...
431,What is the name of the city that is home to t...,Pittsburgh,Pittsburgh,[Document(metadata={'source': '../data/zianp/w...
432,What is the name of the famous university in P...,Carnegie Mellon University,Carnegie Mellon University,[Document(metadata={'source': 'https://kids.br...
433,What is the name of the famous comedy tour in ...,Pittsburgh Improv Theatre,Burgh Bus,[Document(metadata={'source': 'https://downtow...
