In [1]:
# RAG_System.ipynb

# ============================
# 1. Install Required Packages
# ============================
# You might already have some or all of these. If so, you can skip or comment them out.
# %pip install langchain transformers chromadb sentence-transformers accelerate bitsandbytes  # etc.

import os
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

import shutil
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import gc



gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================
# 2. Configuration
# ============================
# Path to data folder
DATA_PATH = "../data/zianp" 

# Choose an embedding model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Choose a local LLM model.
# LLM_MODEL_ID = "tiiuae/falcon-7b-instruct"
LLM_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

retriever_top_k  = 4
CHUNK_SIZE = 512  
CHUNK_OVERLAP = 100
RELOAD_VECTORS_DB = False



In [3]:
# Classify files in the folder

files_txt_path = []
files_csv_path = []

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith('.txt'):
            files_txt_path.append(os.path.join(root, file))
        elif file.endswith('.csv'):
            files_csv_path.append(os.path.join(root, file))




In [4]:
test_df = pd.read_csv('../data/texts_urls_filtered.csv')

In [5]:
file_event_path = ['/Users/panzian/Desktop/11711-anlp/project/anlp-spring2025-hw2/data/zianp/pitts_event/pittsburgh_events.txt1']

In [6]:
# # iter
# all_documents = []


In [7]:

# ============================
# 2. Load Files with Different Strategies
# ============================
all_documents = []

# Load Dunhan CSV
for index, row in test_df.iterrows():
    all_documents.append(Document(page_content=row['TEXT'], metadata={"source": row['URL']}))

# Load all files in the directory
for file_path in files_txt_path:
    loader = TextLoader(file_path, encoding="utf-8")
    doc = loader.load()  # Load entire file as one document
    all_documents.append(Document(page_content=doc[0].page_content, metadata={"source": file_path}))

for file_path in files_csv_path:
    df = pd.read_csv(file_path)
    filename = os.path.basename(file_path)
    for index, row in df.iterrows():
        row_text = f"{filename} | " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        metadata = {"source": filename, "row_id": index}
        all_documents.append(Document(page_content=row_text, metadata=metadata))


# OPTIOANL function for processing files row by row
    # ✅ Load row by row (structured data)
for file_path in file_event_path:
    with open(file_path, "r", encoding="utf-8") as file:
        for row_id, line in enumerate(file):
            line = line.strip()
            if line:  # Ignore empty lines
                all_documents.append(Document(page_content=line, metadata={"source": filename, "row_id": row_id}))


print(f"Loaded {len(all_documents)} raw documents from {len(os.listdir(DATA_PATH))} files.")

# ============================
# 3. Split Longer Documents for Better Retrieval
# ============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", " ", ""]
)

split_documents = []
for doc in all_documents:
    chunks = text_splitter.split_text(doc.page_content)  # Split if needed
    for chunk in chunks:
        split_documents.append(Document(page_content=chunk, metadata=doc.metadata))

print(f"Total {len(split_documents)} final chunks prepared for vector storage.")


Loaded 7978 raw documents from 6 files.
Total 88614 final chunks prepared for vector storage.


In [8]:

# ============================
# 4. Create Embeddings
# ============================
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embeddings loaded successfully.")

# ============================
# 5. Manage Vector Store
# ============================
persist_directory = "chroma_db"

# Check if the vector store exists and delete it if necessary
if RELOAD_VECTORS_DB:

    if os.path.exists(persist_directory):
        print("Vector store exists. Deleting existing database...")
        shutil.rmtree(persist_directory)  # Deletes the existing database folder

    # Recreate the vector store
    vectorstore = Chroma.from_documents(
        documents=split_documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
else:
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Local Vector store loaded successfully.")

vectorstore.persist()
print("Vector store recreated and persisted.")



  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


Embeddings loaded successfully.


  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


Local Vector store loaded successfully.
Vector store recreated and persisted.


  vectorstore.persist()


In [9]:

# ============================
# 6. Set Up the LLM (Falcon 7B Instruct)
# ============================
# Load the tokenizer and model
print(f"Loading {LLM_MODEL_ID}; this may take some time...")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    torch_dtype=torch.float16,
    device_map="mps",           # automatically place model layers on available GPU
    trust_remote_code=True
)


Loading meta-llama/Llama-3.1-8B-Instruct; this may take some time...


Loading checkpoint shards: 100%|██████████| 4/4 [00:34<00:00,  8.73s/it]


In [10]:
# Create a text-generation pipeline
pipeline_llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    temperature=0.1,       # Lower temperature for more factual answers
    top_p=0.9,
    repetition_penalty=1.2,
)

# Wrap the pipeline in a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipeline_llm)


# Customized Prompt

QA_Prompt = """
You are an expert assistant answering factual questions about Pittsburgh or Carnegie Mellon University (CMU). 
Use the retrieved information to give a detailed and helpful answer. If the provided context does not contain the answer, leverage your pretraining knowledge to provide the correct answer. 
If you truly do not know, just say "I don't know."

Important Instructions:
- Answer concisely without repeating the question.
- Use the provided context if relevant; otherwise, rely on your pretraining knowledge.
- Do **not** use complete sentences. Provide only the word, name, date, or phrase that directly answers the question. For example, given the question "When was Carnegie Mellon University founded?", you should only answer "1900".

Examples:
Question: Who is Pittsburgh named after? 
Answer: William Pitt
Question: What famous machine learning venue had its first conference in Pittsburgh in 1980? 
Answer: ICML
Question: What musical artist is performing at PPG Arena on October 13? 
Answer: Billie Eilish

Now it's your turn. Please answer the following question based on the provided context, the information in the example above might not be relevant to the current context. Remember to answer concisely and directly. Do not provide any additional explanations. Please do not generate other questions.

Context: \n\n {context} \n\n
Question: {question} \n\n
Answer:
"""

custom_prompt = PromptTemplate(template=QA_Prompt, input_variables=["context", "question"])


# ============================
# 7. Create the RetrievalQA Chain
# ============================
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_top_k})


def ask_question(query: str):
    """
    Run a query through the RAG pipeline and return the generated answer along with the source documents.
    
    Args:
        query (str): The user’s question.

    Returns:
        answer (str): The generated answer.
        sources (list): List of retrieved documents used to generate the answer.
    """
    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)
    print(f"Retrieved {len(retrieved_docs)} documents.")
    
    # Extract text from retrieved documents
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])
    print(f"Context length: {len(context)} characters.")
    print('------ START CONTEXT ------')
    print(context)
    print('------ END CONTEXT ------')

    # Format the input using the QA_Prompt
    formatted_prompt = QA_Prompt.format(context=context, question=query)
    
    # Generate response using the LLM
    result = llm(formatted_prompt)  # Pass the fully formatted input
    answer = result.replace(formatted_prompt, "").strip()
    # Extract answer and sources
    answer = answer.strip()  # Ensure clean output
    return answer, retrieved_docs  # Return both answer and retrieved documents


Device set to use mps
  llm = HuggingFacePipeline(pipeline=pipeline_llm)


In [11]:
# QA_Prompt.format(context='d', question='2')

In [12]:
# Example:
user_question = "In few words, what time will Kimberly Akimbo take place?"
# user_question = "Which events are taking place at the Carnegie of Homestead Music Hall?"
# user_question = "When is The Way Of Tea: Ceremony And Recital taking place?"
# user_question = "Can you give me a vendor of picklesburgh event?"
# user_question = "What is Pittsburgh's population in 1761?"
# user_question = "What is the name of the stadium in Pittsburgh where the Steelers and Pitt Panthers play?"
user_question = "Which bridge should drivers use as an alternate route to avoid congestion at I-279 Northbound Exit 1B on event days?"

question_list = [
    ("What is the name of the stadium in Pittsburgh where the Steelers and Pitt Panthers play?", "Acrisure Stadium"),
    ("What is the name of the new parking garage located between Acrisure Stadium and PNC Park?", "Champions Parking Garage"),
    ("Which mobile app is recommended for booking parking in advance at Acrisure Stadium?", "SpotHero"),
    ("What policy is enforced inside the Champions Garage regarding tailgating?", "No tailgating is allowed inside the Champions Garage"),
    ("What is the best route for fans walking from downtown Pittsburgh to Acrisure Stadium due to sidewalk closures at PNC Park?", "The River Walk"),
    ("Which navigation app has partnered with Acrisure Stadium to provide parking directions?", "Waze"),
    ("What major event besides football games is hosted at Acrisure Stadium and includes a schedule for ribs?", "Kickoff and Rib Festival"),
    ("Which bridge should drivers use as an alternate route to avoid congestion at I-279 Northbound Exit 1B on event days?", "West End Bridge"),
    ("Which two sports teams play home games at Acrisure Stadium?", "Pittsburgh Steelers and Pitt Panthers"),
    ("What is the policy regarding parking pass delivery for the Champions Garage?", "Parking passes are mobile delivery only")
]
# question_list = [('When was Carnegie Technical Schools founded?', '1900')]

for question, ref_ans in question_list:
    user_question = question
    print("Question:", user_question)
    answer, sources = ask_question(user_question)
    print("Generated Answer:", answer)
    print("Reference Answer:", ref_ans)

    for i, doc in enumerate(sources):
        print(f"[Source {i+1}] {doc.metadata.get('source', 'Unknown source')}")
    
    print("\n\n")

# print("Question:", user_question)
# print(answer)
# print("\nSources used:")
# for i, doc in enumerate(sources):
#     print(f"[Source {i+1}] {doc.metadata.get('source', 'Unknown source')}")


  retrieved_docs = retriever.get_relevant_documents(query)


Question: What is the name of the stadium in Pittsburgh where the Steelers and Pitt Panthers play?


  result = llm(formatted_prompt)  # Pass the fully formatted input
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved 4 documents.
Context length: 1565 characters.
------ START CONTEXT ------
Stadium stadium, Pittsburgh, Pennsylvania, United States Ask the Chatbot a Question Also known as: Heinz Field Learn about this topic in these articles: Heinz Company Pittsburgh In Pittsburgh: The contemporary city …city’s professional baseball team, and Acrisure Stadium houses the Steelers, its professional football team. The Penguins, Pittsburgh’s professional ice hockey team, plays at PPG Paints Arena. Popular summertime attractions include riverboat excursions on Pittsburgh’s waterways and Kennywood,

date:OCT 11 2025  month:OCT  day:11  year:2025  time:1:00 PM SAT  week:SAT  venue:NC State Wolfpack at Pittsburgh Panthers Football Acrisure Stadium (formerly Heinz Field) 15212, Pittsburgh, Pennsylvania, US  date-desc:Acrisure Stadium (formerly Heinz Field)  location:15212, Pittsburgh, Pennsylvania, US  from-price:Prices from $123

date:SEP 05 2025  month:SEP  day:05  year:2025  time:11:59 PM FRI  wee

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved 4 documents.
Context length: 2032 characters.
------ START CONTEXT ------
have partnered on a new parking garage between Acrisure Stadium and PNC Park. Adding over 400 new spaces to the North Shore, the Champions Garage is located at the corner of West General Robinson Street and Tony Dorsett Drive behind the Hyatt Place Hotel. Important Details Before making your purchase request, please keep the following in mind: Parking passes for this lot are mobile delivery only. Your pass will be scanned upon entry to the Champions Garage. No Tailgating will be allowed inside the

Game: Fans walking from downtown should use the River Walk to avoid sidewalk closures at PNC Park. Parking We recommend booking convenient and affordable parking in advance through SpotHero, the nation's leading parking reservations app. To reserve your parking spot, visit the Acrisure Stadium SpotHero Parking Page . Parking Champions Parking Garage The Steelers, Pirates and Oxford Development are pleased to 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: SpotHero
Reference Answer: SpotHero
[Source 1] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 2] https://www.mlb.com/pirates/apps/ballpark
[Source 3] https://www.pittsburghpa.gov/safety/alerts/public-safety-press-release-archive
[Source 4] https://www.dallascowboys.com



Question: What policy is enforced inside the Champions Garage regarding tailgating?
Retrieved 4 documents.
Context length: 2038 characters.
------ START CONTEXT ------
pass will be scanned upon entry to the Champions Garage. No Tailgating will be allowed inside the Champions Garage. Entry into the Champions Garage is from West General Robinson Street and North Shore Drive for easy access from all directions. If you purchased a full-season parking package and any or all postseason games are not played at home at Acrisure Stadium this season, all payments for those parking passes will be credited towards your next season parking pass. Purchase Single Event Parking Purchase

&

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: No Tailgating will be allowed inside the Champions Garage.  #1



#2
A
Reference Answer: No tailgating is allowed inside the Champions Garage
[Source 1] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 2] https://www.pittsburghpa.gov/recreation-events/about-parks/parks-rules
[Source 3] https://www.pittsburghpa.gov/recreation-events/park-permits/sports-field-permit
[Source 4] https://acrisurestadium.com/stadium/parking-directions#directions



Question: What is the best route for fans walking from downtown Pittsburgh to Acrisure Stadium due to sidewalk closures at PNC Park?
Retrieved 4 documents.
Context length: 2032 characters.
------ START CONTEXT ------
Game: Fans walking from downtown should use the River Walk to avoid sidewalk closures at PNC Park. Parking We recommend booking convenient and affordable parking in advance through SpotHero, the nation's leading parking reservations app. To reserve your parking spot, visit the Acrisure Stadiu

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: River Walk  #2)   #3)    Riverwalk  RiverWalk  River Walk
Reference Answer: The River Walk
[Source 1] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 2] https://www.mlb.com/pirates/ballpark/tours
[Source 3] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 4] https://www.mlb.com/pirates/ballpark/tours



Question: Which navigation app has partnered with Acrisure Stadium to provide parking directions?
Retrieved 4 documents.
Context length: 2031 characters.
------ START CONTEXT ------
please call the PNC Park Hotline at 412-325-4700 or e-mail us at pncparktours@pirates.com . PARKING printer-2 Download and Print (PDF) The Pirates have teamed up with Waze, a community-based traffic and navigation app, to provide you with the most efficient routes to and from the ballpark. Waze is the only GPS program that (thanks to our partnership) has PNC Park parking lots, traffic patterns, and street closures incorporated, providing fa

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: Waze  #Pittsburgh #CarnegieMellonUniversity #ExpertAssistant #F
Reference Answer: Waze
[Source 1] https://www.mlb.com/pirates/ballpark/tours
[Source 2] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 3] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 4] https://acrisurestadium.com/stadium/parking-directions#directions



Question: What major event besides football games is hosted at Acrisure Stadium and includes a schedule for ribs?
Retrieved 4 documents.
Context length: 2038 characters.
------ START CONTEXT ------
Rib Fest Ribs Book a Private Event Audio / Visual Services Contact Us Event Spaces UPMC Club West Club PNC Champions Club FedEx Great Hall North Club Press Box Dining Room Ford Fan Zone Food and Beverage Event FAQs Guest Services Clear Bag Policy A-Z Guide Fan Guide Code of Conduct Security Guidelines Disability Services Evacuation Guide Payment Methods Book a Private Event As Pittsburgh’s most unique even

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Retrieved 4 documents.
Context length: 1896 characters.
------ START CONTEXT ------
on your lot specific route. Avoid I-279 Northbound Exit 1B Congestion: Use North Shore Alternate Route – Exit 1D Use West End Bridge (Parkway East) Use West End Bridge (Parkway West) Avoid I-279 Southbound Exit 1B Congestion: Use North Shore Alternate Route – Exit 2B © 2025 Acrisure Stadium | Privacy Policy | Terms of Use Team Member Login Facebook Twitter Instagram

via I-376 West. Take Exit 70C to North Shore ramp / Fort Duquesne Bridge Keep left on Fort Duquesne Bridge to Acrisure Stadium / North Shore Exit. Alternate Routes (Event Days) To avoid heavy congestion on I-279 Exit 1B North Shore during event days, please use these Alternate Routes. When applicable, certain Lot Specific Directions contain Alternate Routes to provide additional options if congestion occurs on your lot specific route. Avoid I-279 Northbound Exit 1B Congestion: Use North Shore Alternate

and Tunnel FROM ROUTE 51: Take Libert

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: West End Bridge (Parkway East)  West End Bridge (Parkway West)  both
Reference Answer: West End Bridge
[Source 1] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 2] https://acrisurestadium.com/stadium/parking-directions#directions
[Source 3] https://downtownpittsburgh.com/visit/getting-here/driving-directions
[Source 4] https://www.pittsburghpa.gov/safety/alerts/public-safety-press-release-archive



Question: Which two sports teams play home games at Acrisure Stadium?
Retrieved 4 documents.
Context length: 2029 characters.
------ START CONTEXT ------
Stadium stadium, Pittsburgh, Pennsylvania, United States Ask the Chatbot a Question Also known as: Heinz Field Learn about this topic in these articles: Heinz Company Pittsburgh In Pittsburgh: The contemporary city …city’s professional baseball team, and Acrisure Stadium houses the Steelers, its professional football team. The Penguins, Pittsburgh’s professional ice hockey team, plays at PPG Pai

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: Steelers, Panthers
Reference Answer: Pittsburgh Steelers and Pitt Panthers
[Source 1] https://www.britannica.com/place/acrisure-stadium
[Source 2] https://www.visitpittsburgh.com/blog/guide-to-acrisure-stadium-for-steelers-fans
[Source 3] ../data/zianp/wiki_test/Pittsburgh/Pittsburgh.txt
[Source 4] ../data/zianp/wiki/pittsburgh/Pittsburgh.txt



Question: What is the policy regarding parking pass delivery for the Champions Garage?
Retrieved 4 documents.
Context length: 2041 characters.
------ START CONTEXT ------
pass will be scanned upon entry to the Champions Garage. No Tailgating will be allowed inside the Champions Garage. Entry into the Champions Garage is from West General Robinson Street and North Shore Drive for easy access from all directions. If you purchased a full-season parking package and any or all postseason games are not played at home at Acrisure Stadium this season, all payments for those parking passes will be credited towards your next season park