In [3]:
## Retrieval augmented generation

from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
import os

# Ensure environment is loaded correctly
from dotenv import load_dotenv
load_dotenv()

# Set LLM and embedding model explicitly
Settings.llm.api_key = os.getenv("OPENAI_API_KEY")  # Optional but safe
Settings.embed_model = OpenAIEmbedding(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"
)





In [4]:
os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")

In [5]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents=SimpleDirectoryReader("data").load_data()


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 120 0 (offset 0)
Ignoring wrong pointing object 247 0 (offset 0)


In [6]:
documents

[Document(id_='08d14cd5-a95e-4c24-b64e-4eb56c33762f', embedding=None, metadata={'page_label': '1', 'file_name': 'AcademicPolicies.pdf', 'file_path': '/Users/vincentgibbons/Library/CloudStorage/OneDrive-Personal/College/Spring Semester 2025/Capstone 2/coug-gpt/data/AcademicPolicies.pdf', 'file_type': 'application/pdf', 'file_size': 5718443, 'creation_date': '2025-03-25', 'last_modified_date': '2025-03-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Colorado Christian University\n2025 Catalog', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'

In [7]:
index=VectorStoreIndex.from_documents(documents, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 782/782 [00:00<00:00, 2698.35it/s]
Generating embeddings: 100%|██████████| 810/810 [00:13<00:00, 62.26it/s]


In [8]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1724a4be0>

In [9]:
query_engine=index.as_query_engine()

In [10]:
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x1723b9b10>

In [11]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor

retriever=VectorIndexRetriever(index=index,similarity_top_k=10)
postprocesser=SimilarityPostprocessor(similarity_cutoff=0.40)

query_engine=RetrieverQueryEngine(retriever=retriever, node_postprocessors=[postprocesser])

In [12]:
response=query_engine.query("who is tim mctavish")

In [13]:
response

Response(response='Tim McTavish is a member of the Board of Trustees at Colorado Christian University.', source_nodes=[NodeWithScore(node=TextNode(id_='1335612f-1c54-4db8-a3f2-7ab5b68c82d2', embedding=None, metadata={'page_label': '188', 'file_name': 'cus-student-handbook.pdf', 'file_path': '/Users/vincentgibbons/Library/CloudStorage/OneDrive-Personal/College/Spring Semester 2025/Capstone 2/coug-gpt/data/cus-student-handbook.pdf', 'file_type': 'application/pdf', 'file_size': 2123850, 'creation_date': '2025-03-25', 'last_modified_date': '2025-02-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f25ea720-7630-4f30-9bde-f411f0474557', node_type='4', metadata={'page_label': '188', 'file_name': 'cus-student-

In [14]:
print(response)

Tim McTavish is a member of the Board of Trustees at Colorado Christian University.


In [15]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response,show_source=True)
print(response)

Final Response: Tim McTavish is a member of the Board of Trustees at
Colorado Christian University.
______________________________________________________________________
Source Node 1/4
Node ID: 1335612f-1c54-4db8-a3f2-7ab5b68c82d2
Similarity: 0.5886546068589684
Text: 187      If the person obtaining the ticket at the end of ten
days takes no action, the  student receiving the ticket loses the
right of appeal.      LOCAL CHURCHES  (CCU does not endorse one church
over another)
______________________________________________________________________
Source Node 2/4
Node ID: 47b5cb59-bafc-4ff1-acbe-4738905a029a
Similarity: 0.5858982909395776
Text: Gary L. Steward Dean, School of Humanities and Social Sciences;
Associate Professor of History College of Undergraduate Studies BA,
South Dakota State University MA, Westminster Theological Seminary
MDiv, Ph.D., Southern Baptist Theological Seminary Shane T. Stone
Director; Associate Professor of Criminal Justice College of Adult and
Graduate St

In [17]:
import os.path
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)

# Check if storage already exists
PERSIST_DIR = "./storage"
if not os.path.exists(PERSIST_DIR):
    # Load documents and create the index
    documents = SimpleDirectoryReader("data").load_data()
    index = VectorStoreIndex.from_documents(documents)
    #Store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    #Load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# Either way we can now query the index
query_engine = index.as_query_engine()
response = query_engine.query("Who is tim mctavish")
print(response)



There is no information provided about Tim McTavish in the given context.


In [23]:
name = "cory hixson"

matches = []
for node in index.docstore.docs.values():
    if name in node.text.lower():
        matches.append(node)

print(f"Found {len(matches)} matching chunks.")

for i, node in enumerate(matches):
    print(f"\n--- Match {i+1} ---")
    print("File:", node.metadata.get("file_name", "Unknown"))
    print("Text:", node.text[:500])


Found 1 matching chunks.

--- Match 1 ---
File: AcademicPolicies.pdf
Text: Kristen J. Goree
Director, Graduate Nursing; Professor of Nursing
College of Adult and Graduate Studies
B.A., BSN, MSN, DNP, University of Colorado
Misti D. Gossett Thrower
Assistant Professor of Counseling
College of Adult and Graduate Studies
BA, Metropolitan State University
MA, MA, University of Colorado Denver
PhD, Adams State University
Sonji D. Gregory
Associate Professor ofCounseling
College of Adult and Graduate Studies
BA, Malone University
MA, Ashland University
PhD, Regent University
