In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain_ollama import ChatOllama


base_url = "http://localhost:11434"
model = 'llama3.2'

llm = ChatOllama(base_url=base_url, model=model)
llm

ChatOllama(model='llama3.2', base_url='http://localhost:11434')

In [3]:
llm.invoke("What is the capital of France?")

AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2025-05-15T03:54:18.474659451Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3552462250, 'load_duration': 3055686550, 'prompt_eval_count': 32, 'prompt_eval_duration': 393889803, 'eval_count': 8, 'eval_duration': 101578814, 'model_name': 'llama3.2'}, id='run--5cca35d9-0b27-4f20-942f-da042de9b8db-0', usage_metadata={'input_tokens': 32, 'output_tokens': 8, 'total_tokens': 40})

In [4]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain
from tqdm import tqdm

In [5]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "../data/crawled_content/",
    glob="**/utsa_*.txt",
    loader_cls=TextLoader,
    loader_kwargs={
        "encoding": "utf8",  # Specify encoding
        "autodetect_encoding": True,  # Automatically detect encoding
    },
    show_progress=True,  # Show loading progress
    silent_errors=False,  # Handle errors without stopping
)

documents = loader.load()

100%|██████████| 19/19 [00:00<00:00, 146.78it/s]


In [6]:
# Print number of documents loaded
print(f"Loaded {len(documents)} documents")

# Access the content of documents
for doc in documents:
    print(doc.page_content[:100])  # Print first 100 characters
    print(doc.metadata)  # Print metadata (includes file path)

Loaded 19 documents
LINK: https://hpcsupport.utsa.edu/foswiki/bin/view/WebDocumentation/WebHome/?sortcol=2;table=1;up=0

{'source': '../data/crawled_content/utsa_edu_11000.txt'}
LINK: https://rowdylink.utsa.edu/event/10750478
TITLE: -
CONTENT:


LINK: https://www.utsa.edu/strat
{'source': '../data/crawled_content/utsa_edu_2000.txt'}
LINK: https://www.utsa.edu/advising/advisor/life.html
TITLE: Life & Health Sciences | UTSA Advising 
{'source': '../data/crawled_content/utsa_edu_5000.txt'}
LINK: https://www.utsa.edu/strategicplan/initiatives/upcoming-past/
TITLE: Upcoming & Past Initiativ
{'source': '../data/crawled_content/utsa_edu_10000.txt'}
LINK: https://utsa.edu/
TITLE: Welcome to The University of Texas at San Antonio | UTSA
CONTENT:

UT
{'source': '../data/crawled_content/utsa_edu_1000.txt'}
LINK: https://colfa.utsa.edu/faculty/profiles/modern-languages/huenlich-david.html
TITLE: David Huen
{'source': '../data/crawled_content/utsa_edu_9000.txt'}
LINK: https://sciences.utsa.edu/facu

In [7]:
print(documents[0].page_content)  # Print the content of the first document

LINK: https://hpcsupport.utsa.edu/foswiki/bin/view/WebDocumentation/WebHome/?sortcol=2;table=1;up=0
TITLE: WebHome < WebDocumentation < Foswiki
CONTENT:
You are here:
>
172 total compute/GPU nodes and 2 login nodes, majority of these are Intel Cascade Lake CPUs and some are AMD EPYC CPUs
2 large-memory nodes, each containing four CPUs with 20-cores each for a total of 80 cores, and each including 1.5TB of RAM
1 large-memory node, equipped with two AMD EPYC CPUs with 8-cores each for total of 16 cores and 2 TB of RAM
6 nodes equipped with two AMD EPYC CPUs with 8-cores each for a total of 16 cores and 1 TB of RAM
100Gb/s Infiniband connectivity
Two Lustre filesytems: /home and /work, where /home has 110 TBs capacity and /work has 1.1 PB of capacity
A cumulative total of 250TB of local scratch (approximately 1.5 TB of /scratch space on most compute/GPU nodes)
amdonly: 6 node
amdbigmem: 1 node
- amdgpu: 2 nodes
- bigmem: 2 nodes
- compute1: 65 nodes
- compute2: 25 nodes
- computedev: 5 no

In [8]:
import re


def text_clean(text):
    text = re.sub(r"\n\n+", "\n\n", text)
    text = re.sub(r"\t+", "\t", text)
    text = re.sub(r"\s+", " ", text)
    return text

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["LINK:"]
)
chunks = text_splitter.split_documents(documents)

In [10]:
len(chunks)

8973

In [11]:
print(chunks[0].page_content)

LINK: https://hpcsupport.utsa.edu/foswiki/bin/view/WebDocumentation/WebHome/?sortcol=2;table=1;up=0
TITLE: WebHome < WebDocumentation < Foswiki
CONTENT:
You are here:
>
172 total compute/GPU nodes and 2 login nodes, majority of these are Intel Cascade Lake CPUs and some are AMD EPYC CPUs
2 large-memory nodes, each containing four CPUs with 20-cores each for a total of 80 cores, and each including 1.5TB of RAM
1 large-memory node, equipped with two AMD EPYC CPUs with 8-cores each for total of 16 cores and 2 TB of RAM
6 nodes equipped with two AMD EPYC CPUs with 8-cores each for a total of 16 cores and 1 TB of RAM
100Gb/s Infiniband connectivity
Two Lustre filesytems: /home and /work, where /home has 110 TBs capacity and /work has 1.1 PB of capacity
A cumulative total of 250TB of local scratch (approximately 1.5 TB of /scratch space on most compute/GPU nodes)
amdonly: 6 node
amdbigmem: 1 node
- amdgpu: 2 nodes
- bigmem: 2 nodes
- compute1: 65 nodes
- compute2: 25 nodes
- computedev: 5 no

In [12]:
chunks_clean = [text_clean(chunk.page_content) for chunk in chunks]
print(chunks_clean[0])

LINK: https://hpcsupport.utsa.edu/foswiki/bin/view/WebDocumentation/WebHome/?sortcol=2;table=1;up=0 TITLE: WebHome < WebDocumentation < Foswiki CONTENT: You are here: > 172 total compute/GPU nodes and 2 login nodes, majority of these are Intel Cascade Lake CPUs and some are AMD EPYC CPUs 2 large-memory nodes, each containing four CPUs with 20-cores each for a total of 80 cores, and each including 1.5TB of RAM 1 large-memory node, equipped with two AMD EPYC CPUs with 8-cores each for total of 16 cores and 2 TB of RAM 6 nodes equipped with two AMD EPYC CPUs with 8-cores each for a total of 16 cores and 1 TB of RAM 100Gb/s Infiniband connectivity Two Lustre filesytems: /home and /work, where /home has 110 TBs capacity and /work has 1.1 PB of capacity A cumulative total of 250TB of local scratch (approximately 1.5 TB of /scratch space on most compute/GPU nodes) amdonly: 6 node amdbigmem: 1 node - amdgpu: 2 nodes - bigmem: 2 nodes - compute1: 65 nodes - compute2: 25 nodes - computedev: 5 no

In [13]:
chunks_clean[:3]

['LINK: https://hpcsupport.utsa.edu/foswiki/bin/view/WebDocumentation/WebHome/?sortcol=2;table=1;up=0 TITLE: WebHome < WebDocumentation < Foswiki CONTENT: You are here: > 172 total compute/GPU nodes and 2 login nodes, majority of these are Intel Cascade Lake CPUs and some are AMD EPYC CPUs 2 large-memory nodes, each containing four CPUs with 20-cores each for a total of 80 cores, and each including 1.5TB of RAM 1 large-memory node, equipped with two AMD EPYC CPUs with 8-cores each for total of 16 cores and 2 TB of RAM 6 nodes equipped with two AMD EPYC CPUs with 8-cores each for a total of 16 cores and 1 TB of RAM 100Gb/s Infiniband connectivity Two Lustre filesytems: /home and /work, where /home has 110 TBs capacity and /work has 1.1 PB of capacity A cumulative total of 250TB of local scratch (approximately 1.5 TB of /scratch space on most compute/GPU nodes) amdonly: 6 node amdbigmem: 1 node - amdgpu: 2 nodes - bigmem: 2 nodes - compute1: 65 nodes - compute2: 25 nodes - computedev: 5 

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.schema import Document
from langgraph.graph import StateGraph, END
import json

# 1. Setup the LLM
# llm = ChatOpenAI(model="gpt-3.5-turbo")
llm = ChatOllama(base_url=base_url, model=model)

# 2. Create a prompt for filtering
filter_prompt = ChatPromptTemplate.from_template(
    """
You are an assistant that determines if text is related to university applications, professor information or reseach.

Text: {chunk_text}

Determine if this text is relevant to university applications based on these specific criteria:
1. Content about professors (contact information, research interests, biographies, etc.)
2. Information about graduate applications and programs (requirements, TOEFL/GRE scores, application processes)
3. Details about research topics, including funding information

Any content matching these criteria is relevant. All other content is NOT relevant.

Return a JSON with:
- "is_relevant": true or false
- "reason": brief explanation of your decision, referencing which specific criterion was met (if relevant)

Only return the JSON.
"""
)

# 3. Create a parser for structured output
json_parser = JsonOutputParser()

# 4. Create the filtering chain
filter_chain = filter_prompt | llm | json_parser

In [15]:
# print(chunks[100].page_content)

In [16]:
filter_chain.invoke({"chunk_text": chunks[100].page_content})

{'is_relevant': False, 'reason': 'None of the criteria match the content.'}

In [17]:
from typing import TypedDict


class InputState(TypedDict):
    chunk_text: str


class OutputState(TypedDict):
    is_relevant: str
    reason: str


class OverallState(InputState, OutputState):
    pass


def filter_university_content(state: InputState) -> OverallState:
    """
    Function to filter university-related content from a chunk of text.
    Args:
        chunk (Document): A chunk of text to be filtered.
    Returns:
        dict: A dictionary containing the chunk and its relevance status.
    """
    if isinstance(state["chunk_text"], Document):
        chunk =state["chunk_text"]
    else:
        chunk = Document(page_content=state["chunk_text"])
    result = filter_chain.invoke({"chunk_text": chunk.page_content})
    # Add the result to the document's metadata
    # state["chunk_text"] = chunk.page_content
    # state["is_relevant"] = result["is_relevant"]
    # state["reason"] = result["reason"]
    # chunk.metadata["is_university_related"] = result["is_relevant"]
    # chunk.metadata["relevance_reason"] = result["reason"]

    return {
        "chunk": chunk,
        "is_relevant": result["is_relevant"] if result["is_relevant"] else "N/A",
        "reason": result["reason"] if result["is_relevant"] else "N/A",
    }

In [18]:
input_state = InputState(chunk_text=chunks_clean[100])
result = filter_university_content(input_state)
result

{'chunk': Document(metadata={}, page_content="LINK: https://global.utsa.edu/faculty-scholars/Visa Types/J1 Scholar.html TITLE: J-1 Scholar | Global Initiatives | UTSA | University of Texas at San Antonio CONTENT: Search Search icons myUTSA UTSA Today Visit Directory J-1 Scholar - Medical emergencies (accident, illness, injury) - Exchange Visitor missing (sudden departure, long absence, has not returned to UTSA as planned) - Litigation - Incident involving the criminal justice system (arrest, charges, law enforcement, etc.) - Sexually-related incidents or abuse - Exchange Visitor death - Other situations impacting Exchange Visitor safety (natural disasters, civil unrest, outbreaks of violence) - Medical benefits of at least U.S. $100,000 per accident or illness. - Repatriation of remains in the amount of U.S. $25,000. - Expenses associated with medical evacuation in the amount of U.S. $50,000. - Deductible per accident or illness $500 or less. - A willful failure to carry required insur

In [19]:
graph = StateGraph(OverallState)
graph.add_node("filter", filter_university_content)

graph.add_edge("filter", END)
graph.set_entry_point("filter")

filter_graph = graph.compile()

In [20]:
filter_graph.invoke({"chunk_text": chunks_clean[215]})

{'chunk_text': 'LINK: https://www.utsa.edu/studentunion/roadrunnerpantry TITLE: 404 - Page Not Found | UTSA CONTENT: Page Not Found The page or file you were looking for may have been moved or is not available because of: Try Searching Keyword Found a Broken Link? Visit Our Home Page Thank you for visiting The University of Texas at San Antonio! LINK: https://pathways.utsa.edu/forms/ TITLE: Application – UTSA IES Pathways CONTENT: Download Flyer The UTSA P20 Pathways to Educational Research Training Program is designed to mentor undergraduate juniors beginning in their junior year to conduct collaborative Educational Science research that prepares them for graduate study. The training program is primarily housed within the UTSA Department of Interdisciplinary Teaching and includes faculty and students from across disciplines who are conducting Educational Science research. The founded training program aims to: 1. Provide undergraduate juniors with experiences in conducting Educational 

In [21]:
def process_chunks(chunks):
    # filter_graph = build_filter_graph()
    relevant_chunks = []
    irrelevant_chunks = []

    for chunk in tqdm(chunks):
        # print(i)
        result = filter_graph.invoke({"chunk_text": chunk})
        if result["is_relevant"] == "True":
            relevant_chunks.append(result["chunk_text"])
        else:
            irrelevant_chunks.append(result["chunk_text"])

    return {"relevant_chunks": relevant_chunks, "irrelevant_chunks": irrelevant_chunks}
# {"relevant_chunks": relevant_chunks, "irrelevant_chunks": irrelevant_chunks}

In [23]:
results = process_chunks(chunks[1000:1100])

# Print results
print(f"Found {len(results['relevant_chunks'])} relevant chunks")
print(f"Found {len(results['irrelevant_chunks'])} irrelevant chunks")

# Example of accessing the first relevant chunk
if results["relevant_chunks"]:
    print("\nExample relevant chunk:")
    print(f"Content: {results['relevant_chunks'][0].page_content}")
    print(f"Metadata: {results['relevant_chunks'][0].metadata}")

100%|██████████| 100/100 [01:22<00:00,  1.21it/s]

Found 0 relevant chunks
Found 100 irrelevant chunks





In [24]:
results['relevant_chunks'][:3]

[]

In [25]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS 

In [26]:
embeddings = OllamaEmbeddings(model=model, base_url='http://localhost:11434')
# embeddings = OpenAIEmbeddings()

In [28]:
# NOTE: chane line 3 to relevent_chunks

db = FAISS.from_documents(
    results["irrelevant_chunks"], 
    # results["relevant_chunks"], 
    embeddings,
)
db.save_local("../data/faiss_index_utsa")

In [29]:
db = FAISS.load_local("../data/faiss_index_utsa", embeddings, allow_dangerous_deserialization=True)

In [30]:
retriever = db.as_retriever(search_kwargs={"k": 4})
retriever.invoke("does utsa has mechnical engineering program?")

[Document(id='c740ed36-c222-4ace-9a82-bad26b792a51', metadata={'source': '../data/crawled_content/utsa_edu_2000.txt'}, page_content='LINK: https://www.utsa.edu/hop/chapter9/9-3.html\nTITLE: Section 9.03, Handbook of Operating Procedures Amendment Approval Process  | Handbook of Operating Procedures | UTSA | The University of Texas at San Antonio\nCONTENT:\nThe University of Texas at San Antonio\nSearch\n»\n.16\n- 9.19 – Administration of the Student Deposit Endowment Fund (Deleted Effective May 17, 2022)\n\nI. POLICY STATEMENT\n\n\nII. RATIONALE\n\nThis policy establishes the guidance for amending the institutionâ\x80\x99s HOP including obtaining input from faculty, staff and student governance bodies that may be affected by change in policies and procedures.\n\nIII. SCOPE\n\nThis policy applies to any significant change to a HOP policy, including the addition, reformatting, modification, or deletion of policies and procedures.\n\nIV. WEBSITE ADDRESS FOR THIS POLICY\n\n\nV. RELATED STA