In [1]:
import os
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_core.tools import tool

from langchain_community.document_loaders.pdf import PyPDFLoader
import requests
import tempfile
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import json
import hashlib

from pathlib import Path
import shutil 

_ = load_dotenv() # Loads the .env file - e.g. the OPENAI_API_KEY

In [8]:
# Document loader function to load PDF files from a URL

def load_pdf_from_url(url: str):
    """
    Load a PDF from a URL and extract its text content.
    
    Args:
        url: The URL of the PDF file
    
    Returns:
        List of documents with page content
    """
    try:
        # Download the PDF content
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Create a temporary file to store the PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(response.content)
            temp_file_path = temp_file.name
        
        # Load the PDF using PyPDFLoader
        loader = PyPDFLoader(temp_file_path)
        documents = loader.load()
        
        # Clean up the temporary file
        os.unlink(temp_file_path)
        
        return documents
    
    except Exception as e:
        print(f"Error loading PDF from URL: {str(e)}")
        return None

## Dummy PDF URL for testing purposes to reduce embedding costs with OpenAI
# docs = load_pdf_from_url("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")

def loader_optcg_rulebooks():
    """
    Load the One Piece Card Game rules from the official website.
    
    Returns:
        List of documents with page content
    """
    comp_rules = load_pdf_from_url("https://en.onepiece-cardgame.com/pdf/rule_comprehensive.pdf?20250221")
    tourney_rules = load_pdf_from_url("https://en.onepiece-cardgame.com/pdf/tournament_rules_manual.pdf?20250613")
    
    # Verifies BOTH comprehensive and tournament rules are loaded
    # If either is None, it means loading failed
    # Ensures we have both sets of rules before proceeding
    if comp_rules is None or tourney_rules is None:
        print("Failed to load One Piece Card Game rules.")
        print("Please check the URLs or your internet connection.")
        return None, None# Exit early if loading fails
    
    # Tag each page/chunk with its source
    for page in comp_rules:
        page.metadata["source"] = "comprehensive_rules"
    for page in tourney_rules:
        page.metadata["source"] = "tournament_rules"

    return comp_rules, tourney_rules



# Test the loader function
x, y = loader_optcg_rulebooks()
print("First 5 pages of the loaded documents:")
x[:5], y[:5]

First 5 pages of the loaded documents:


([Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-02-20T12:30:45+09:00', 'moddate': '2025-02-20T14:31:46+09:00', 'title': 'Comprehensive Rules', 'source': 'comprehensive_rules', 'total_pages': 26, 'page': 0, 'page_label': '1'}, page_content='1 \n \nONE PIECE CARD GAME Comprehensive Rules \nVersion 1.1.8 \n \nLast updated: 2/21/2025 \n \nTable of Contents \nContents of Comprehensive Rules ............................................................................................ \n1. Game Overview .................................................................................................................. 1 \n2. Card Information ................................................................................................................ 3 \n3. Game Areas ........................................................................................................................ 6 \n4. Basic Gam

In [3]:
# docs = x + y
# #docs = load_pdf_from_url("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")
# CACHE_DIRECTORY = Path.home() / ".cache"
# CACHE_DIRECTORY.mkdir(parents=True, exist_ok=True)
# PERSIST_DIRECTORY = CACHE_DIRECTORY / "optcg_rulebooks_vectorstore"
# HASH_PATH = PERSIST_DIRECTORY / "doc_hash.json"
# docs_hash = hash_documents(docs)
# save_hash(docs_hash, HASH_PATH)
# print(f"Document hash saved to {HASH_PATH}")
# load_hash(HASH_PATH)

In [2]:
### NOTE: The following function is for deleting the vector store from disk.
## This should only be used if you need to update the vector store with new documents or changes. Otherwise, only if you absoutely need to delete the vector store from the disk. 

## The implentation of storing the vector store on disk is to ensure that it persists across sessions and does not need to be recreated every time you run the code. Limiting the need to re-embed the documents every time you run the code. Thus, reducing costs and improving performance.


def delete_vectorstore_optcg_rulebooks():
    """Delete the persistent vector store for One Piece Card Game rules"""
    
    # Rudimentary confirmation prompt
    confirmation = input("Are you sure you want to delete the vector store? This action cannot be undone. Type 'yes' to confirm: ")
    if confirmation.strip().lower() != 'yes':
        print("Deletion cancelled.")
        return
    
    # Delete the vector store directory if it exists
    PERSIST_DIRECTORY = Path.home() / ".cache" / "optcg_rulebooks_vectorstore"
    if PERSIST_DIRECTORY.exists():
        try:
            # Use shutil.rmtree to recursively delete the entire directory tree
            shutil.rmtree(PERSIST_DIRECTORY)
            print(f"Deleted vector store at {PERSIST_DIRECTORY}")
        except PermissionError as e:
            print(f"Permission error: {e}")
            print("Please ensure no files are open in the vector store directory.")
            print("You may need to close any applications using the vector store before deleting it. Try restarting your Jupyter kernel and running delete again.")
        except Exception as e:
            print(f"Error deleting vector store: {e}")
    else:
        print("No vector store found to delete.")

In [None]:
## NOTE: Hashing function to create a unique hash for the documents
## This is used to check if the documents have changed since the last time the vector store was created
## i.e. if the documents have been updated.

def hash_documents(documents):
    combined = "".join(doc.page_content for doc in documents)
    return hashlib.md5(combined.encode("utf-8")).hexdigest()

def save_hash(doc_hash, HASH_PATH):
    with open(HASH_PATH, "w") as f:
        json.dump({"hash": doc_hash}, f)

def load_hash(HASH_PATH):
    if HASH_PATH.exists():
        with open(HASH_PATH, "r") as f:
            return json.load(f).get("hash")
    return None


def check_document_changes(documents, HASH_PATH, PERSIST_DIR_IN_CACHE="optcg_rulebooks_vectorstore"): 
    """
    Check if the documents have changed since the last time the vector store was created.
    This should never return `False, False` as at document hash should always be created when the vector store is created.
    """
    current_hash = hash_documents(documents)
    saved_hash = load_hash(HASH_PATH)
    
    CACHE_DIRECTORY = Path.home() / ".cache"
    CACHE_DIRECTORY.mkdir(parents=True, exist_ok=True)
    PERSIST_DIRECTORY = CACHE_DIRECTORY / PERSIST_DIR_IN_CACHE

    if saved_hash is None:
        if os.path.exists(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
            print("WARNING: Previous hash not found, but vector store exists. This is unexpected!")
            print("Please delete the existing vector store and create a new one to ensure hash is created.")
            return False, False # This should not happen! Hash should always be created when the vector store is created.
        print("No previous hash found. Create a new vector store.")
        return False, True  # No previous hash, so we need to create a new vector store
    elif current_hash != saved_hash:
        print("Documents have changed since last vector store creation.")
        return True, True  # Documents have changed, so we need to create a new vector store
    elif current_hash == saved_hash:
        print("Documents have not changed since last vector store creation.")
        return True, False  # No changes detected

# TODO: This function only implements the check for updates to the rules. This function does not update the vector store.
## Would need to call `create_or_load_vectorstore_optcg_rulebooks()` to create the vector store if updates are detected.
## If there is an existing vector store, it should be deleted and a new one will be created.

def check_for_updates_to_rules():
    """Check if the One Piece Card Game rules have been updated since the last vector store creation, using the hash of the documents"""
    CACHE_DIRECTORY = Path.home() / ".cache"
    CACHE_DIRECTORY.mkdir(parents=True, exist_ok=True)
    PERSIST_DIR_IN_CACHE="optcg_rulebooks_vectorstore"
    PERSIST_DIRECTORY = CACHE_DIRECTORY / PERSIST_DIR_IN_CACHE
    HASH_PATH = PERSIST_DIRECTORY / "doc_hash.json"

    comp_rules, tourney_rules = loader_optcg_rulebooks()
    if comp_rules is not None and tourney_rules is not None: # Exit if no documents are loaded.
        documents = comp_rules + tourney_rules
        existing_hash_bool, doc_changes_bool = check_document_changes(documents, HASH_PATH, PERSIST_DIR_IN_CACHE)
        print(f"Existing hash found: {existing_hash_bool}, Document changes detected: {doc_changes_bool}")
        if not doc_changes_bool:
            print("No updates needed.")
            return False
        elif not existing_hash_bool and doc_changes_bool:
            print("Documents have changed or no previous hash found. Create a new vector store.")
            return True       
        elif existing_hash_bool and doc_changes_bool:
            print("Update needed. Delete existing vector store and create a new one.")
            return True
        else: # If check_document_changes returns as `False, False`, this is unexpected.
            # This should not happen! Hash should always be created when the vector store is created.
            print("Unexpected case. Please check the implementation.")
            return None
    else:
        print("Cannot check for updates. No documents loaded.")
        return None
    
    
def preprocess_tournament_rules(documents):
    """Add custom separators for tournament rule numbering"""
    processed_docs = []
    
    for doc in documents:
        # Replace patterns like "1.2 followed by 1.2.1" with double newlines
        content = doc.page_content
        
        # Add double newlines (`\n\n`) before numbered sections like "1.2" and "1.2.1"
        content = re.sub(r'(\d+\.\d+)', r' \n\n\1', content)
        #content = re.sub(r'(\d+\.\d+\.\d+)', r' \n\n\1', content)

        # Create new document with processed content
        new_doc = doc.model_copy()
        new_doc.page_content = content
        processed_docs.append(new_doc)
    
    return processed_docs

def create_or_load_vectorstore_optcg_rulebooks():
    """Create or load the persistent vector store"""
    
    # Define the cache directory and persistent directory
    CACHE_DIRECTORY = Path.home() / ".cache"
    CACHE_DIRECTORY.mkdir(parents=True, exist_ok=True)
    PERSIST_DIRECTORY = CACHE_DIRECTORY / "optcg_rulebooks_vectorstore"

    # Define the embedding model
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large"
        )

    # Check if vector store already exists and load it
    if os.path.exists(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
        print("Loading existing vector store...")
        vectorstore = Chroma(
            persist_directory=str(PERSIST_DIRECTORY),
            embedding_function=embeddings
        )
        return vectorstore
    
    # If vector store does not exist, create it
    else: 
        print("Creating new vector store...")
        
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=300,
            separators=["\n\n", "\n \n", "\n", ". ", " ", ""]
        )
        
        # Load and split documents
        comp_rules, tourney_rules = loader_optcg_rulebooks()
        if not comp_rules or not tourney_rules:
            print("No documents loaded. Please check PDF URLs.")
            return None # Exit if no documents are loaded. Will not create a vector store.
        docs = comp_rules + tourney_rules
        doc_chunks = text_splitter.split_documents(docs)
        print(f"Split documents into {len(doc_chunks)} chunks")
        
        # Create vector store with persistence
        vectorstore = Chroma.from_documents(
            documents=doc_chunks,
            embedding=embeddings,
            persist_directory=str(PERSIST_DIRECTORY)
        )
        print(f"Vector store created and saved to {PERSIST_DIRECTORY}")


        HASH_PATH = PERSIST_DIRECTORY / "doc_hash.json"
        docs_hash = hash_documents(docs)
        save_hash(docs_hash, HASH_PATH)
        print(f"Document hash saved to {HASH_PATH}")

        return vectorstore

# Create or load the vector store
#vectorstore = create_or_load_vectorstore_optcg_rulebooks()

In [None]:
# delete_vectorstore_optcg_rulebooks()
vectorstore = create_or_load_vectorstore_optcg_rulebooks()
check_for_updates_to_rules()

Documents have not changed since last vector store creation.
Existing hash found: True, Document changes detected: False
Documents have not changed since last vector store creation. No updates needed.


False

In [25]:
## Test the vector store
print("Testing vector store with a sample query...")
sample_query = "How many cards can I draw on my first turn?"
results = vectorstore.similarity_search(sample_query, k=3)

[result.metadata["source"] + str(result.metadata["page_label"]) + ": " + result.page_content for result in results]

Testing vector store with a sample query...


['comprehensive_rules12: their effects first, in any order, followed by the player who did not choose \nto go first or second, who then processes their effects in any order. \n5-2-1-5-2. If changes are made to a deck as the result of an effect that reads “At the \nstart of the game”, it is then shuffled by the owner of that deck. \n5-2-1-6. Each player draws 5 cards from their deck  as their opening hand. Then, \nbeginning with the player going first, each player may redraw their hand once \naccording to the procedure below.  \n5-2-1-6-1. The player returns all of the cards in their hand to their deck, reshuffles, \nand then redraws 5 cards. \n5-2-1-7. Each player places a number of cards from the top of their deck equal to the \nLife value of their Leader face-down in their Life area such that the card at the \ntop of their deck is at the bottom in their Life area. \n5-2-1-8. The first player begins the game and starts their turn.',
 'comprehensive_rules12: cannot contain a specified 

In [None]:
sample_query = "What is considered private knowledge?"
results = vectorstore.similarity_search(sample_query, k=3)

[result.metadata["source"] + str(result.metadata["page_label"]) + ": " + result.page_content for result in results]

 'tournament_rules26: • Some severe infractions may also be explained in private to the person committing the \ninfraction. \n• Not every situation that occurs is appropriate for public knowledge. A judge’s decision to \naddress a situation in private must be respected by other players, spectators, uninvolved \njudges and tournament officials. \n• The Head Judge’s decision is final.',
 'tournament_rules22: 23  \nPlayers may not answer questions about any game state information considered private knowledge. \nGiving false or misleading information about private knowledge, or intentionally revealing \ninformation considered private knowledge, may result in a disqualification penalty. \n4.13 Life Area / Hand / Deck Verification \nWith the exception of areas targeted by a search effect, you may not search any of your opponent’s \nprivate knowledge areas, such as hand, deck, or life area, unless you are directed to do so by a card \neffect. \nJudges may not be asked to search or verify your

In [None]:
sample_query = "What is considered public knowledge?"
results = vectorstore.similarity_search(sample_query, k=3)

[result.metadata["source"] + str(result.metadata["page_label"]) + ": " + result.page_content for result in results]

 'tournament_rules22: 23  \nPlayers may not answer questions about any game state information considered private knowledge. \nGiving false or misleading information about private knowledge, or intentionally revealing \ninformation considered private knowledge, may result in a disqualification penalty. \n4.13 Life Area / Hand / Deck Verification \nWith the exception of areas targeted by a search effect, you may not search any of your opponent’s \nprivate knowledge areas, such as hand, deck, or life area, unless you are directed to do so by a card \neffect. \nJudges may not be asked to search or verify your opponent’s hand, deck or life area unless there is \nevidence your opponent may be cheating or that there may be a valid deck-related issue. \n4.14 Appeals \nPlayers have the right to appeal rulings to the Head Judge of the tournament if they disagree with a \nfloor judge’s ruling. \nPlayers may not appeal a floor judge’s ruling until after the floor judge has issued the ruling. \nPla

In [3]:
delete_vectorstore_optcg_rulebooks()

Deleted vector store at C:\Users\tyson\.cache\optcg_rulebooks_vectorstore


In [None]:
@tool
def rulebook_lookup(query: str) -> str:
    """Looks up a rule in the One Piece TCG rulebook."""
    # In a real implementation, this would query a database or API.
    rules = {
        "What happens if two characters with the same name are played on the same team?": "Characters with the same name cannot be played on the same team. If you already have a character in play, you cannot play another character with the same name.",
        "How does the Don!! system work?": "Don!! cards are used to pay costs and activate abilities. You can attach Don!! cards to characters to increase their power or use them to pay for events and character abilities.",
        "What is the difference between active and rest positions?": "Active position means the card is upright and can attack or use abilities. Rest position means the card is turned sideways and cannot attack until it becomes active again during your next turn.",
        "How do you win the game?": "You win by reducing your opponent's life to 0. Life is reduced when your opponent takes damage and has no cards left in their life area to trash.",
        "What is a counter ability?": "Counter abilities can be activated during your opponent's turn when specific conditions are met, usually when one of your characters is being attacked."
    }
    return rules.get(query, "Rule not found.")

In [None]:
tools = [rulebook_lookup]
agent = create_react_agent(
    model=ChatOpenAI(model="gpt-4.1-mini", temperature=0),
    name="RulebookAgent",
    tools=tools, 
    prompt="You are a helpful assistant that helps people find information in the Rulebook for One Piece TCG. You have access to the following tools: {tools}. Use them to find the information the user is looking for. If you don't know the answer, just say you don't know. Do not try to make up an answer.",
)

In [None]:
response = agent.invoke(
    {"messages": [{
        "role": "user", 
        "content": "What happens if two characters with the same name are played on the same team?"
    }]}
)
for m in response["messages"]:
    m.pretty_print()


What happens if two characters with the same name are played on the same team?
Name: RulebookAgent
Tool Calls:
  rulebook_lookup (call_PKkircNK3uQSxN1NV6lDwEsx)
 Call ID: call_PKkircNK3uQSxN1NV6lDwEsx
  Args:
    query: two characters with the same name on the same team
Name: rulebook_lookup

Rule not found.
Name: RulebookAgent

I couldn't find a specific rule in the One Piece TCG rulebook about what happens if two characters with the same name are played on the same team. If you have any other questions or need information on a related topic, feel free to ask!


In [None]:
rules = {
        "What happens if two characters with the same name are played on the same team?": "Characters with the same name cannot be played on the same team. If you already have a character in play, you cannot play another character with the same name."}
rules.get("What happens if two characters with the same name are played on the same team?", "Rule not found.")

'Characters with the same name cannot be played on the same team. If you already have a character in play, you cannot play another character with the same name.'

In [None]:
# Example: Load a PDF from a URL
# Replace this with your actual PDF URL
pdf_url = "https://example.com/your-pdf-file.pdf"

# Uncomment the lines below to test with a real PDF URL
# documents = load_pdf_from_url(pdf_url)
# if documents:
#     print(f"Successfully loaded {len(documents)} pages from the PDF")
#     
#     # Show first page content (first 500 characters)
#     if len(documents) > 0:
#         print(f"\nFirst page content preview:")
#         print(documents[0].page_content[:500] + "...")
#         
#         # Show metadata
#         print(f"\nPage metadata:")
#         print(documents[0].metadata)
# else:
#     print("Failed to load PDF")

In [None]:
# Debug cell - check Path object
print(f"Path type: {type(Path)}")
print(f"Path.home() type: {type(Path.home())}")
print(f"Path.home() value: {Path.home()}")

# Test the problematic line step by step
home = Path.home()
print(f"home: {home}, type: {type(home)}")

cache = home / ".cache"
print(f"cache: {cache}, type: {type(cache)}")

final_path = cache / "onepiece_vectorstore"
print(f"final_path: {final_path}, type: {type(final_path)}")

Path type: <class 'type'>
Path.home() type: <class 'pathlib._local.WindowsPath'>
Path.home() value: C:\Users\tyson
home: C:\Users\tyson, type: <class 'pathlib._local.WindowsPath'>
cache: C:\Users\tyson\.cache, type: <class 'pathlib._local.WindowsPath'>
final_path: C:\Users\tyson\.cache\onepiece_vectorstore, type: <class 'pathlib._local.WindowsPath'>
