In [2]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.language_models.llms import BaseLLM
from langchain.llms.ollama import Ollama
from src.web_page_loader import loadWebsite
import json
from typing import Callable


In [3]:
# Mistral Paper: https://arxiv.org/pdf/2310.06825.pdf
# Shared Ollama-served model handle; later cells use it both for token counting
# (llm.get_num_tokens) and as the generation step of the chains.
# temperature=0.0 presumably keeps sampling as repeatable as the backend allows — TODO confirm.
llm = Ollama(model='mistral:7b', temperature=0.0)

In [4]:
# Page to ingest. The same URL is also handed to `extractLinks` below.
SOURCE_URL = "https://www.5esrd.com/classes"
# loadWebsite is a project helper (src.web_page_loader). It returns two values that
# later cells treat as the page rendered to markdown and a JSON-serialisable map of
# extracted tags/links.
# NOTE(review): the meaning of each keyword argument is not visible from this
# notebook — confirm against the helper before changing any of them.
websiteMd, tagMap = loadWebsite(
  SOURCE_URL,
  contentTag="main",
  excludedTags=[
    { "name": "script" },
    { "attrs": { "id": "toc_container" }},
    { "attrs": { "class": "ogn-childpages" }}
  ],
  extractTags={ "tables": { "name": "table" } },
  extractLinks=SOURCE_URL
)

None
None
None
{'id': 'toc_container'}
{'class': 'ogn-childpages'}


In [5]:
# Inspect what the loader produced: the tag map as indented JSON first, then the
# page markdown, each terminated by a newline.
print(json.dumps(tagMap, indent=2), websiteMd, sep="\n")

{
  "a": [
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/aaw-games-llc/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/aaw-games-llc/gemcaster/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/aaw-games-llc/stonespeaker/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/aaw-games-llc/transgnomamist-svirfneblin-wizard-arcane-tradition/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/aaw-games-llc/underterror/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/c-j-leung/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/c-j-leung/channeler/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/dream-realm-storytellers-ltd/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/grimlore-entertainment/",
    "https://www.5esrd.com/classes/3rd-party-publisher-classes/grimlore-entertainment/the-p

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


def semanticTextSplitter(markdown: str):
  """Chunk `markdown` with SemanticChunker backed by Ollama embeddings.

  The whole string is passed as a single source text; the return value is
  whatever `SemanticChunker.create_documents` produces for it.
  """
  embeddings = OllamaEmbeddings(model="mistral:7b")
  chunker = SemanticChunker(embeddings)
  sources = [markdown]
  return chunker.create_documents(sources)

def recursiveTextSplitter(markdown: str, chunkSize: int = 1000, chunkOverlap: int = 50):
  """Chunk `markdown` with RecursiveCharacterTextSplitter.

  Args:
    markdown: The text to split; it is passed as a single source text.
    chunkSize: Upper bound for a chunk, measured with `len` (characters).
      Defaults to 1000, the value this notebook previously hard-coded.
    chunkOverlap: Number of characters shared between neighbouring chunks.
      Defaults to 50, the value this notebook previously hard-coded.

  Returns:
    The documents produced by `create_documents` for the single input text.
  """
  splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunkSize,
    chunk_overlap=chunkOverlap,
    # Sizes are counted in characters, not model tokens.
    length_function=len,
    # Separators are treated as literal strings rather than regex patterns.
    is_separator_regex=False,
  )

  return splitter.create_documents([markdown])

def markdownTextSplitter(markdown: str):
  """Chunk `markdown` on its heading lines, levels 1 through 5.

  Headers are kept in the chunk text (`strip_headers=False`). Each level is
  labelled "Header <level>" in the split configuration.
  """
  # Builds ("#", "Header 1") ... ("#####", "Header 5").
  headerLevels = [("#" * depth, f"Header {depth}") for depth in range(1, 6)]
  headerSplitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headerLevels,
    strip_headers=False,
  )
  return headerSplitter.split_text(markdown)

In [7]:
# Split the loaded page into semantic chunks; later cells iterate over websiteDocs.
websiteDocs = semanticTextSplitter(websiteMd)

print(f'Semantic Docs Found: {len(websiteDocs)}')

# NOTE(review): running this cell emitted a "sequence length ... (1196 > 1024)"
# tokenizer warning, which suggests get_num_tokens counts with a fallback
# tokenizer rather than Mistral's own. Treat the sizes as approximate — TODO confirm.
for doc in websiteDocs:
  print(f'  > Chunk Size: {llm.get_num_tokens(doc.page_content)}')

Semantic Docs Found: 7


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (1196 > 1024). Running this sequence through the model will result in indexing errors


  > Chunk Size: 657
  > Chunk Size: 42
  > Chunk Size: 1196
  > Chunk Size: 174
  > Chunk Size: 1102
  > Chunk Size: 31
  > Chunk Size: 31


In [8]:
from src.prompts.proposition import systemMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
from llama_index.core.base.llms.types import ChatResponse, ChatMessage, MessageRole
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser, ResponseSchema, StructuredOutputParser

# Shape of the JSON the model is asked to return: one "propositions" key
# holding a list of strings.
response_schemas = [
    ResponseSchema(name="propositions", description="list of proposition strings found in the provided text", type="list[str]"),
]

# Parser used both to generate the format instructions injected into the prompt
# below and to parse the model output in the extraction loop.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Pydantic description of the same payload: a single list-of-strings field.
class PropositionResponse(BaseModel):
  propositions: list[str] = Field(description="list of propositions found in the text")

# NOTE(review): propositionParser is not referenced by any other cell visible in
# this notebook; the prompt and the extraction loop both use output_parser.
propositionParser = PydanticOutputParser(pydantic_object=PropositionResponse)

# Proposition-extraction prompt with {format_instructions} pre-filled, so callers
# only supply `input`. Despite the name `systemMessage`, the formatted prompt is
# rendered as a "Human:" message (see the preview cell's output).
prompt = ChatPromptTemplate.from_template(
  template=systemMessage,
  partial_variables={
    "format_instructions": output_parser.get_format_instructions()
  }
)


In [9]:
# Render the prompt once with a placeholder input, show it, and report how many
# tokens the fixed part of the prompt costs per request.
samplePrompt = prompt.format(input="Test")
print(samplePrompt)

sampleTokenCount = llm.get_num_tokens(samplePrompt)
print(f'Tokens: {sampleTokenCount}')

Human: 
Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"propositions": list[str]  // list of proposition strings found in the provided text
}
```

Decompose the following:
Test

Tokens: 213


In [10]:
# Proposition extraction: send every semantic chunk through `prompt | llm`,
# parse the JSON reply, and collect all propositions into one flat list.
chain = prompt | llm

# Built once up front instead of on every failure. The fixer asks the LLM to
# repair output that output_parser could not parse (up to max_retries times).
fixer = OutputFixingParser.from_llm(
  parser=output_parser,
  max_retries=10,
  llm=llm
)

propositions = []
for index, doc in enumerate(websiteDocs):
  print(f'> Processing Document {index} [Tokens: {llm.get_num_tokens(doc.page_content)}]')

  print(prompt.format(input=doc.page_content))
  # The chain ends in the LLM itself, so the reply is the raw completion text
  # that is handed to the parsers below.
  chatResponse: str = chain.invoke({'input': doc.page_content })

  print('==> Parsing Response')

  try:
    parsedResponse: dict = output_parser.parse(chatResponse)
    found = list(parsedResponse['propositions'])
  except Exception as err:
    # `Exception` rather than a bare `except:` so that Ctrl-C / kernel interrupt
    # still stops this long-running loop instead of triggering the fixer.
    print('==> Error In response, running fixer')
    print(f'    {type(err).__name__}: {err}')
    # If the fixer also fails, the error propagates — same as before.
    parsedResponse = fixer.parse(chatResponse)
    found = list(parsedResponse['propositions'])

  count = len(found)
  print(f'==> Propositions Found: {count}')
  propositions.extend(found)



print('=== Final List ===\n')
print(json.dumps(propositions, indent=2))

> Processing Document 0 [Tokens: 657]
Human: 
Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"propositions": list[str]  // list of proposition strings found in the provided text
}
```

Decompose the following:


 
##### 

 

 > 
# Classes

 
## 

 

As your character goes on adventures and overcomes challenges, he

In [16]:
# Embed every extracted proposition with the Ollama-served mistral:7b model and
# index the results in a FAISS vector store.
propositionEmbeddings = OllamaEmbeddings(model="mistral:7b")
vectorstore = FAISS.from_texts(propositions, embedding=propositionEmbeddings)

# Retriever view of the store, used as the context source of the QA chain.
retriever = vectorstore.as_retriever()

In [20]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Question-answering prompt: the model may only use the retrieved context and is
# told to answer "I don't know" when that context is insufficient.
template = """Answer the question asked by the user.  If you do not have the answer respond with "I don't know" and recommend additional data be loaded.
Only use the following context to answer question:
{context}

Question: {question}
"""

# NOTE(review): `prompt` and `chain` rebind the names used earlier for the
# proposition-extraction prompt and chain, so re-running the extraction cell
# after this one would pick up this QA prompt. Consider distinct names.
prompt = ChatPromptTemplate.from_template(template)

# The incoming question string is fanned out: the retriever turns it into
# context, while RunnablePassthrough forwards it unchanged as the question.
retrievalInputs = {"context": retriever, "question": RunnablePassthrough()}
chain = retrievalInputs | prompt | llm | StrOutputParser()

def propositionQuery(input: str):
  """Run the module-level `chain` on the question text and return its result."""
  answer = chain.invoke(input)
  return answer

# Smoke-test questions for the retrieval chain. The trailing comments record
# whether an answer is expected from the loaded page.
questions = [
  "What is the purpose of classes?",          # Should Answer
  "What is the purpose of levels?",           # Should Answer
  "How do I create a character in this game?" # Should not answer
]

# Print each question as a heading with the chain's answer in a fenced block.
for q in questions:
  print('## Question: {question}\n\n```\n{result}\n```\n\n'.format(
    question=q,
    result=propositionQuery(q)
  ))


## Question: What is the purpose of classes?

```
 Based on the provided context, classes in this context likely refer to character classes in a role-playing game. The purpose of classes, as described in the documents, is that they enhance the narrative by providing specific abilities and capabilities for the character. Additionally, advancing in a class grants additional features as detailed in the class description. There's no mention of hit points or inspiration specifically being related to classes, but based on other documents, it seems that constitution modifier increases result in hit point maximum increases.
```


## Question: What is the purpose of levels?

```
 Based on the provided context, it appears that levels in this context refer to advancements in capability that grant additional features and enhance the narrative. There's no explicit mention of hit points or rewarding other players being directly related to levels, but those are mentioned as separate concepts in the c