In [None]:
#https://python.langchain.com/docs/integrations/document_loaders/mediawikidump/

from langchain_community.document_loaders import MWDumpLoader
# Load the Stardew Valley wiki dump with LangChain's built-in MediaWiki loader.
# Forward slashes are used in the path: the previous backslash form contained
# the invalid escape sequence "\s" (a SyntaxWarning on recent Python versions)
# and only worked on Windows.
loader = MWDumpLoader(
    file_path="stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml",
    encoding="utf-8",
    skip_redirects=True,
    stop_on_error=True,
)

# Known limitation observed with this loader: the item price does not appear
# in the exported page text.
# documents=[doc.page_content for doc in loader.load()]

documents = loader.load()


In [1]:
#https://github.com/earwig/mwparserfromhell
import mwxml
import mwparserfromhell
from langchain_core.documents import Document


# Forward slashes avoid the invalid "\s" escape sequence the backslash path
# contained and keep the path usable on non-Windows systems.
dump_file = open(
    "stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml",
    encoding="utf-8",
)
# NOTE(review): the handle is kept in `dump_file` so it can be closed once the
# pages have been consumed; presumably mwxml reads the dump lazily, so it must
# stay open while `stardew_wiki_dump.pages` is iterated -- confirm.
stardew_wiki_dump = mwxml.Dump.from_file(dump_file)

# Collected Document objects, one per page of the dump.
documents = []

def load_single_page_from_dump(page):
    """Convert the first revision of a dump page into a Document.

    Args:
        page: An iterable of revisions that also exposes a ``title``
            attribute (a page object from the wiki dump).

    Returns:
        A Document whose ``page_content`` is the revision's wikitext with
        markup stripped and whose metadata carries the page title under
        ``"source"``; ``None`` when the page yields no revisions.
    """
    revisions = iter(page)
    try:
        first_revision = next(revisions)
    except StopIteration:
        # Nothing to convert: the page has no revisions.
        return None

    wikicode = mwparserfromhell.parse(first_revision.text)
    # keep_template_params=True keeps template parameter values in the
    # plain-text output instead of dropping whole templates.
    plain_text = wikicode.strip_code(
        normalize=True,
        collapse=True,
        keep_template_params=True,
    )
    return Document(page_content=plain_text, metadata={"source": page.title})

# Convert every page of the dump. load_single_page_from_dump returns None for
# a page without revisions; such entries are skipped so later cells can rely
# on every element of `documents` having a `page_content` attribute.
for page in stardew_wiki_dump.pages:
    document = load_single_page_from_dump(page)
    if document is not None:
        documents.append(document)



# print(documents[2].page_content)


In [2]:
# Keep only the text that precedes the first occurrence of "Category" in each
# page, dropping everything from that point on.
# NOTE(review): the match is a plain substring, so a page that mentions
# "Category" in its body text is truncated there as well.
for doc in documents:
    doc.page_content = doc.page_content.partition("Category")[0]

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Any
from tqdm.contrib.concurrent import process_map



# Split pages into chunks of at most ~1000 characters, preferring heading-like
# lines, then blank lines, then single newlines, then any whitespace.
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_size=1000,
    is_separator_regex=True,
    # The patterns deliberately contain no capturing groups: re.split() adds
    # the text of every capturing group to its result list, which would
    # corrupt how the splitter pairs text pieces with their separators.
    # `={3}` matches exactly what the former `(=){3}` matched.
    separators=[r"\w={3}\n", r"\w={2}\n", r"\n\n", r"\n", r"\s"],
    # keep_separator=False
)

# split_documents (not create_documents) because the input is already a list
# of Document objects; each chunk keeps its page's metadata.
documents = splitter.split_documents(documents)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_community.embeddings import OllamaEmbeddings
# Embedding function backed by the "llama3" model served through Ollama.
ollama_emb = OllamaEmbeddings(model="llama3")
# Quick smoke test (disabled):
# print(ollama_emb.embed_query('This is a test document'))

In [5]:
# Only the first MAX_DOCUMENTS chunks are kept for the steps below.
# NOTE(review): presumably a development shortcut to keep embedding fast --
# raise or remove the limit to index the whole wiki.
MAX_DOCUMENTS = 10
documents = documents[:MAX_DOCUMENTS]

In [6]:
from langchain_chroma import Chroma 

# Embed the chunks and store them in a Chroma collection persisted under ./db.
# NOTE(review): because the store is persisted, re-running this cell may add
# the same chunks again -- confirm and clear `db` between runs if so.
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=ollama_emb,
    persist_directory='db',
)

In [7]:
# Sanity check of the raw vector search.
query = '1000 years from now'
docs = vector_db.similarity_search(query)

# TODO: retrieval quality is poor for this query; fix next.
top_hit = docs[0]
print(top_hit.page_content)

250  y  y
Note: This article is about the furniture. For the crop see the Blueberry page.

'Blueberries' is a piece of furniture that hangs on a wall. It can rotate into daily stock at the Carpenter's Shop for 250 or the Traveling Cart for between furniture.

It's also available from the Furniture Catalogue for 0.

Position
Note that some small paintings hang higher or lower on the wall than others.

File:SmallPaintingsPosition.png

History
1.0 Introduced.


In [10]:
# Wrap the vector store as a retriever so it can be composed into a chain.
retriever = vector_db.as_retriever()

query = 'The Binary painting is a furniture item'
# `invoke` is the Runnable entry point for retrievers and replaces the
# deprecated `get_relevant_documents`; it returns the same list of Documents.
retriever.invoke(query)

[Document(page_content='REDIRECT Stardew Valley Wiki', metadata={'source': 'Main Page', 'start_index': 0}),
 Document(page_content="Retro Catalogue  furniture\nThe 'Binary' painting is a furniture item available from the Retro Catalogue.\n\nHistory\n1.6 Introduced.", metadata={'source': "'Binary'", 'start_index': 1}),
 Document(page_content="Retro Catalogue  furniture\nThe 'Abstract' painting is a furniture item available from the Retro Catalogue.\n\nHistory\n1.6 Introduced.", metadata={'source': "'Abstract'", 'start_index': 1}),
 Document(page_content="4000  11 Artifacts that include Rare Disc inline and Dwarf Gadget\n'Burnt Offering' is a piece of furniture that hangs on a wall. It can be obtained by donating 11 Artifacts that include the Rare Disc and Dwarf Gadget to the Museum.  \n\nIt can also be purchased in the Casino for 4000 Qi.\n\nHistory\n1.0 Introduced.", metadata={'source': "'Burnt Offering'", 'start_index': 1})]

In [13]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama

# Local "llama3" model served by Ollama; it generates the final answer.
LLM_MODEL = "llama3"
llm = Ollama(model=LLM_MODEL)

# RAG prompt template fetched from the LangChain hub.
RAG_PROMPT_ID = "rlm/rag-prompt"
prompt = hub.pull(RAG_PROMPT_ID)

def format_docs(docs):
    """Join the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# Inputs for the prompt: "context" is the retrieved documents rendered by
# format_docs, "question" is the user's input passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build prompt inputs -> fill prompt -> call the LLM -> plain string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()


In [22]:
# Ask the chain a question; the cell's last expression shows the answer.
question = "How much is the blueberries furniture piece and where could you buy it?"
rag_chain.invoke(question)

"Based on the context, the blueberries furniture piece costs 250 and can be bought from the Carpenter's Shop for 250 (on Saturdays) or the Traveling Cart, or it is also available from the Furniture Catalogue for 0."

In [21]:
# from langchain_community.llms import Ollama

# llm = Ollama(model="llama3")

# llm.invoke("Tell me a joke")