In [35]:
#https://python.langchain.com/docs/integrations/document_loaders/mediawikidump/

from langchain_community.document_loaders import MWDumpLoader
# Load the wiki XML dump through LangChain's MediaWiki dump loader.
loader = MWDumpLoader(
    # Forward slash instead of a backslash: "\s" is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12) and the
    # backslash form only resolves as a directory separator on Windows.
    file_path="stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml",
    encoding="utf-8",
    skip_redirects=True,
    stop_on_error=True,
)

# Known limitation: the text produced by this loader does not include item prices.
# documents=[doc.page_content for doc in loader.load()]

documents=loader.load()


In [1]:
#https://github.com/earwig/mwparserfromhell
import mwxml
import mwparserfromhell
from langchain_core.documents import Document


# Open the MediaWiki XML export so its pages can be iterated below.
# The path uses a forward slash: the previous "\s" was an invalid escape sequence
# (SyntaxWarning on recent Python versions) and only worked as a separator on Windows.
# NOTE(review): the file handle is never closed. It has to stay open while
# stardew_wiki_dump.pages is being consumed (presumably mwxml parses lazily — confirm);
# close it once the page loop has finished.
stardew_wiki_dump = mwxml.Dump.from_file(
    open("stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml", encoding="utf-8")
)

# Collects one Document per non-redirect page.
documents=[]

def load_single_page_from_dump(page):
    """Convert one dump page into a Document holding its plain-text content.

    Only the first revision yielded by ``page`` is used. Wiki markup is removed
    with ``mwparserfromhell`` (template parameters are kept), and everything from
    the first ``Category:`` marker onwards is dropped.

    Args:
        page: Iterable of revisions that also exposes ``.title``; each revision
            exposes ``.text``.

    Returns:
        Document: ``page_content`` is the stripped text and
        ``metadata["source"]`` is the page title. A page without any revision
        yields an empty ``page_content`` rather than an implicit ``None``, so
        callers can always rely on getting a Document back.
    """
    # Take the first revision only; fall back to empty text when there is none.
    first_revision = next(iter(page), None)
    raw_wikitext = first_revision.text if first_revision is not None else ""

    code = mwparserfromhell.parse(raw_wikitext or "")
    text = code.strip_code(
        normalize=True, collapse=True, keep_template_params=True
    )

    # Split on "Category:" (with the colon) instead of the bare word, so prose that
    # merely mentions "Category" is no longer truncated.
    # NOTE(review): assumes trailing category links appear as "Category:<name>" in
    # the stripped text — confirm against a few pages.
    text = text.split('Category:')[0]

    return Document(page_content=text, metadata={"source": page.title})

# Convert every page that is not a redirect and add the result to `documents`.
documents.extend(
    load_single_page_from_dump(wiki_page)
    for wiki_page in stardew_wiki_dump.pages
    if not wiki_page.redirect
)



# print(documents[2].page_content)


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Any
from tqdm.contrib.concurrent import process_map



# Chunk the page Documents for embedding; each chunk records its start offset
# in the source page (add_start_index=True).
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_size=1000,
    is_separator_regex=True,
    # Separators are listed from most to least specific. The "=" runs are written
    # without a capturing group: re.split() returns the text of every capturing
    # group as an extra item, so "(=){3}" would inject stray "=" pieces into the
    # split result. "={3}" matches exactly the same text.
    separators = [r"\w={3}\n", r"\w={2}\n", r"\n\n", r"\n", r"\s"],
    # keep_separator=False
)

# documents=splitter.create_documents(documents)
# NOTE: rebinding `documents` makes this cell non-idempotent — re-running it
# splits the already-split chunks again.
documents=splitter.split_documents(documents)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_community.embeddings import OllamaEmbeddings
# Embedding function handed to the vector store; "llama3" is kept as a
# commented-out alternative model.
ollama_emb = OllamaEmbeddings(model='nomic-embed-text')  # model="llama3"
# print(ollama_emb.embed_query('This is a test document'))



In [4]:
# Original timing note: 4 mins on Ollama embeddings.
# Cap the number of chunks so the embedding step stays quick while experimenting.
MAX_CHUNKS = 100  # set to None to keep every chunk
documents=documents[:MAX_CHUNKS]

In [20]:
from sentence_transformers import SentenceTransformer


# NOTE(review): trust_remote_code=True runs code files downloaded from the model
# repository (the captured output shows configuration.py and modeling.py being
# fetched). Consider pinning a revision, as that output itself recommends.
model = SentenceTransformer(
    'Alibaba-NLP/gte-large-en-v1.5',
    trust_remote_code=True,
)

# Encode the text of every chunk in one call.
chunk_texts = [doc.page_content for doc in documents]
embeddings = model.encode(chunk_texts)

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [5]:
from langchain_chroma import Chroma 

# Embed the chunks with ollama_emb and store them in a Chroma collection
# persisted under the 'db' directory.
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=ollama_emb,
    persist_directory='db',
)
# vector_db = Chroma.from_documents(embeddings,persist_directory='db')

In [7]:
# Ad-hoc similarity lookup against the vector store.
query = "1000 years from now"
docs = vector_db.similarity_search(query=query)

# TODO: the results for this query are poor; retrieval quality is the next thing to fix.
# print(docs[0].page_content)

250  y  y
Note: This article is about the furniture. For the crop see the Blueberry page.

'Blueberries' is a piece of furniture that hangs on a wall. It can rotate into daily stock at the Carpenter's Shop for 250 or the Traveling Cart for between furniture.

It's also available from the Furniture Catalogue for 0.

Position
Note that some small paintings hang higher or lower on the wall than others.

File:SmallPaintingsPosition.png

History
1.0 Introduced.


In [6]:
# Wrap the vector store as a retriever so it can be composed into a chain.
retriever = vector_db.as_retriever()

query='A Night On Eco-Hill'
# invoke() replaces get_relevant_documents(), which is deprecated and triggered
# the warn_deprecated output previously shown under this cell.
retriever.invoke(query)

  warn_deprecated(


[Document(page_content="20 items\n'A Night On Eco-Hill' is a piece of furniture that hangs on a wall.  It can be obtained only by donating 20 items to the Museum.\n\nHistory\n1.0 Introduced.", metadata={'source': "'A Night On Eco-Hill'", 'start_index': 1}),
 Document(page_content="Magic Rock Candy\nOh, wow... (Name)! Are you sure you want to give this to me? It's so rare!\nI'll admit, my mouth is watering already...\n\nMonster Compendium\nWhoa, that's a creepy looking book... I love it!*Flip* *flip* *flip*... Ooh... It's full of monsters...I'll have to study this before going into the caves... Thanks!\n\nStardrop Tea\nWow... the color is so beautiful, like an amethyst. Thanks!\n\nImageNameDescriptionSourceIngredients 48px|center All Universal LovescenterAmethystAmethystMiningcenterBanana PuddingBanana PuddingCookingBanana 1Milk 1Sugar 1centerBlackberry CobblerBlackberry CobblerCookingBlackberry 2Sugar 1Wheat Flour 1centerChocolate CakeChocolate CakeCookingWheat Flour 1Sugar 1Egg 1cente

In [7]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama

# Language model used to generate the final answer.
llm = Ollama(
    model="llama3",
)

# Prompt template fetched from the LangChain hub.
prompt = hub.pull(
    "rlm/rag-prompt",
)

def format_docs(docs):
    """Join the ``page_content`` of every item in ``docs`` into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        str: The contents in their original order, separated by a blank line;
        an empty string when ``docs`` is empty.
    """
    page_texts = []
    for retrieved in docs:
        page_texts.append(retrieved.page_content)
    return "\n\n".join(page_texts)


# Inputs for the prompt: "context" is the retrieved chunks joined by format_docs,
# "question" is the caller's input passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build the prompt inputs -> fill the prompt -> run the LLM -> parse to str.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()


In [8]:
# Ask the RAG chain a question; the answer is displayed as the cell output.
question = "Where could you get the A night on eco hill painting?"
rag_chain.invoke(question)

'You can get "A Night On Eco-Hill" by donating 20 items to the Museum.'

In [21]:
# from langchain_community.llms import Ollama

# llm = Ollama(model="llama3")

# llm.invoke("Tell me a joke")