In [35]:
#https://python.langchain.com/docs/integrations/document_loaders/mediawikidump/

from langchain_community.document_loaders import MWDumpLoader
# Load the MediaWiki XML dump through LangChain's built-in loader.
# The path uses forward slashes: the previous "...wikidump\stardewvalley..."
# literal relied on the invalid escape sequence "\s" (a SyntaxWarning on
# Python 3.12+) and only resolved on Windows.
loader = MWDumpLoader(
    file_path="stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml",
    encoding="utf-8",
    skip_redirects=True,
    stop_on_error=True,
)

# NOTE (from the original author): the text produced by this loader does not
# include item prices. The full Document objects (not just page_content) are
# kept so that metadata stays available downstream.
documents = loader.load()


In [29]:
#https://github.com/earwig/mwparserfromhell
import mwxml
import mwparserfromhell
from langchain_core.documents import Document
from tqdm import tqdm



# Open the dump for streaming. The handle is bound to a name so it can be
# closed explicitly: call dump_file.close() once iteration over
# stardew_wiki_dump.pages has finished (the pages are read from this handle
# while iterating, so it must stay open until then).
# Forward slashes replace the previous "\s" sequence, which is an invalid
# escape (SyntaxWarning on Python 3.12+) and only resolved on Windows.
dump_file = open(
    "stardewvalleywiki.com_mediawiki-20240505-wikidump/stardewvalleywiki.com_mediawiki-20240505-current.xml",
    encoding="utf-8",
)
stardew_wiki_dump = mwxml.Dump.from_file(dump_file)

# Accumulates one Document per non-redirect wiki page.
documents = []


def load_single_page_from_dump(page, i=None):
    """Convert one page of the wiki dump into a LangChain ``Document``.

    Only the first revision yielded by ``page`` is used: its wikitext is
    parsed, markup is stripped to plain text, and everything from the first
    occurrence of the word "Category" onwards is discarded.

    Args:
        page: Iterable of revisions, each exposing ``.text``; ``page.title``
            is stored as the ``source`` metadata entry.
        i: Unused. Kept (now optional) so that callers may pass either
            ``(page)`` or ``(page, index)``; it was previously required even
            though the body never read it, which made one-argument calls fail.

    Returns:
        A ``Document`` for the first revision, or ``None`` when ``page``
        yields no revisions.
    """
    for revision in page:
        wikicode = mwparserfromhell.parse(revision.text)
        text = wikicode.strip_code(
            normalize=True, collapse=True, keep_template_params=True
        )
        # NOTE(review): this cuts at the first "Category" anywhere in the text,
        # presumably to drop the trailing category links; a page that mentions
        # the word earlier gets truncated there as well.
        text = text.split('Category')[0]
        return Document(page_content=text, metadata={"source": page.title})
    # No revisions: make the empty result explicit for callers.
    return None

# Convert every non-redirect page into a Document.
# The page index is passed along because load_single_page_from_dump is declared
# with a second positional parameter; calling it with the page alone raised
# a TypeError on a fresh kernel.
for i, page in enumerate(tqdm(stardew_wiki_dump.pages)):
    if page.redirect:
        continue
    document = load_single_page_from_dump(page, i)
    # The helper returns None for a page without revisions; keep such
    # placeholders out of the list handed to the text splitter.
    if document is not None:
        documents.append(document)


2829it [00:17, 163.91it/s]


In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Any
from tqdm.contrib.concurrent import process_map



# Separator patterns handed to the splitter, listed from coarsest to finest.
# They are regular expressions (see is_separator_regex below).
split_separators = [r"\w(=){3}\n", r"\w(=){2}\n", r"\n\n", r"\n", r"\s"]

splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    is_separator_regex=True,
    chunk_size=1000,
    add_start_index=True,  # each chunk carries its offset as "start_index" metadata
)

# NOTE: `documents` is rebound from whole pages to chunks here, so re-running
# this cell without re-running the ingestion cell splits the chunks again.
documents = splitter.split_documents(documents)

In [31]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Sentence-transformers model used to embed both the chunks and the queries.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [32]:
from langchain_chroma import Chroma 
# Embed every chunk and store it in a Chroma collection persisted under ./db.
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_function,
    persist_directory='db',
)

In [49]:
# Expose the vector store as a retriever and sanity-check it with one query.
retriever = vector_db.as_retriever()

query = 'Can you marry Abigail'
# invoke() is the Runnable entry point (the same one used on rag_chain);
# get_relevant_documents() is deprecated in recent langchain-core releases.
retriever.invoke(query)

[Document(page_content='Abigail warns the player to be more careful in the future, saying "I don\'t want to lose you." Then, she buries the monster and marks the grave with the symbol of Yoba. \nInteracting with the grave displays the text: "Abigail took a life to save mine...I\'ll never forget that."\n\nIf you go home and talk to Abigail the same day as the event she\'ll say: "Good thing I brought my sword today!"\n\nMarriage\nMarriage\nOnce married, Abigail will move into the farmhouse. Like other marriage candidates, she will add her own room to the right of the bedroom. She\'ll also set up a small area behind the farmhouse where she\'ll sometimes go to practice her flute.', metadata={'source': 'Abigail', 'start_index': 18884}),
 Document(page_content='Abigail.png  Fall 13  Pelican Town  Pierre\'s General Store  Pierre FatherCaroline Mother  SamSebastian  Yes  Spring 4  AmethystBanana PuddingBlackberry CobblerChocolate CakePufferfishPumpkinSpicy Eel\n\nAbigail lives at the general s

In [36]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Ollama

# Chat model served by a local Ollama instance.
llm = Ollama(
    model="llama3",
)

# RAG prompt template fetched from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Inputs for the prompt: the retrieved chunks rendered as one text block, and
# the user's question passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# retrieve -> fill prompt -> generate -> plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()


In [47]:
# Ask the full RAG chain a question; the answer string is the cell's output.
question = "Can you marry Abigail ingame?"
rag_chain.invoke(question)

'Yes, you can marry Abigail in-game.'

In [21]:
#TODO:
#1. Find a way to format the tables, dialogs, etc in the raw data
#2. Faster ingestion in VectorDB 
#3. Prevent hallucinations