In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
#https://github.com/earwig/mwparserfromhell
import mwxml
import mwparserfromhell
from llama_index.core import Document 
from tqdm import tqdm



stardew_wiki_dump = mwxml.Dump.from_file(open("stardewvalleywiki.com_mediawiki-20240505-wikidump\stardewvalleywiki.com_mediawiki-20240505-current.xml", encoding="utf-8"))

raw_documents=[]
    

for page in tqdm(stardew_wiki_dump.pages):
    if not page.redirect:
        for revision in page:
            # print(page)
            # if page.title =="Abigail":
            code = mwparserfromhell.parse(revision.text)
            # print(f'================Code=================\n{code}\n================Code=================')
            # text= code.strip_code(
            #     normalize=False, collapse=True, keep_template_params=True
            # )
            # print(f'================Text=================\n{text}\n================Text=================')
            metadata = {"Page Title":page.title}
            code=code.split('[[Category')[0]
            
            raw_documents.append(Document(text=code, metadata=metadata))
        
# print(documents[0].page_content)

  from .autonotebook import tqdm as notebook_tqdm
2829it [00:14, 196.25it/s]


In [3]:
raw_documents=raw_documents[:10]

In [5]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    SummaryExtractor
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser

text_splitter= LangchainNodeParser(RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_size=1500,
    is_separator_regex=True,
    # separators = [r"\w(=){3}\n", r"\w(=){2}\n", r"\n\n", r"\n"],
    separators = [r"\n\n", r"\n"],))

from llama_index.llms.ollama import Ollama
llm=Ollama(model="llama3", request_timeout=60.0)


# llm=None
qa_extractor = QuestionsAnsweredExtractor(llm,questions=5)
summary_extractor = SummaryExtractor(llm)


from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

from llama_index.core.ingestion import IngestionPipeline
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from llama_index.core import StorageContext



# chroma_client = chromadb.EphemeralClient()
chroma_client=chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("stardew_wiki")
vector_store = ChromaVectorStore(chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

pipeline = IngestionPipeline(
    transformations=[text_splitter, 
                     qa_extractor, 
                     summary_extractor,
                     embed_model
                     ],
    vector_store=vector_store,
)

for i in range (0, len(raw_documents), 5):
    print(f'Processing from document {i} to {i+5}')
    pipeline.run(
        documents=raw_documents[i:i+5],
        show_progress=True,
        # num_workers=3
    )

# pipeline.run(
#     documents=raw_documents,
#     show_progress=True,
#     # num_workers=3
# )


# from llama_index.core import VectorStoreIndex
# index = VectorStoreIndex.from_vector_store(vector_store,embed_model=embed_model)




Processing from document 0 to 5


Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 5046.08it/s]
100%|██████████| 5/5 [00:42<00:00,  8.43s/it]
100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
Generating embeddings: 100%|██████████| 5/5 [00:45<00:00,  9.15s/it]


Processing from document 5 to 10


Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 4863.53it/s]
100%|██████████| 5/5 [00:25<00:00,  5.14s/it]
100%|██████████| 5/5 [00:11<00:00,  2.39s/it]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00, 21.24it/s]


In [6]:
# db=chromadb.PersistentClient(path="./chroma_db")
# test=db.get_collection("stardew_wiki")
# test_vector_store = ChromaVectorStore(test)
from llama_index.core import VectorStoreIndex
test_index = VectorStoreIndex.from_vector_store(vector_store,embed_model=embed_model)

In [7]:
retriever=test_index.as_retriever(verbose=True)

In [8]:
nodes = retriever.retrieve("What is 1000 Years From Now?")
print(nodes)

[NodeWithScore(node=TextNode(id_='749b93d4-bd4c-4085-af63-566ecc3666d4', embedding=None, metadata={'Page Title': "'1000 Years From Now'", 'questions_this_excerpt_can_answer': 'Based on the provided context, here are five potential questions that this context can specifically answer:\n\n1. What is the name of the painter whose stock appears during Winter 16 every 3 years, and how does it relate to the "1000 Years From Now" piece of furniture?\n\nThis question requires knowledge of the specific event (Night Market) and its timing (every 3 years), as well as the connection between Famous Painter Lupini and the "1000 Years From Now" piece.\n\n2. What is the name of the market where the "1000 Years From Now" piece of furniture can be purchased, and when does it appear?\n\nThis question requires knowledge of the specific event (Night Market) and its timing (Winter 16), as well as the location where the piece can be found.\n\n3. How much does the "1000 Years From Now" piece of furniture cost 

In [5]:
nodes

[TextNode(id_='fda32327-491d-48cf-b15f-7f6ddfc87fcd', embedding=None, metadata={'Page Title': "'1000 Years From Now'", 'questions_this_excerpt_can_answer': 'Based on the context, here are five questions that this text can provide specific answers to:\n\n1. What is the name of the famous painter associated with the "1000 Years From Now" furniture piece?\n\nThis question can be answered by looking at the sentence that mentions the rotation into Famous Painter Lupini\'s stock.\n\n2. How often does the "1000 Years From Now" furniture reappear on Winter 16 during the Night Market?\n\nThis question can be answered by examining the sentence that describes the rotation and reappearance of the furniture every 3 years.\n\n3. What is the price of the "1000 Years From Now" furniture piece?\n\nThis question can be answered by looking at the sentence that mentions the purchase price of {{Price|1200}}.\n\n4. In what year was the "1000 Years From Now" furniture introduced?\n\nThis question can be answ

In [8]:
print(nodes[2].get_content(metadata_mode="all"))

[Excerpt from document]
Page Title: 'Abstract'
questions_this_excerpt_can_answer: Based on the given context, here are five potential questions that this text can provide specific answers to:

1. What is the name of the painting featured in the Retro Catalogue?

Answer: The "Abstract" painting.

2. When was the "Abstract" painting introduced in the game?

Answer: According to the History section, it was introduced in version 1.6.

3. Is the "Abstract" painting a type of furniture item?

Answer: Yes, according to the Infobox furniture and the text description, the "Abstract" painting is a furniture item.

4. Where can you find the "Abstract" painting in the game?

Answer: You can find it in the Retro Catalogue.

5. What is the description of the "Abstract" painting provided in-game?

Answer: The description is not explicitly stated in the given text, but according to the Infobox furniture, it provides a link to the {{Description|furniture}} template, which likely contains the descriptio