In [1]:
# Patch asyncio via nest_asyncio before anything else runs.
# NOTE(review): presumably required because later cells drive async library code
# from inside the notebook's already-running event loop — confirm it is still needed.
import nest_asyncio
nest_asyncio.apply()

In [2]:
#https://github.com/earwig/mwparserfromhell
import mwxml
import mwparserfromhell
from llama_index.core import Document 
from tqdm import tqdm



# Location of the MediaWiki XML export, relative to the notebook.
# Forward slashes are used on purpose: the previous backslash-separated literal
# contained the invalid escape sequence "\s" and only worked on Windows.
WIKI_DUMP_PATH = (
    "stardewvalleywiki.com_mediawiki-20240505-wikidump/"
    "stardewvalleywiki.com_mediawiki-20240505-current.xml"
)

# One llama_index Document per revision of every non-redirect page.
raw_documents = []

# The dump is consumed while iterating over `.pages`, so the file has to stay
# open for the whole loop; the context manager closes it afterwards.
with open(WIKI_DUMP_PATH, encoding="utf-8") as dump_file:
    stardew_wiki_dump = mwxml.Dump.from_file(dump_file)

    for page in tqdm(stardew_wiki_dump.pages):
        # Redirect pages carry no content of their own.
        if page.redirect:
            continue

        for revision in page:
            code = mwparserfromhell.parse(revision.text)
            metadata = {"Page Title": page.title}
            # Keep only the wikitext that precedes the first "[[Category" link,
            # i.e. drop the trailing category footer.
            code = code.split('[[Category')[0]

            raw_documents.append(Document(text=code, metadata=metadata))
        
# print(documents[0].page_content)

  from .autonotebook import tqdm as notebook_tqdm
2829it [00:14, 188.98it/s]


In [3]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    SummaryExtractor
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser

# Regex separators handed to the recursive splitter: paragraph breaks, then line breaks.
chunk_separators = [r"\n\n", r"\n"]

# LangChain splitter configured for chunks of at most 1500 (chunk_size) and with
# start offsets recorded (add_start_index).
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    is_separator_regex=True,
    chunk_size=1500,
    add_start_index=True,
)

# Wrap it so it can be used as a llama_index pipeline transformation.
text_splitter = LangchainNodeParser(recursive_splitter)

from llama_index.llms.ollama import Ollama
# LLM used by both metadata extractors below and by the query engine later on.
llm = Ollama(
    model="llama3",
    request_timeout=60.0,
)

# Asks the LLM for 5 questions per chunk; the recorded outputs show them stored
# under the "questions_this_excerpt_can_answer" metadata key.
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=5)

# Second LLM-backed extractor, built with its default settings.
summary_extractor = SummaryExtractor(llm=llm)


from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model shared by ingestion (last pipeline stage) and by the index at query time.
embed_model = HuggingFaceEmbedding(
    model_name="all-MiniLM-L6-v2",
)

from llama_index.core.ingestion import IngestionPipeline
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from llama_index.core import StorageContext



# chroma_client = chromadb.EphemeralClient()
# Chroma client persisted under ./chroma_db; the "stardew_wiki" collection is
# reused when it already exists and created otherwise.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="stardew_wiki")

# llama_index adapters around the Chroma collection.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Stages in the order they are applied:
# chunking -> question metadata -> summary metadata -> embeddings.
ingest_steps = [
    text_splitter,
    qa_extractor,
    summary_extractor,
    embed_model,
]

# The pipeline is given the Chroma-backed vector store as its output target.
pipeline = IngestionPipeline(
    transformations=ingest_steps,
    vector_store=vector_store,
)

# Number of documents handed to the pipeline per run() call.
INGEST_BATCH_SIZE = 200

# Ingest in fixed-size batches so progress is visible between run() calls; the
# recorded timings show each batch takes one to three hours on the LLM-backed stages.
# NOTE(review): the collection is persistent and nothing in this cell deduplicates,
# so re-running it presumably stores every node a second time — confirm before re-running.
for i in range(0, len(raw_documents), INGEST_BATCH_SIZE):
    batch = raw_documents[i:i + INGEST_BATCH_SIZE]
    # Report the real end of the slice; the last batch is usually shorter than
    # INGEST_BATCH_SIZE, and the old message overstated it.
    print(f'Processing from document {i} to {i + len(batch)}')
    pipeline.run(
        documents=batch,
        show_progress=True,
    )

# pipeline.run(
#     documents=raw_documents,
#     show_progress=True,
#     # num_workers=3
# )


# from llama_index.core import VectorStoreIndex
# index = VectorStoreIndex.from_vector_store(vector_store,embed_model=embed_model)




Processing from document 0 to 200


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 4558.23it/s]
100%|██████████| 562/562 [55:23<00:00,  5.91s/it] 
100%|██████████| 562/562 [25:03<00:00,  2.68s/it]
Generating embeddings: 100%|██████████| 562/562 [00:31<00:00, 17.88it/s]


Processing from document 200 to 400


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 5013.36it/s]
100%|██████████| 637/637 [1:01:42<00:00,  5.81s/it]
100%|██████████| 637/637 [29:32<00:00,  2.78s/it]
Generating embeddings: 100%|██████████| 637/637 [00:29<00:00, 21.93it/s]


Processing from document 400 to 600


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 2415.27it/s]
100%|██████████| 1183/1183 [2:01:44<00:00,  6.17s/it] 
100%|██████████| 1183/1183 [52:07<00:00,  2.64s/it] 
Generating embeddings: 100%|██████████| 1183/1183 [00:54<00:00, 21.77it/s]


Processing from document 600 to 800


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 3287.47it/s]
100%|██████████| 887/887 [1:28:19<00:00,  5.97s/it]
100%|██████████| 887/887 [41:44<00:00,  2.82s/it]
Generating embeddings: 100%|██████████| 887/887 [00:40<00:00, 21.85it/s]


Processing from document 800 to 1000


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 3856.55it/s]
100%|██████████| 688/688 [1:09:04<00:00,  6.02s/it]
100%|██████████| 688/688 [32:59<00:00,  2.88s/it]
Generating embeddings: 100%|██████████| 688/688 [00:33<00:00, 20.35it/s]


Processing from document 1000 to 1200


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 4270.08it/s]
100%|██████████| 789/789 [1:16:27<00:00,  5.81s/it]
100%|██████████| 789/789 [36:10<00:00,  2.75s/it]
Generating embeddings: 100%|██████████| 789/789 [00:34<00:00, 22.55it/s]


Processing from document 1200 to 1400


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 4092.72it/s]
100%|██████████| 658/658 [1:04:20<00:00,  5.87s/it]
100%|██████████| 658/658 [29:43<00:00,  2.71s/it]
Generating embeddings: 100%|██████████| 658/658 [00:30<00:00, 21.86it/s]


Processing from document 1400 to 1600


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 4180.38it/s]
100%|██████████| 791/791 [1:15:28<00:00,  5.73s/it]
100%|██████████| 791/791 [36:18<00:00,  2.75s/it]
Generating embeddings: 100%|██████████| 791/791 [00:35<00:00, 22.33it/s]


Processing from document 1600 to 1800


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 4774.75it/s]
100%|██████████| 709/709 [1:08:13<00:00,  5.77s/it]
100%|██████████| 709/709 [32:48<00:00,  2.78s/it]
Generating embeddings: 100%|██████████| 709/709 [00:32<00:00, 22.14it/s]


Processing from document 1800 to 2000


Parsing nodes: 100%|██████████| 200/200 [00:00<00:00, 3785.68it/s]
100%|██████████| 784/784 [1:17:49<00:00,  5.96s/it]
100%|██████████| 784/784 [37:09<00:00,  2.84s/it]
Generating embeddings: 100%|██████████| 784/784 [00:35<00:00, 21.80it/s]


Processing from document 2000 to 2200


Parsing nodes: 100%|██████████| 5/5 [00:00<00:00, 5015.91it/s]
100%|██████████| 10/10 [01:00<00:00,  6.00s/it]
100%|██████████| 10/10 [00:27<00:00,  2.74s/it]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 11.23it/s]


In [40]:
# Spot-check: pull a single stored record straight from the Chroma collection.
# The recorded output shows 'ids' and 'metadatas' populated and 'embeddings': None.
doc=chroma_collection.get(limit=1)
doc

{'ids': ['0011fcd0-4374-41a9-ab30-6c125f4a41f0'],
 'embeddings': None,
 'metadatas': [{'Page Title': 'Robin',
   '_node_content': '{"id_": "0011fcd0-4374-41a9-ab30-6c125f4a41f0", "embedding": null, "metadata": {"Page Title": "Robin", "questions_this_excerpt_can_answer": "Based on the context, here are five questions that this context can specifically answer:\\n\\n1. What is the name of the character who has upgraded their house?\\nAnswer: Pam\\n\\n2. What is the unique character of each piece of wood mentioned in the Summer section?\\nAnswer: Each piece of wood has its own unique character.\\n\\n3. Is it possible to upgrade one\'s farm buildings without providing enough lumber and stone for the project?\\nAnswer: No, according to Robin, you need to provide enough lumber and stone for the project.\\n\\n4. Who is the wild man that lives behind Robin\'s house?\\nAnswer: This context does not explicitly mention his name, but it can be inferred that he is a mysterious character who lives in

In [4]:
# db=chromadb.PersistentClient(path="./chroma_db")
# test=db.get_collection("stardew_wiki")
# test_vector_store = ChromaVectorStore(test)
from llama_index.core import VectorStoreIndex
# Build the index on top of the already-populated vector store, using the same
# embedding model that was used during ingestion.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

In [28]:
# Two views over the same index:
#   retriever    - used below via retrieve(); the recorded output is a list of NodeWithScore
#   query_engine - used below via query(), configured with the Ollama `llm`
retriever=index.as_retriever(verbose=True)
query_engine=index.as_query_engine(llm=llm)

In [58]:
# Retrieval-only check: look at which chunks come back for a sample question.
winter_bundle_question = "What are the things included in the Winter Foraging Bundle?"
nodes = retriever.retrieve(winter_bundle_question)
nodes

[NodeWithScore(node=TextNode(id_='afbb165a-6736-4622-8db3-9f1680c94a4c', embedding=None, metadata={'Page Title': 'Remixed Bundles', 'questions_this_excerpt_can_answer': "Based on the given context, here are five questions that this context can provide specific answers to:\n\n1. What items can be obtained by tilling soil or finding Artifact Spots in Winter?\n\nAnswer: Winter Root, Snow Yam, and possibly other items not listed.\n\n2. Which mobs drop Crystal Fruit when foraging in Winter?\n\nAnswer: Dust Sprites\n\n3. In which areas of the game can Blue Slimes be found to obtain Winter Root?\n\nAnswer: Floors 41-79 of The Mines\n\n4. What is the reward item obtained from completing this bundle?\n\nAnswer: Winter Seeds (30)\n\n5. How many items are chosen at random for the Winter Foraging Bundle?\n\nAnswer: 4 items\n\nAs for higher-level summaries and surrounding context, here's a possible summary:\n\nThe Remixed Bundles page appears to be a feature in a game where players can obtain bundl

In [63]:
#https://docs.llamaindex.ai/en/stable/examples/retrievers/recursive_retriever_nodes/

# End-to-end check: retrieval plus LLM answer for a sample question.
response = query_engine.query('What crops are in the Summer Bundle?')
# print() applies str() to the response itself.
print(response)

Based on the provided context, I can answer the question.

The crops included in the "Summer Crops Bundle" are:

1. Tomato
2. Hot Pepper
3. Blueberry
4. Melon


In [5]:
# NOTE(review): the recorded output here is a list of bare TextNode objects, whereas the
# retrieve() cell above produces NodeWithScore objects — this cell was evidently run
# against a different `nodes` value (hidden kernel state); confirm where it came from.
nodes

[TextNode(id_='fda32327-491d-48cf-b15f-7f6ddfc87fcd', embedding=None, metadata={'Page Title': "'1000 Years From Now'", 'questions_this_excerpt_can_answer': 'Based on the context, here are five questions that this text can provide specific answers to:\n\n1. What is the name of the famous painter associated with the "1000 Years From Now" furniture piece?\n\nThis question can be answered by looking at the sentence that mentions the rotation into Famous Painter Lupini\'s stock.\n\n2. How often does the "1000 Years From Now" furniture reappear on Winter 16 during the Night Market?\n\nThis question can be answered by examining the sentence that describes the rotation and reappearance of the furniture every 3 years.\n\n3. What is the price of the "1000 Years From Now" furniture piece?\n\nThis question can be answered by looking at the sentence that mentions the purchase price of {{Price|1200}}.\n\n4. In what year was the "1000 Years From Now" furniture introduced?\n\nThis question can be answ

In [8]:
# Print the third node's content with metadata_mode="all"; in the recorded output the
# metadata (Page Title, generated questions) appears ahead of the excerpt text.
# NOTE(review): relies on whichever `nodes` value is live in the kernel — see the cell above.
print(nodes[2].get_content(metadata_mode="all"))

[Excerpt from document]
Page Title: 'Abstract'
questions_this_excerpt_can_answer: Based on the given context, here are five potential questions that this text can provide specific answers to:

1. What is the name of the painting featured in the Retro Catalogue?

Answer: The "Abstract" painting.

2. When was the "Abstract" painting introduced in the game?

Answer: According to the History section, it was introduced in version 1.6.

3. Is the "Abstract" painting a type of furniture item?

Answer: Yes, according to the Infobox furniture and the text description, the "Abstract" painting is a furniture item.

4. Where can you find the "Abstract" painting in the game?

Answer: You can find it in the Retro Catalogue.

5. What is the description of the "Abstract" painting provided in-game?

Answer: The description is not explicitly stated in the given text, but according to the Infobox furniture, it provides a link to the {{Description|furniture}} template, which likely contains the descriptio