In [9]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.text_splitter import TokenTextSplitter
from llama_index import download_loader

# Obtain the WikipediaReader loader class through download_loader, then load
# the single article this notebook works with.
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
documents = loader.load_data(
    pages=["2023 Cricket World Cup"],
    auto_suggest=False,
)
print(f"Loaded {len(documents)} documents")

# Text splitter that breaks the document text into chunks for processing.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=256,
    chunk_overlap=20,
)

# Node parser driven by the splitter above; it turns the loaded documents
# into the `nodes` that later cells index.
node_parser = SimpleNodeParser(text_splitter=text_splitter)
nodes = node_parser.get_nodes_from_documents(documents)
print(f"loaded nodes with {len(nodes)} nodes")

Loaded 1 documents
loaded nodes with 9 nodes


In [15]:
from llama_index.embeddings import TextEmbeddingsInference

# Embedding client that talks to a Text Embeddings Inference endpoint.
# The base_url below targets a server on this machine; to use a remote host
# instead (for example an EC2 instance), replace it with that host's
# "http://<host>:8080" address.
embed_model = TextEmbeddingsInference(
    base_url="http://127.0.0.1:8080",
    model_name="BAAI/bge-large-en-v1.5",
    embed_batch_size=10,  # batch size for embedding
    timeout=60,  # timeout in seconds
)

In [22]:
import os
import logging
import sys
from getpass import getpass

# SECURITY: the API key must never be written into the notebook source, since
# notebooks are routinely shared and committed. The key is taken from the
# OPENAI_API_KEY environment variable when it is already set; otherwise it is
# requested interactively (getpass does not echo the input) and stored in the
# environment for this kernel session only.
# NOTE(review): a key was previously hardcoded in this cell - it should be
# treated as leaked and revoked in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Only errors are logged, and they are sent to stdout so they appear in the
# cell output.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

from llama_index.llms import OpenAI

# define LLM
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

In [23]:
from llama_index import ServiceContext

# Bundle the LLM and the embedding model into a single service context.
# The chunk settings use the same values (256 / 20) that were given to the
# TokenTextSplitter earlier in the notebook.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=256,
    chunk_overlap=20,
)

In [None]:
from llama_index import VectorStoreIndex

# Build a vector index over the parsed nodes using the service context
# configured above; show_progress=True requests a progress bar while the
# index is built.
index = VectorStoreIndex(nodes=nodes, service_context=service_context, show_progress=True)

# Query engine created from the index with its default arguments.
query_engine = index.as_query_engine()

Generating embeddings:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Put a natural-language question to the query engine and print the
# response it returns.
response = query_engine.query(
    "which team has won most matches and which team has lost most matches"
)
print(response)