In [1]:
import os 
import importlib
import textwrap
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, set_global_tokenizer, PromptHelper, StorageContext
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llamaindex_object_array_reader.dataset import simple_ols # import a simple dataset 
from llama_index.legacy.llms import HuggingFaceLLM
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import BitsAndBytesConfig
from llama_index.llms.ollama import Ollama
# from langchain.embeddings import HuggingFaceEmbedding, HuggingFaceInstructEmbeddings
from llama_index.legacy.embeddings import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace
from chromadb import Collection, PersistentClient
from dotenv import load_dotenv
from llamaindex_object_array_reader import ObjectArrayReader
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
import nest_asyncio

# 允许嵌套事件循环
nest_asyncio.apply()



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging
import sys
from llamaindex_object_array_reader._logging import logger

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
log = logger
log.setLevel(logging.DEBUG)

In [3]:
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

In [4]:
# Obsolete
# if os.path.exists('my_cred.py'):
#     my_cred = importlib.import_module('my_cred')
#     os.environ['OPENAI_API_KEY'] = my_cred.OPENAI_API_KEY
# else:
#     # Set your OPENAI API Key
#     os.environ['OPENAI_API_KEY'] = "vy-...cH5N"

load_dotenv()
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
HF_TOKEN = os.environ['HF_TOKEN']

In [5]:
def print_resp(msg, max_len:int=55):
    """将文本分割为每行最大长度的子字符串
    """
    divider: str = '\n'+ '*'*60+'\n'
    msg = textwrap.fill(msg, width=max_len)
    print(f"""\u2705 RESPONSE:{divider}\n{msg}\n{divider} \U0001F6A9END OF RESPONSE""")

In [6]:
models:Namespace = Namespace(
    BERT_BASE_CHINESE="bert-base-chinese",
    LLAMA2_CHINESE_7B_CHAT="FlagAlpha/Llama2-Chinese-7b-Chat", #18G needed
    LLAMA2_7B_CHAT_HF="meta-llama/Llama-2-7b-chat-hf", #18G needed
    BLOOM_560M="bigscience/bloom-560m", #18G needed
    BLOOMZ_560M="bigscience/bloomz-560m", #18G needed
    GPT2="GPT2", #18G needed
    ALL_MPNET_BASE_V2="sentence-transformers/all-mpnet-base-v2", #18G needed
    MISTRAL_7B_INSTRUCT_V0_1="mistralai/Mistral-7B-Instruct-v0.1", #18G needed
    STARLING_LM_7B="berkeley-nest/Starling-LM-7B-alpha",
)

In [7]:
# Set the check point
check_point:str = models.ALL_MPNET_BASE_V2

In [8]:
tokenizer = AutoTokenizer.from_pretrained(check_point)
set_global_tokenizer(tokenizer)

# Alternatively, using a local LLM
USE_LOCAL:bool = True
if USE_LOCAL:
    # llm = Ollama(model="llama2-chinese")
    # llm = Ollama(model="starling-lm:7b-alpha-q3_K_M")
    llm = Ollama(model="mistral")
    
else: 
    llm = HuggingFaceLLM(
        model_name=check_point,
        tokenizer_name=check_point,
        context_window=512,
        model_kwargs={
            # 'torch_dtype':torch.float16,
            "token": HF_TOKEN,
            'load_in_8bit':False, #No, the bitsandbytes library only works on CUDA GPU. So it must set to 'False' as running on mac os. 
            'offload_folder':"offload_folder",
            'offload_state_dict':True,
            'is_decoder': True if check_point==models.BERT_BASE_CHINESE else None,
            },
        tokenizer_kwargs={
            "token": HF_TOKEN,
            "return_tensors":'pt',},
        device_map="auto" if check_point!=models.BERT_BASE_CHINESE else "mps", 
    )


Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /sentence-transformers/all-mpnet-base-v2/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


In [9]:
embedding_model = HuggingFaceEmbedding(
    model_name=check_point,
    tokenizer=tokenizer,
    cache_folder="cache_folder",
    max_length=512,
    device="mps"
)

https://huggingface.co:443 "HEAD /sentence-transformers/all-mpnet-base-v2/resolve/main/config.json HTTP/1.1" 200 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "GET /sentence-transformers/all-mpnet-base-v2/raw/main/1_Pooling/config.json HTTP/1.1" 200 190


In [10]:
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
prompt_helper = PromptHelper(
    context_window=512,
    num_output=256,
    chunk_overlap_ratio=0.1,
    chunk_size_limit=None,
)

In [11]:
documents = SimpleDirectoryReader("test_docs/simple_txt_short_en").load_data()

> [SimpleDirectoryReader] Total files added: 3


In [12]:
# Assuming documents have already been loaded
# Initialize the parser
parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)
# Parse documents into nodes
nodes = parser.get_nodes_from_documents(documents)
print('Total nodes:', len(nodes))
for _, n in enumerate(nodes):
    print(n)
    print('---')

> Adding chunk: You can do data integration, management, analys...
> Adding chunk: Colosscious' flagship product, Pharmquer, is an...
> Adding chunk: Welcome to Colosscious. 
We are the expert who ...
Total nodes: 3
Node ID: cde41ebf-9512-4793-a8d6-b3002b2e527d
Text: You can do data integration, management, analysis and composing
reports and dashboards with Pharmquer, and then automatize all your
works.
---
Node ID: e5bfe2b0-5ee3-4b74-9d34-e2b77c28a40e
Text: Colosscious' flagship product, Pharmquer, is an enterprise level
software of manufacturing and business intelligence, which is
architected especially for the industry.
---
Node ID: 7e1706e8-8321-409c-b1fd-72891a749953
Text: Welcome to Colosscious.  We are the expert who spotlight-focus
on providing the digital technology to bio and pharmaceutical
companies, engaging in boosting the performances of new drug
developments, quality control, manufacturing processes, and reducing
the costs and duration by Big Data.
---


In [13]:
V_DB_NAME = "chromadb"
chroma_client = PersistentClient(V_DB_NAME)
COLLECTION_NAME:str = 'test'
chroma_collection:Collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


2024-02-13 02:05:38,359 - chromadb.telemetry.product.posthog - [32;20mINFO[0m - (posthog.py:20) - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information. 


Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Starting component System
Starting component Posthog
Starting component OpenTelemetryClient
Starting component SimpleAssignmentPolicy
Starting component SqliteDB
Starting component LocalSegmentManager
Starting component SegmentAPI


In [14]:
for n in nodes:
    print(storage_context.docstore.document_exists(n.id_))

False
False
False


Starting new HTTPS connection (1): us-api.i.posthog.com:443
https://us-api.i.posthog.com:443 "POST /batch/ HTTP/1.1" 200 None


## Create and store new embeddings to ChromaDB. 

In [15]:
storage_context.docstore.add_documents(nodes)

service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,
    prompt_helper=prompt_helper, callback_manager=callback_manager)
# index = VectorStoreIndex.from_documents(
#     documents, service_context=service_context, storage_context=storage_context, show_progress=True,
# )
index = VectorStoreIndex(
    nodes, service_context=service_context, storage_context=storage_context, show_progress=True,
)

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00, 13.44it/s]

Starting component PersistentLocalHnswSegment





Add of existing embedding ID: dc0f865e-90c8-42b0-9239-19625ebcef35




Add of existing embedding ID: 1f7abdb8-4dbb-4f9d-9398-f59fb630b862




Add of existing embedding ID: cb553733-838a-421b-89bf-c582fe90182a
**********
Trace: index_construction
    |_embedding ->  0.222214 seconds
**********


https://us-api.i.posthog.com:443 "POST /batch/ HTTP/1.1" 200 None


In [16]:
query_engine = index.as_query_engine()

In [17]:
tokenizer(
    ["What Colosscious do?"],
    return_tensors="pt",
    add_special_tokens=False,
).input_ids.to("mps")

tensor([[ 2058,  8906, 15098, 18440,  2083,  1033]], device='mps:0')

In [18]:
query_resp = query_engine.query("What is flagship product of Colosscious")

print_resp(query_resp.response)

> Top 1 nodes:
> [Node cb130b57-5f50-4276-9215-38aa878a7381] [Similarity score: 0.4660001156353038] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 7e1706e8-8321-409c-b1fd-72891a749953] [Similarity score: 0.4660001156353038] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> Top 2 nodes:
> [Node cb130b57-5f50-4276-9215-38aa878a7381] [Similarity score:             0.466] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 7e1706e8-8321-409c-b1fd-72891a749953] [Similarity score:             0.466] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
load_ssl_context verify=True cert=None trust_env=True http2=False
load_verify_locations cafile='/Users/yuwang/Developments/python/llamaindex_object_array_reader/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
connect_tcp.

2024-02-13 02:05:52,363 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
✅ RESPONSE:
************************************************************

 Colosconscious specializes in delivering digital
technology solutions to bio and pharmaceutical
companies, with a focus on enhancing new drug
development, improving quality control, optimizing
manufacturing processes, and reducing costs and
duration through Big Data. No specific flagship product
is mentioned in the context provided.

************************************************************
 🚩END OF RESPONSE


In [None]:
query_engine = index.as_chat_engine()
query_resp = query_engine.query("What is Pharmquer?")
print_resp(query_resp.response)

2024-02-07 19:52:36,318 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

PharmQuer is an international pharmacovigilance
electronic system used in more than 80 countries for
the collection and analysis of spontaneous case reports
(adverse reactions to drugs). It is a free, web-based
platform that allows users to report, review and
analyze cases. The primary purpose of PharmQuer is to
facilitate data sharing between regulatory agencies,
pharmaceutical companies, academia, and other
stakeholders in the field of pharmacovigilance.

************************************************************
 🚩END OF RESPONSE


## Load existing embeddings in ChromaDB.

In [39]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,
    prompt_helper=prompt_helper, callback_manager=callback_manager)
# load your index from stored vectors
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context, service_context=service_context
)


**********
Trace: index_construction
**********


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,


In [40]:
# create a query engine
query_engine = index.as_query_engine()

In [41]:
response = query_engine.query("What is Colosscious?")
print_resp(response.response)

> Top 1 nodes:
> Top 1 nodes:
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c47-47b9-aad2-e97c95d22903] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c47-47b9-aad2-e97c95d22903] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> Top 2 nodes:
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score:             0.376915] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c

2024-02-13 02:02:49,467 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
receive_response_body.complete
response_closed.started
response_closed.started
response_closed.complete
response_closed.complete
close.started
close.started
close.complete
close.complete
✅ RESPONSE:
************************************************************

 Colosconscious is an expert entity that specializes in
providing digital technology solutions to bio and
pharmaceutical companies. Their focus areas include
enhancing new drug development, improving quality
control processes, optimizing manufacturing methods,
and reducing costs and durations through the
application of Big Data.

************************************************************
 🚩END OF RESPONSE


In [42]:
response = query_engine.query("What is Colosscious?")
print_resp(response.response)

> Top 1 nodes:
> Top 1 nodes:
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c47-47b9-aad2-e97c95d22903] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c47-47b9-aad2-e97c95d22903] [Similarity score: 0.37691508919720346] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> Top 2 nodes:
> [Node cb553733-838a-421b-89bf-c582fe90182a] [Similarity score:             0.376915] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 5b2ae422-9c

2024-02-13 02:02:58,931 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
receive_response_body.complete
response_closed.started
response_closed.started
response_closed.complete
response_closed.complete
close.started
close.started
close.complete
close.complete
✅ RESPONSE:
************************************************************

 Coloscius is an expert organization that specializes
in providing digital technology solutions to bio and
pharmaceutical companies. Their focus includes
enhancing new drug development, ensuring quality
control, optimizing manufacturing processes, and
reducing costs and duration through the application of
Big Data.

************************************************************
 🚩END OF RESPONSE


## Use llama_index_object_array_reader

In [19]:
# Preview: demo data
simple_ols[:2]

[{'x1': 97.98219999874924,
  'x2': 99.84941752810117,
  'x3': 100.9727776594234,
  'y': 360.87650920565545},
 {'x1': 101.00077953260389,
  'x2': 99.87874921228179,
  'x3': 99.35642250227457,
  'y': 361.50488035486944}]

In [20]:
loader = ObjectArrayReader()

In [21]:
from llama_index.core.readers.base import Document
object_arrays:list[Document] = loader.load_data(file=simple_ols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      50 non-null     float64
 1   x2      50 non-null     float64
 2   x3      50 non-null     float64
 3   y       50 non-null     float64
dtypes: float64(4)
memory usage: 1.7 KB


In [22]:
import pandas as pd
df = pd.DataFrame(simple_ols)

In [23]:
object_arrays[0]

Document(id_='7caf7ec5-3892-4b9c-9195-a5b0a1795ee4', embedding=None, metadata={'columns': "['x1', 'x2', 'x3', 'y']", 'schema': 'None', 'shape': '(50, 4)'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='97.98219999874924, 99.84941752810117, 100.9727776594234, 360.87650920565545\n101.00077953260389, 99.87874921228179, 99.35642250227457, 361.50488035486944\n98.5109626677227, 100.7485502397903, 99.46465098250788, 359.8117609861218\n100.77335929310553, 100.03722922045552, 99.86657209922947, 362.2336960397953\n100.97359840386007, 99.1724799721807, 100.16093297144785, 362.1391160315852\n100.18799255929102, 100.55900119891184, 100.61532849440285, 363.29752977180965\n100.9157547652626, 98.61649241995889, 99.06726035297895, 359.7975894964005\n101.04615952660859, 102.00920930524853, 100.16419028246959, 364.8003752715575\n99.46321248760913, 100.23898461781165, 100.4603474082993, 361.9810830871964\n101.01997365057879, 100.70311893925478, 100.35193718659701,

In [24]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, callback_manager=callback_manager)
index = VectorStoreIndex.from_documents(
    documents=object_arrays, service_context=service_context,  storage_context=storage_context, show_progress=True,
)

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, callback_manager=callback_manager)
Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2225 > 512). Running this sequence through the model will result in indexing errors


> Adding chunk: 97.98219999874924, 99.84941752810117, 100.97277...
> Adding chunk: 98.90754655604644, 99.80434032022505, 360.25326...
> Adding chunk: 26476906248293, 100.78711425646654, 100.2982222...
> Adding chunk: 100.84958819151115, 361.7497627252443
99.070933...


Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 42.52it/s]
Generating embeddings: 100%|██████████| 4/4 [00:00<00:00,  6.68it/s]

**********
Trace: index_construction
    |_node_parsing ->  0.026223 seconds
      |_chunking ->  0.02075 seconds
    |_embedding ->  0.597961 seconds
**********





In [25]:
len(simple_ols)

50

In [26]:
object_arrays

[Document(id_='7caf7ec5-3892-4b9c-9195-a5b0a1795ee4', embedding=None, metadata={'columns': "['x1', 'x2', 'x3', 'y']", 'schema': 'None', 'shape': '(50, 4)'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='97.98219999874924, 99.84941752810117, 100.9727776594234, 360.87650920565545\n101.00077953260389, 99.87874921228179, 99.35642250227457, 361.50488035486944\n98.5109626677227, 100.7485502397903, 99.46465098250788, 359.8117609861218\n100.77335929310553, 100.03722922045552, 99.86657209922947, 362.2336960397953\n100.97359840386007, 99.1724799721807, 100.16093297144785, 362.1391160315852\n100.18799255929102, 100.55900119891184, 100.61532849440285, 363.29752977180965\n100.9157547652626, 98.61649241995889, 99.06726035297895, 359.7975894964005\n101.04615952660859, 102.00920930524853, 100.16419028246959, 364.8003752715575\n99.46321248760913, 100.23898461781165, 100.4603474082993, 361.9810830871964\n101.01997365057879, 100.70311893925478, 100.35193718659701

In [27]:
# create a query engine
query_engine = index.as_query_engine()

In [28]:
response = query_engine.query("How many values with in the dataset?")
print_resp(response.response)

> Top 1 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score: 0.23697712076054578] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score: 0.23697712076054578] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> Top 2 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score:             0.236977] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score:             0.236977] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
load_ssl_context verify=True cert=None trust_env=True http2=False
load_verify_locations cafile='/Users/yuwang/Developments/python/llamaindex_object_array_reader/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
conn

2024-02-13 02:06:11,815 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
✅ RESPONSE:
************************************************************

 The provided context consists of 50 rows, each
containing 4 columns. Therefore, there are a total of
50 rows multiplied by 4 columns, resulting in a total
of 200 values in the dataset.

************************************************************
 🚩END OF RESPONSE


In [29]:
response = query_engine.query("How many values with in the dataset?")
print_resp(response.response)

> Top 1 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score: 0.23697712076054578] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score: 0.23697712076054578] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> Top 2 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score:             0.236977] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score:             0.236977] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
load_ssl_context verify=True cert=None trust_env=True http2=False
load_verify_locations cafile='/Users/yuwang/Developments/python/llamaindex_object_array_reader/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
conn

2024-02-13 02:06:13,655 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
✅ RESPONSE:
************************************************************

 The provided context consists of 13 rows, each row
having 4 columns. Therefore, the total number of values
in the dataset is 13 x 4 = 52.

************************************************************
 🚩END OF RESPONSE


In [30]:
df.shape

(50, 4)

In [29]:
response = query_engine.query("How many columns' name starts with 'x'?")
print_resp(response.response)

2024-02-08 15:13:24,979 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

 There are three columns' names that start with 'x',
which are 'x1', 'x2', and 'x3'.

************************************************************
 🚩END OF RESPONSE


In [24]:
response = query_engine.query("What is the average of column 'x1'?")
print_resp(response.response)

2024-02-08 15:21:03,508 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

 To find the average of column 'x1', we need to sum all
the values in column 'x1' and then divide by the total
number of rows (50). Here are all the values in column
'x1':  99.07093312420517, 100.84958819151115,
99.72656237514776, ... , 100.97373793175905  Adding all
the values gives us:  99.07093312420517 +
100.84958819151115 + ... + 100.97373793175905 = (sum of
all x1 values)  Now, we need to divide the sum by the
total number of rows, which is 50:  (sum of all x1
values) / 50 = average of column 'x1'  Without
calculating the exact sum, we can see that the average
value lies between 98.73 (lowest value) and 100.97
(highest value). However, without performing the actual
calculation, we cannot provide an exact numerical
answer for the average of column 'x1'.  Please note
that providing the exact average would require further
calculations that go

In [31]:
df.head()

Unnamed: 0,x1,x2,x3,y
0,97.9822,99.849418,100.972778,360.876509
1,101.00078,99.878749,99.356423,361.50488
2,98.510963,100.74855,99.464651,359.811761
3,100.773359,100.037229,99.866572,362.233696
4,100.973598,99.17248,100.160933,362.139116


In [32]:
print(df['x1'][:5])
print('Mean= ', df['x1'].mean())

0     97.982200
1    101.000780
2     98.510963
3    100.773359
4    100.973598
Name: x1, dtype: float64
Mean=  100.07520939069373


### Sub Question Query Engine

In [33]:
query_engine = index.as_query_engine()

In [34]:
response = query_engine.query(
    "What about the dataset?"
)
print_resp(response.response)

> Top 1 nodes:
> [Node 1d4d4017-e547-4063-8f27-98eca3937345] [Similarity score: 0.21449468596442905] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 7e1706e8-8321-409c-b1fd-72891a749953] [Similarity score: 0.21449468596442905] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> Top 2 nodes:
> [Node 1d4d4017-e547-4063-8f27-98eca3937345] [Similarity score:             0.214495] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
> [Node 7e1706e8-8321-409c-b1fd-72891a749953] [Similarity score:             0.214495] Welcome to Colosscious. 
We are the expert who spotlight-focus on providing the digital technolog...
load_ssl_context verify=True cert=None trust_env=True http2=False
load_verify_locations cafile='/Users/yuwang/Developments/python/llamaindex_object_array_reader/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
conn

2024-02-13 02:06:31,440 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
✅ RESPONSE:
************************************************************

 In the provided context, there is no mention of a
dataset. The text describes Colosscious as an expert in
digital technology solutions for bio and pharmaceutical
companies, focusing on improving drug development,
quality control, manufacturing processes, and cost
reduction through Big Data.

************************************************************
 🚩END OF RESPONSE


In [35]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [36]:

query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="summary_tool",
            description=f"Return the shape of the dataset and the basic summary of the dataset, such as mean, range, stddev of each columns.",
        ),
    ),
] 

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    verbose=True,
    use_async=True,
)

In [38]:
response = query_engine.query(
   "What about the dataset?"
)
print_resp(response.response )

load_ssl_context verify=True cert=None trust_env=True http2=False
load_verify_locations cafile='/Users/yuwang/Developments/python/llamaindex_object_array_reader/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
connect_tcp.started host='localhost' port=11434 local_address=None timeout=30.0 socket_options=None
connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x2d80d6260>
send_request_headers.started request=<Request [b'POST']>
send_request_headers.complete
send_request_body.started request=<Request [b'POST']>
send_request_body.complete
receive_response_headers.started request=<Request [b'POST']>
receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Tue, 13 Feb 2024 01:07:39 GMT'), (b'Content-Length', b'622')])


2024-02-13 02:07:39,756 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
Generated 2 sub questions.
[1;3;38;2;237;90;200m[summary_tool] Q: What is the shape of the dataset
[0m> Top 1 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score: 0.3079660874249297] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score: 0.3079660874249297] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> Top 2 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score:             0.307966] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score:      

2024-02-13 02:07:44,691 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
[1;3;38;2;237;90;200m[summary_tool] A:  The given data represents 50 observations, each with four features or columns. Therefore, the shape of the dataset can be described as having 50 rows and 4 columns.
[0m[1;3;38;2;90;149;237m[summary_tool] Q: What are the mean, range, and stddev of each column in the dataset
[0m> Top 1 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662007e4] [Similarity score: 0.27269000508346425] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> [Node ec5b09f1-bc2b-4225-99d8-4bed978ccabd] [Similarity score: 0.27269000508346425] 100.84958819151115, 361.7497627252443
99.07093312420517, 100.95979226903094, 99.95112469407546, 3...
> Top 2 nodes:
> [Node 021d374e-1c7e-4935-823f-e9ee662

2024-02-13 02:08:14,751 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
[1;3;38;2;90;149;237m[summary_tool] A:  To calculate the mean, range, and standard deviation for each column in the given dataset, we need to perform some numerical calculations using the values present in the dataset. Here's how we can find them:

1. Mean: For each column, sum up all the corresponding values and then divide by the total number of observations (number of rows).

2. Range: Find the difference between the maximum value and minimum value for a given column.

3. Standard deviation: First, calculate the variance (average of squared differences from the mean) for each column, then take the square root of the variance to find the standard deviation.

Let's compute these values for the columns in the dataset provided:

Column 'x1':
- Mean: Sum 

2024-02-13 02:08:23,226 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
receive_response_body.started request=<Request [b'POST']>
receive_response_body.complete
response_closed.started
response_closed.complete
close.started
close.complete
✅ RESPONSE:
************************************************************

 The given dataset consists of 50 observations with
each observation having four features or columns
labeled as 'x1', 'x2', 'x3', and 'y'. To calculate
statistical measures such as mean, range, and standard
deviation for each column, we need to perform numerical
calculations using the values present in the dataset.
These calculations involve finding the sum of all
values for a given column and dividing it by the total
number of observations (rows) to find the mean. The
range is found by subtracting the minimum value from
the maximum value for a given column, while standard
deviation can be computed using the variance and square
root. These calculations have been outlined in detail
