In [1]:
import os 
import importlib
import textwrap
from llama_index import VectorStoreIndex, SimpleDirectoryReader, StorageContext, get_response_synthesizer, PromptHelper
from llama_index.text_splitter import SentenceSplitter
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import ChromaVectorStore
from llamaindex_object_array_reader.dataset import simple_ols # import a simple dataset 
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from llama_index.indices.query.schema import QueryBundle
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import BitsAndBytesConfig
from llama_index.llms import Ollama
from llama_index import ServiceContext, set_global_tokenizer
# from langchain.embeddings import HuggingFaceEmbedding, HuggingFaceInstructEmbeddings
from llama_index.embeddings import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace
from chromadb import Collection, PersistentClient
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import logging
import sys
from llamaindex_object_array_reader._logging import logger

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
log = logger

In [3]:
# Obsolete
# if os.path.exists('my_cred.py'):
#     my_cred = importlib.import_module('my_cred')
#     os.environ['OPENAI_API_KEY'] = my_cred.OPENAI_API_KEY
# else:
#     # Set your OPENAI API Key
#     os.environ['OPENAI_API_KEY'] = "vy-...cH5N"

load_dotenv()
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
HF_TOKEN = os.environ['HF_TOKEN']

In [25]:
def print_resp(msg, max_len:int=55):
    """将文本分割为每行最大长度的子字符串
    """
    divider: str = '\n'+ '*'*60+'\n'
    msg = textwrap.fill(msg, width=max_len)
    print(f"""\u2705 RESPONSE:{divider}\n{msg}\n{divider} \U0001F6A9END OF RESPONSE""")

In [4]:
models:Namespace = Namespace(
    BERT_BASE_CHINESE="bert-base-chinese",
    LLAMA2_CHINESE_7B_CHAT="FlagAlpha/Llama2-Chinese-7b-Chat", #18G needed
    LLAMA2_7B_CHAT_HF="meta-llama/Llama-2-7b-chat-hf", #18G needed
    BLOOM_560M="bigscience/bloom-560m", #18G needed
    BLOOMZ_560M="bigscience/bloomz-560m", #18G needed
    GPT2="GPT2", #18G needed
    ALL_MPNET_BASE_V2="sentence-transformers/all-mpnet-base-v2", #18G needed
    MISTRAL_7B_INSTRUCT_V0_1="mistralai/Mistral-7B-Instruct-v0.1", #18G needed
    STARLING_LM_7B="berkeley-nest/Starling-LM-7B-alpha",
)

In [5]:
# Set the check point
check_point:str = models.ALL_MPNET_BASE_V2

In [6]:
tokenizer = AutoTokenizer.from_pretrained(check_point)
set_global_tokenizer(tokenizer)

# Alternatively, using a local LLM
USE_LOCAL:bool = True
if USE_LOCAL:
    # llm = Ollama(model="llama2-chinese")
    llm = Ollama(model="starling-lm:7b-alpha-q3_K_M")
    
else: 
    llm = HuggingFaceLLM(
        model_name=check_point,
        tokenizer_name=check_point,
        context_window=512,
        model_kwargs={
            # 'torch_dtype':torch.float16,
            "token": HF_TOKEN,
            'load_in_8bit':False, #No, the bitsandbytes library only works on CUDA GPU. So it must set to 'False' as running on mac os. 
            'offload_folder':"offload_folder",
            'offload_state_dict':True,
            'is_decoder': True if check_point==models.BERT_BASE_CHINESE else None,
            },
        tokenizer_kwargs={
            "token": HF_TOKEN,
            "return_tensors":'pt',},
        device_map="auto" if check_point!=models.BERT_BASE_CHINESE else "mps", 
    )


In [7]:
embedding_model = HuggingFaceEmbedding(
    model_name=check_point,
    tokenizer=tokenizer,
    cache_folder="cache_folder",
    max_length=512,
    device="mps"
)

In [8]:
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
prompt_helper = PromptHelper(
    context_window=512,
    num_output=256,
    chunk_overlap_ratio=0.1,
    chunk_size_limit=None,
)

In [9]:
documents = SimpleDirectoryReader("test_docs/simple_txt_short_en").load_data()

In [12]:
# Assuming documents have already been loaded
# Initialize the parser
parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)
# Parse documents into nodes
nodes = parser.get_nodes_from_documents(documents)
print('Total nodes:', len(nodes))
for _, n in enumerate(nodes):
    print(n)
    print('---')

Total nodes: 3
Node ID: 6936a8ce-44b0-4526-a435-a669df596f5e
Text: You can do data integration, management, analysis and composing
reports and dashboards with Pharmquer, and then automatize all your
works.
---
Node ID: 56191396-0eeb-4595-8a6f-2d5e5abaf2be
Text: Colosscious' flagship product, Pharmquer, is an enterprise level
software of manufacturing and business intelligence, which is
architected especially for the industry.
---
Node ID: 2b7c00dc-f7bf-4734-b4bd-e9c05f2d47bc
Text: Welcome to Colosscious.  We are the expert who spotlight-focus
on providing the digital technology to bio and pharmaceutical
companies, engaging in boosting the performances of new drug
developments, quality control, manufacturing processes, and reducing
the costs and duration by Big Data.
---


In [13]:
V_DB_NAME = "chromadb"
chroma_client = PersistentClient(V_DB_NAME)
COLLECTION_NAME:str = 'test'
chroma_collection:Collection = chroma_client.get_or_create_collection(COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


2024-02-07 22:44:21,753 - chromadb.telemetry.product.posthog - [32;20mINFO[0m - (posthog.py:20) - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information. 


Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [14]:
for n in nodes:
    print(storage_context.docstore.document_exists(n.id_))

False
False
False


## Create and store new embeddings to ChromaDB. 

In [None]:
storage_context.docstore.add_documents(nodes)

service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,
    prompt_helper=prompt_helper,)
# index = VectorStoreIndex.from_documents(
#     documents, service_context=service_context, storage_context=storage_context, show_progress=True,
# )
index = VectorStoreIndex(
    nodes, service_context=service_context, storage_context=storage_context, show_progress=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00,  5.29it/s]


Add of existing embedding ID: dc0f865e-90c8-42b0-9239-19625ebcef35




Add of existing embedding ID: 1f7abdb8-4dbb-4f9d-9398-f59fb630b862




Add of existing embedding ID: cb553733-838a-421b-89bf-c582fe90182a


In [None]:
# example: 
# "GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant: {response}<|end_of_turn|>GPT4 Correct User: {follow_up_question}<|end_of_turn|>GPT4 Correct Assistant:"
# ref: https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
sep = '<|end_of_turn|>'
resp_prompt_temp = "GPT4 Correct Assistant: "

In [None]:
query_engine = index.as_query_engine()

In [None]:
tokenizer(
    ["What Colosscious do?"],
    return_tensors="pt",
    add_special_tokens=False,
).input_ids.to("mps")

tensor([[ 2058,  8906, 15098, 18440,  2083,  1033]], device='mps:0')

In [None]:
query_resp = query_engine.query("What is flagship product of Colosscious")

print_resp(query_resp.response)

2024-02-07 19:52:25,347 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

Sorry, I cannot answer your query without using any
more tools.

************************************************************
 🚩END OF RESPONSE


In [None]:
query_engine = index.as_chat_engine()
query_resp = query_engine.query("What is Pharmquer?")
print_resp(query_resp.response)

2024-02-07 19:52:36,318 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

PharmQuer is an international pharmacovigilance
electronic system used in more than 80 countries for
the collection and analysis of spontaneous case reports
(adverse reactions to drugs). It is a free, web-based
platform that allows users to report, review and
analyze cases. The primary purpose of PharmQuer is to
facilitate data sharing between regulatory agencies,
pharmaceutical companies, academia, and other
stakeholders in the field of pharmacovigilance.

************************************************************
 🚩END OF RESPONSE


## Load existing embeddings in ChromaDB.

In [22]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model, text_splitter=text_splitter,
    prompt_helper=prompt_helper,)
# load your index from stored vectors
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context, service_context=service_context
)


In [29]:
# create a query engine
query_engine = index.as_query_engine()

In [31]:
response = query_engine.query("What is Colosscious?")
print_resp(response.response)

2024-02-07 23:46:23,990 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


2024-02-07 23:46:27,636 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


2024-02-07 23:46:30,540 - httpx - [32;20mINFO[0m - (_client.py:1027) - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK" 


HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
✅ RESPONSE:
************************************************************

 Colosconscious is an organization that focuses on
providing digital solutions for bio and pharmaceutical
companies. Their aim is to enhance new drug
development, maintain quality control, streamline
manufacturing processes, and reduce costs in the
biotechnology and pharmaceutical sectors by utilizing
advanced technologies like Big Data analytics.

************************************************************
 🚩END OF RESPONSE
