In [31]:
import os 
import importlib
from llama_index import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores import ChromaVectorStore
from llamaindex_object_array_reader.dataset import simple_ols # import a simple dataset 
from llama_index.llms import HuggingFaceLLM, AutoModelForMaskedLM
from llama_index.prompts import PromptTemplate
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from transformers import BitsAndBytesConfig
from llama_index.llms import Ollama
from llama_index import ServiceContext, set_global_tokenizer
# from langchain.embeddings import HuggingFaceEmbedding, HuggingFaceInstructEmbeddings
from llama_index.embeddings import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from argparse import Namespace
from chromadb import Collection, PersistentClient


In [10]:
# Load credentials without hard-coding any secret in the notebook.
# Preferred source: a local `my_cred.py` next to the notebook that defines
# OPENAI_API_KEY (other cells also read `my_cred.HF_TOKEN`).
if os.path.exists('my_cred.py'):
    my_cred = importlib.import_module('my_cred')
    os.environ['OPENAI_API_KEY'] = my_cred.OPENAI_API_KEY
else:
    # No credentials module: rely on whatever is already exported in the
    # process environment instead of writing a literal key into it.
    # `my_cred` is still bound so that later `my_cred.<NAME>` lookups do not
    # raise NameError on a fresh kernel; a value that is not set is None.
    my_cred = Namespace(
        OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY'),
        HF_TOKEN=os.environ.get('HF_TOKEN'),
    )

In [9]:
# Registry of Hugging Face checkpoint ids, exposed as attributes
# (e.g. `models.BLOOM_560M`).
_CHECKPOINT_IDS = {
    "BERT_BASE_CHINESE": "bert-base-chinese",
    "LLAMA2_CHINESE_7B_CHAT": "FlagAlpha/Llama2-Chinese-7b-Chat",
    "BLOOM_560M": "bigscience/bloom-560m",
}
models: Namespace = Namespace(**_CHECKPOINT_IDS)

In [11]:
# Checkpoint id shared by the tokenizer, LLM and embedding model set up below;
# switch models by picking another attribute of `models`.
check_point: str = models.BERT_BASE_CHINESE

In [34]:
# Name of the Chroma collection that holds the document embeddings.
COLLECTION_NAME: str = 'test'

# Open the Chroma store at `test.chromadb` and fetch the collection,
# creating it on first use.
chroma_client = PersistentClient(path='test.chromadb')
chroma_collection: Collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME
)

# Wrap the collection for LlamaIndex and expose it through a storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


In [24]:
# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(check_point)

# Guarantee a padding token before the tokenizer is handed to any model.
# Only act when one is missing: reusing EOS keeps the vocabulary size
# unchanged, whereas adding a brand-new token would require resizing the
# model's embedding matrix. '[PAD]' is added only as a last resort.
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# LlamaIndex's global tokenizer is a callable `str -> list of tokens`; pass
# the bound `encode` method explicitly rather than the tokenizer object.
set_global_tokenizer(tokenizer.encode)

# Hugging Face access token: taken from `my_cred` when that object exists in
# the notebook namespace, otherwise from the HF_TOKEN environment variable.
# The result may be None.
hf_token = (
    getattr(globals().get('my_cred'), 'HF_TOKEN', None)
    or os.environ.get('HF_TOKEN')
)

# --- LLM ---------------------------------------------------------------------
# Alternatively, using a local LLM
USE_LOCAL: bool = False
if USE_LOCAL:
    llm = Ollama(model="llama2-chinese")
else:
    is_bert = check_point == models.BERT_BASE_CHINESE

    model_kwargs = {
        'torch_dtype': torch.float16,
        'load_in_8bit': False,
        'offload_folder': "offload_folder",
        'offload_state_dict': True,
    }
    if is_bert:
        # BERT is loaded behind a causal-LM head here (the cell output reports
        # BertLMHeadModel), so it is flagged as a decoder. Other checkpoints
        # keep their own config instead of receiving `is_decoder=None`.
        model_kwargs['is_decoder'] = True

    llm = HuggingFaceLLM(
        model_name=check_point,
        tokenizer_name=check_point,
        context_window=512,
        model_kwargs=model_kwargs,
        tokenizer_kwargs={"token": hf_token},
        # NOTE(review): "mps" is hard-coded for the BERT checkpoint; this
        # presumably targets Apple-silicon hosts - confirm before running
        # elsewhere.
        device_map="mps" if is_bert else "auto",
    )

# --- Embedding model ---------------------------------------------------------
embedding_model = HuggingFaceEmbedding(
    model_name=check_point,
    tokenizer=tokenizer,
    cache_folder="cache_folder",
    embed_batch_size=64
)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0

In [28]:
# Dump the embedding model's settings as a plain dict for inspection.
embedding_config = embedding_model.dict()
print(embedding_config)

{'model_name': 'bert-base-chinese', 'embed_batch_size': 64, 'tokenizer_name': 'bert-base-chinese', 'max_length': 512, 'pooling': <Pooling.CLS: 'cls'>, 'normalize': True, 'query_instruction': None, 'text_instruction': None, 'cache_folder': None, 'class_name': 'HuggingFaceEmbedding'}


In [35]:
# Bundle the LLM and embedding model used for indexing and querying.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embedding_model,
)

# Source documents are always loaded so `documents` stays available.
documents = SimpleDirectoryReader("test_docs/simple_txt").load_data()

# Make the cell idempotent: `from_documents` inserts into the Chroma
# collection on every run, so re-executing it would duplicate every chunk.
# If the collection already holds entries, attach to it instead.
# To force a rebuild (e.g. after editing the documents or changing the
# embedding checkpoint), empty or delete the collection first.
if chroma_collection.count() > 0:
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        service_context=service_context,
    )
else:
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
        storage_context=storage_context,
    )

In [None]:
# Build a query engine on top of the index and ask it a question; the
# response object is the cell's displayed value.
query_engine = index.as_query_engine()
response = query_engine.query("嵙思是")
response

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Stand-alone generation check that calls transformers directly, without
# LlamaIndex. Note that this cell rebinds the notebook-level `tokenizer`.
LLAMA2_CHECKPOINT = "FlagAlpha/Llama2-Chinese-7b-Chat"

model = AutoModelForCausalLM.from_pretrained(
    LLAMA2_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True,
    offload_folder="offload_folder",
    offload_state_dict=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(LLAMA2_CHECKPOINT, use_fast=False)
# Reuse EOS as the padding token so `pad_token_id` below is defined.
tokenizer.pad_token = tokenizer.eos_token

# The prompt already spells out its <s>/</s> markers, so the tokenizer is
# told not to add special tokens of its own.
# Inputs follow the model's device: with device_map="auto" the placement is
# decided at load time, so a hard-coded device name can mismatch it.
input_ids = tokenizer(
    ["<s>Human: 介绍一下中国\n</s><s>Assistant: "],
    return_tensors="pt",
    add_special_tokens=False,
).input_ids.to(model.device)

# Sampling configuration passed straight to `model.generate`.
generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.3,
    "repetition_penalty": 1.3,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
generate_ids = model.generate(**generate_input)

# Decode the single returned sequence (prompt included) and show it.
text = tokenizer.decode(generate_ids[0])
print(text)

Loading checkpoint shards: 100%|██████████| 2/2 [00:51<00:00, 25.92s/it]
generation_config.json: 100%|██████████| 197/197 [00:00<00:00, 186kB/s]


<s>Human: 介绍一下中国
</s><s>Assistant: 中华人民共和国是位于亚洲的大型单制社会主义国家，拥有超过14亿人口。历史上，中国文明已经存在了数千年之久，被誉为“世界古老文化遗产”。现代中国则以其高速发展、全面深入改革而知名，成功实行市场 economy模式，建设出多个重要城镇群地区，如北京、天津、广州等。此外，中国还非常注重科技创新与教育事业，并加强对海洋保护工作的力度。除此之外，中国也因其政治体系所处理不同问题引起关注，例如香港特别行政區及台湾问题。
</s>
