In [2]:
from fastapi import FastAPI

In [1]:
# Shell escape: print the GPU, driver and CUDA information visible to this kernel.
!nvidia-smi

Wed Sep  6 08:39:18 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:AF:00.0 Off |                    0 |
|  0%   34C    P0              70W / 300W |      4MiB / 46068MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
# The checkpoint is addressed by a filesystem path and the model is placed on the GPU.
embedding_model_path = "/root/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese"
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=embedding_model_path,
    model_kwargs={"device": "cuda"},
)

  from tqdm.autonotebook import trange
No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/GanymedeNil_text2vec-large-chinese. Creating a new one with MEAN pooling.
2023-09-06 09:45:53.150624: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Exploratory: list the attribute and method names available on the embeddings object.
dir(instructor_embeddings)

['Config',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_vars__',
 '__config__',
 '__custom_root_type__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__exclude_fields__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__include_fields__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__json_encoder__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_root_validators__',
 '__pre_root_validators__',
 '__pretty__',
 '__private_attributes__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_args__',
 '__repr_name__',
 '__repr_str__',
 '__rich_repr__',
 '__schema_cache__',
 '__setattr__',
 '__setstate__',
 '__signature__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__try_update_forward_refs__',
 '__validators__',
 '__weakref__',
 '_abc_impl',
 '_calculate_keys',
 '_copy_and_set_values',
 '_decompose

In [7]:
# Embed a single query string with the embedding model loaded earlier.
embedded_query = instructor_embeddings.embed_query("What was the name mentioned in the conversation?")


In [9]:
# Number of components in the query embedding.
len(embedded_query)

1024

In [3]:
# Embed a small batch of short sentences in a single call; one vector is returned per text.
sample_texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = instructor_embeddings.embed_documents(sample_texts)

In [5]:
# Number of components in the first document embedding.
len(embeddings[0])

1024

In [1]:
from fastapi import FastAPI
from pydantic import BaseModel, Field
from typing import Optional, List
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

# import sklearn

# FastAPI application object; the route handlers in this notebook are registered on it.
app = FastAPI()

# Hugging Face model id. Both loaders below use local_files_only=True, so the
# weights and tokenizer files must already be present in the local cache.
model_name = "FlagAlpha/Llama2-Chinese-7b-Chat"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
    local_files_only=True,
)

# Only *loading* options are passed here. Sampling options (temperature,
# do_sample, top_k, ...) are generation settings and are configured on the
# pipeline below, where they are forwarded to generation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    local_files_only=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# The model object is already instantiated (dtype and device placement were
# decided by the from_pretrained call above), so no torch_dtype / device_map
# is repeated here; a conflicting dtype on the pipeline would only mislead.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.2,  # must be a strictly positive float when sampling
    top_k=30,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Prompt (Chinese). English gloss: "Use the following context to answer the
# question at the end. If you don't know the answer, say that you don't know;
# do not try to make up an answer." followed by the context, "Question:" and
# "Answer:" slots.
template = """使用以下上下文来回答最后的问题。如果你不知道答案，就说你不知道，不要试图编造答案。

  {context}

  问题: {question}
  答案:"""

prompt_template = PromptTemplate.from_template(template)

# Chain that fills the template with `context` / `question` and sends it to the LLM.
chain = LLMChain(llm=llm, prompt=prompt_template)




@app.get("/")
async def root():
    """Handle GET / by returning a fixed JSON message."""
    payload = {"message": "this is get"}
    return payload

2023-09-06 14:53:51.179926: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [14]:
class Prompt(BaseModel):
    """Request body for POST /completion."""

    # Text substituted into the prompt template's {context} slot.
    context: str
    # Text substituted into the prompt template's {question} slot.
    question: str
    # Accepted in the payload but not read by the handler below: generation
    # always runs on the pipeline that was built when the notebook started.
    model: str = Field(default="FlagAlpha/Llama2-Chinese-7b-Chat")
    # Sampling temperature for this request. Must be strictly positive, so
    # non-positive values are rejected during request validation.
    temperature: float = Field(default=0.2, gt=0)


@app.post("/completion")
async def get_completion(input: Prompt):
    """Generate an answer for the given context and question.

    The prompt template is rendered with the request's ``context`` and
    ``question`` and sent to the text-generation pipeline. The request's
    ``temperature`` is passed as a generation argument for this call;
    supplying it as an extra chain input has no effect because the prompt
    template only consumes ``context`` and ``question``.

    Args:
        input: Validated request body.

    Returns:
        A dict with a single key ``"completion"`` holding the generated text
        (the prompt itself is not included).
    """
    prompt_text = prompt_template.format(
        context=input.context,
        question=input.question,
    )

    # NOTE: this call is synchronous, so the coroutine does not yield to the
    # event loop until generation has finished.
    generations = pipe(
        prompt_text,
        temperature=input.temperature,
        return_full_text=False,  # return only the newly generated continuation
    )

    return {
        "completion": generations[0]["generated_text"]
    }

In [12]:
import nest_asyncio

# Patch asyncio so that event-loop based code can be run from inside a
# Jupyter notebook cell.
nest_asyncio.apply()

In [13]:
import uvicorn
# Serve the FastAPI app from this cell; the call blocks until the server is stopped.
# Host and port are spelled out and match the address the server reports on startup.
uvicorn.run(app, host="127.0.0.1", port=8000)

INFO:     Started server process [26527]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:24260 - "POST /completion HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [26527]
