In [1]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

In [2]:
# Declare the two fields the model must return, then derive the parser and
# the formatting instructions that are injected into the prompt.
response_schemas = [
    ResponseSchema(
        name="answer",
        description="answer to the user's question",
    ),
    ResponseSchema(
        name="source",
        description="source used to answer the user's question, should be a website.",
    ),
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# `format_instructions` is bound up front as a partial variable, so only
# `question` has to be supplied when the prompt is formatted.
prompt = PromptTemplate(
    input_variables=["question"],
    partial_variables={"format_instructions": format_instructions},
    template="answer the users question as best as possible.\n{format_instructions}\n{question}",
)

In [3]:
# Completion-style OpenAI LLM used for the structured-parser example
# (temperature=0 is presumably chosen for repeatable output).
model = OpenAI(temperature=0)

In [4]:
# Render the prompt to a plain string, send it to the completion model, and
# leave `output` as the last expression so the notebook displays its repr.
_input = prompt.format_prompt(question="what's the capital of france?")
output = model(_input.to_string())
output

'\n\n```json\n{\n\t"answer": "Paris",\n\t"source": "https://www.worldatlas.com/articles/what-is-the-capital-of-france.html"\n}\n```'

In [5]:
# The completion model returns a plain string; print it to see the fenced JSON it contains.
print(type(output))
print(output)

<class 'str'>


```json
{
	"answer": "Paris",
	"source": "https://www.worldatlas.com/articles/what-is-the-capital-of-france.html"
}
```


In [6]:
# Parse the raw completion string into a dict keyed by the response schema names.
o = output_parser.parse(output)
print(type(o))
print(o)

<class 'dict'>
{'answer': 'Paris', 'source': 'https://www.worldatlas.com/articles/what-is-the-capital-of-france.html'}


In [7]:
# Chat-style counterpart of `model`, used to repeat the same example with chat messages.
chat_model = ChatOpenAI(temperature=0)

In [8]:
# Same instructions as the completion prompt, delivered as a single human chat message.
# `format_instructions` is pre-bound, so only `question` is supplied at format time.
prompt = ChatPromptTemplate(
    input_variables=["question"],
    partial_variables={"format_instructions": format_instructions},
    messages=[
        HumanMessagePromptTemplate.from_template(
            "answer the users question as best as possible.\n{format_instructions}\n{question}"
        ),
    ],
)

In [9]:
# Render the chat prompt as a list of messages and send it to the chat model.
_input = prompt.format_prompt(question="what's the capital of france?")
output = chat_model(_input.to_messages())


In [10]:
# The chat model returns a message object rather than a plain string.
print(type(output))
print(output)

<class 'langchain.schema.messages.AIMessage'>
content='```json\n{\n\t"answer": "The capital of France is Paris.",\n\t"source": "https://en.wikipedia.org/wiki/Paris"\n}\n```' additional_kwargs={} example=False


In [11]:
# The parser works on text, so pass the message's `content` string.
output_parser.parse(output.content)

{'answer': 'The capital of France is Paris.',
 'source': 'https://en.wikipedia.org/wiki/Paris'}

In [12]:
from langchain.prompts import (
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

In [13]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List

In [14]:
# Completion model for the Pydantic parser examples; this rebinds the `model`
# created earlier in the notebook.
model_name = "text-davinci-003"
temperature = 0.0
model = OpenAI(model_name=model_name, temperature=temperature)

In [15]:
# Define your desired data structure.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema, which
# would change the format instructions sent to the model.
class Joke(BaseModel):
    # Field descriptions are emitted into the JSON schema shown to the model.
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: the ``setup`` value being validated.

        Returns:
            The unchanged ``setup`` value.

        Raises:
            ValueError: if ``field`` is empty or does not end with "?".
        """
        # endswith() instead of field[-1]: an empty string must fail validation
        # with ValueError rather than escape as an IndexError.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field


# And a query intended to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

# Set up a parser + inject its instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Joke)
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="Answer the user query.\n{format_instructions}\n{query}\n",
)

# Render the prompt, run the completion model, and parse the reply into a Joke;
# the parsed object is the cell's displayed result.
_input = prompt.format_prompt(query=joke_query)
output = model(_input.to_string())
parser.parse(output)

Joke(setup='Why did the chicken cross the road?', punchline='To get to the other side!')

In [16]:
# Here's another example, but with a compound typed field.
# The field descriptions below are passed to Field(description=...) and appear
# in the JSON schema printed with the rendered prompt.
class Actor(BaseModel):
    # Single string field.
    name: str = Field(description="name of an actor")
    # Compound field: a list of strings.
    film_names: List[str] = Field(description="list of names of films they starred in")


actor_query = "Generate the filmography for a random actor."

# Rebind `parser` and `prompt` for the Actor schema; the template text is the
# same as in the Joke example, only the injected format instructions differ.
parser = PydanticOutputParser(pydantic_object=Actor)
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="Answer the user query.\n{format_instructions}\n{query}\n",
)

# Render the prompt and fetch the raw completion; parsing happens in a later cell.
_input = prompt.format_prompt(query=actor_query)
output = model(_input.to_string())


In [17]:
# Show the fully rendered prompt, including the injected JSON-schema instructions.
print(_input.to_string())

Answer the user query.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"name": {"title": "Name", "description": "name of an actor", "type": "string"}, "film_names": {"title": "Film Names", "description": "list of names of films they starred in", "type": "array", "items": {"type": "string"}}}, "required": ["name", "film_names"]}
```
Generate the filmography for a random actor.



In [18]:
# Inspect the raw completion: a plain string holding a JSON object.
print(type(output))
print(output)

<class 'str'>

{"name": "Tom Hanks", "film_names": ["Forrest Gump", "Saving Private Ryan", "The Green Mile", "Cast Away", "Toy Story"]}


In [19]:
# Parse the raw JSON text into an Actor instance and show its type and fields.
o = parser.parse(output)
print(type(o))
print(o)


<class '__main__.Actor'>
name='Tom Hanks' film_names=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Cast Away', 'Toy Story']


# Llama2

In [20]:
import pathlib

# Candidate GGML model files; exactly one assignment is left uncommented to
# select the model. The file name matters beyond loading: a later cell checks
# it for "70b" to choose n_gqa.
# llama2_model_file = "../backend/data/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-13b-chat.ggmlv3.q4_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-13b-chat.ggmlv3.q5_K_M.bin"
llama2_model_file = "../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-70b-chat.ggmlv3.q5_K_M.bin"
# Displayed as the cell result: whether the selected file exists (the path is relative).
pathlib.Path(llama2_model_file).exists()




True

In [21]:
from pydantic import BaseModel, Field, validator
from app.llama2cpp.component.outputparser import PydanticOutputParserCustom


# Define your desired data structure.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema, which
# would change the format instructions sent to the model.
class Joke(BaseModel):
    # Field descriptions are emitted into the JSON schema shown to the model.
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: the ``setup`` value being validated.

        Returns:
            The unchanged ``setup`` value.

        Raises:
            ValueError: if ``field`` is empty or does not end with "?".
        """
        # endswith() instead of field[-1]: an empty string must fail validation
        # with ValueError rather than escape as an IndexError.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field


# Set up the project's custom parser; its format instructions are injected
# into the prompt template in the next cell.
parser = PydanticOutputParserCustom(pydantic_object=Joke)

# And a query intended to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

In [22]:
from langchain.prompts import PromptTemplate


# Wrap the instructions in Llama-2 chat markers: system text between
# <<SYS>> and <</SYS>>, the whole turn between [INST] and [/INST].
prompt = PromptTemplate(
    partial_variables={"format_instructions": parser.get_format_instructions()},
    input_variables=["query"],
    template="[INST]<<SYS>>Answer the user query.\n{format_instructions}<</SYS>>\n{query}[/INST]\n",
)

# Render the prompt for the joke query and show the resulting text.
_input = prompt.format_prompt(query=joke_query)
print(_input.text)


[INST]<<SYS>>Answer the user query.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example,

- for the schema
```
{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
```

- the well-formatted JSON object you shoud make to output
```
{"foo": ["bar", "baz"]}
```
which is a well-formatted instance of the schema.
The object ```{"properties": {"foo": ["bar", "baz"]}}``` is not well-formatted.

Here is the output schema:
```
{"title": "Joke", "type": "object", "properties": {"setup": {"title": "Setup", "description": "question to set up a joke", "type": "string"}, "punchline": {"title": "Punchline", "description": "answer to resolve the joke", "type": "string"}}, "required": ["setup", "punchline"]}
```

Then, You have to create JSON object with output schema, to answer the user
<</SYS>>
Tell me a joke.[/INST]



In [23]:
from llama_cpp import Llama

# n_gqa is set to 8 when the selected file name contains "70b" and to 1 otherwise.
n_gqa = 8 if "70b" in llama2_model_file else 1
# Load the model directly through llama-cpp-python (no LangChain wrapper in this section).
llm = Llama(model_path=llama2_model_file, n_gqa=n_gqa, n_ctx=4096, seed=-1, n_batch=512) # n_gpu_layers=16


ggml_init_cublas: found 2 CUDA devices:
  Device 0: NVIDIA GeForce GTX 1080 Ti, compute capability 6.1
  Device 1: NVIDIA GeForce GTX 1080 Ti, compute capability 6.1
llama.cpp: loading model from ../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 8192
llama_model_load_internal: n_mult     = 4096
llama_model_load_internal: n_head     = 64
llama_model_load_internal: n_head_kv  = 8
llama_model_load_internal: n_layer    = 80
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 8
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 28672
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 15 (mostly Q4_K - Medium)
llama_model_load_internal: model size = 70B
llama_m

In [24]:
# Generate a completion and parse it, retrying up to `n_retries` times.
# Each failure is printed; if no attempt succeeds the cell raises, so later
# cells never run against a stale `output`/`obj` left over from earlier cells.
n_retries = 1
last_error = None  # `e` is unbound once its except block ends, so keep a copy

# Named loop variable instead of `_`: assigning `_` at top level would shadow
# IPython's "last output" variable.
for attempt in range(n_retries):
    try:
        response = llm(
            prompt=_input.text,
            max_tokens=640,  # upper bound on generated tokens
            temperature=0.0,
            top_p=0.95,
            top_k=40,
        )
        output = response["choices"][0]["text"]
        obj = parser.parse(output)
        break
    except Exception as e:
        print("Error:", e)
        last_error = e
else:
    # Reached only when the loop finished without `break`, i.e. no attempt succeeded.
    raise RuntimeError(
        f"no parsable completion after {n_retries} attempt(s)"
    ) from last_error



llama_print_timings:        load time = 20381.98 ms
llama_print_timings:      sample time =    13.66 ms /    32 runs   (    0.43 ms per token,  2342.78 tokens per second)
llama_print_timings: prompt eval time = 20381.87 ms /   299 tokens (   68.17 ms per token,    14.67 tokens per second)
llama_print_timings:        eval time = 48346.86 ms /    31 runs   ( 1559.58 ms per token,     0.64 tokens per second)
llama_print_timings:       total time = 68795.24 ms


In [25]:
# Show the raw completion text produced by the model.
print(output)

{
"setup": "Why was the math book sad?",
"punchline": "Because it had too many problems."
}


In [26]:
# Parse the raw text into a Joke instance (displayed as the cell result).
parser.parse(output)

Joke(setup='Why was the math book sad?', punchline='Because it had too many problems.')

In [27]:
# Drop this Llama instance; the next section creates a new `llm` through LlamaCppCustom
# (presumably this releases the loaded model first — TODO confirm).
del llm

## LangChain Class as LlamaCppCustom


In [28]:
import pathlib

# Candidate GGML model files; exactly one assignment is left uncommented to
# select the model. The next cell checks the file name for "70b" to choose n_gqa.
# llama2_model_file = "../backend/data/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-13b-chat.ggmlv3.q4_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-13b-chat.ggmlv3.q5_K_M.bin"
llama2_model_file = "../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin"
# llama2_model_file = "../backend/data/llama-2-70b-chat.ggmlv3.q5_K_M.bin"
# Displayed as the cell result: whether the selected file exists (the path is relative).
pathlib.Path(llama2_model_file).exists()



True

In [29]:
from app.llama2cpp.component.llama2cpp import LlamaCppCustom


# n_gqa: 8 when the selected file name contains "70b", otherwise 1.
n_gqa = 8 if "70b" in llama2_model_file else 1

# Same model file as the raw llama-cpp section, now loaded through the
# project's LangChain-compatible wrapper.
llm = LlamaCppCustom(
    # model and context
    model_path=llama2_model_file,
    n_gqa=n_gqa,
    n_ctx=4096,
    n_batch=512,
    n_gpu_layers=None,  # 16
    # generation
    max_tokens=640,
    temperature=0,
    streaming=True,
    # logging
    verbose=True,
)


llama.cpp: loading model from ../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 8192
llama_model_load_internal: n_mult     = 4096
llama_model_load_internal: n_head     = 64
llama_model_load_internal: n_head_kv  = 8
llama_model_load_internal: n_layer    = 80
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 8
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 28672
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 15 (mostly Q4_K - Medium)
llama_model_load_internal: model size = 70B
llama_model_load_internal: ggml ctx size =    0.21 MB
llama_model_load_internal: using CUDA for GPU acceleration
ggml_cuda_set_main_device: using device 0 (NVIDIA GeForce GT

In [30]:
from pydantic import BaseModel, Field, validator
from app.llama2cpp.component.outputparser import PydanticOutputParserCustom


# Define your desired data structure.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema, which
# would change the format instructions sent to the model.
class Joke(BaseModel):
    # Field descriptions are emitted into the JSON schema shown to the model.
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: the ``setup`` value being validated.

        Returns:
            The unchanged ``setup`` value.

        Raises:
            ValueError: if ``field`` is empty or does not end with "?".
        """
        # endswith() instead of field[-1]: an empty string must fail validation
        # with ValueError rather than escape as an IndexError.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field


# Set up the project's custom parser; its format instructions are injected
# into the prompt template in the next cell.
parser = PydanticOutputParserCustom(pydantic_object=Joke)

# And a query intended to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

In [31]:
from langchain.prompts import PromptTemplate


# Wrap the instructions in Llama-2 chat markers: system text between
# <<SYS>> and <</SYS>>, the whole turn between [INST] and [/INST].
prompt = PromptTemplate(
    partial_variables={"format_instructions": parser.get_format_instructions()},
    input_variables=["query"],
    template="[INST]<<SYS>>Answer the user query.\n{format_instructions}<</SYS>>\n{query}[/INST]\n",
)

# Render the prompt for the joke query and show the resulting text.
_input = prompt.format_prompt(query=joke_query)
print(_input.text)



[INST]<<SYS>>Answer the user query.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example,

- for the schema
```
{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
```

- the well-formatted JSON object you shoud make to output
```
{"foo": ["bar", "baz"]}
```
which is a well-formatted instance of the schema.
The object ```{"properties": {"foo": ["bar", "baz"]}}``` is not well-formatted.

Here is the output schema:
```
{"title": "Joke", "type": "object", "properties": {"setup": {"title": "Setup", "description": "question to set up a joke", "type": "string"}, "punchline": {"title": "Punchline", "description": "answer to resolve the joke", "type": "string"}}, "required": ["setup", "punchline"]}
```

Then, You have to create JSON object with output schema, to answer the user
<</SYS>>
Tell me a joke.[/INST]



In [32]:
# Call the wrapper directly with the rendered prompt; the result is the completion text.
text = _input.to_string()
output = llm(text)



llama_print_timings:        load time = 20090.84 ms
llama_print_timings:      sample time =    13.82 ms /    32 runs   (    0.43 ms per token,  2316.16 tokens per second)
llama_print_timings: prompt eval time = 20090.69 ms /   299 tokens (   67.19 ms per token,    14.88 tokens per second)
llama_print_timings:        eval time = 48350.62 ms /    31 runs   ( 1559.70 ms per token,     0.64 tokens per second)
llama_print_timings:       total time = 68510.09 ms


In [33]:
# Show the raw completion text returned by the wrapper.
print(output)

{
"setup": "Why was the math book sad?",
"punchline": "Because it had too many problems."
}


In [34]:
# Parse the raw text into a Joke instance (displayed as the cell result).
parser.parse(output)


Joke(setup='Why was the math book sad?', punchline='Because it had too many problems.')

In [35]:
# NOTE: streaming mode
# Reuse the already-formatted prompt and consume the completion chunk by chunk,
# echoing each piece as it arrives and collecting the pieces in `tokens`.
_prompt = _input.to_string()
tokens = []

for token in llm.stream(prompt=_prompt):
    # Each streamed chunk carries its text under choices[0]["text"].
    tkn = token["choices"][0]["text"]
    print(tkn, end="")  # single argument, so no separator is involved
    tokens.append(tkn)


Llama.generate: prefix-match hit


{
"setup": "Why was the math book sad?",
"punchline": "Because it had too many problems."
}


llama_print_timings:        load time = 20090.84 ms
llama_print_timings:      sample time =    13.75 ms /    32 runs   (    0.43 ms per token,  2327.44 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 50358.08 ms /    32 runs   ( 1573.69 ms per token,     0.64 tokens per second)
llama_print_timings:       total time = 50430.47 ms


In [36]:
# Reassemble the streamed pieces into the full completion text.
output = "".join(tokens)

In [37]:
# Parse the joined stream into a Joke instance (displayed as the cell result).
parser.parse(output)

Joke(setup='Why was the math book sad?', punchline='Because it had too many problems.')