In [18]:

# Load the Llama-3-8B-Instruct model (4-bit quantized) and its tokenizer
import transformers
import sys
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import torch

# Hugging Face Hub identifier of the checkpoint to load.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Device placement strategy handed to `device_map` below.
# (A single explicit device would be f'cuda:{cuda.current_device()}' or 'cpu'.)
device = "auto"

# 4-bit NF4 quantization with double quantization, computing in bfloat16,
# so the model loads with less GPU memory. Requires the `bitsandbytes` library.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Time the whole config + weights + tokenizer load.
time_start = time()

model_config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=True)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map=device,
    trust_remote_code=True,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

def ask_model(messages):
    """Generate a sampled chat completion for ``messages``.

    Parameters:
        messages: Chat messages passed to the tokenizer's chat template
            (the notebook uses a list of ``{"role", "content"}`` dicts).

    Returns:
        str: The decoded text of the newly generated tokens only, with
        special tokens removed.
    """
    # Generation stops at either the tokenizer's EOS token or Llama-3's
    # end-of-turn marker.
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    prompt_ids = prompt_ids.to(model.device)

    sampling_options = dict(
        max_new_tokens=256,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    generated = model.generate(prompt_ids, **sampling_options)

    # The first sequence contains prompt + completion; keep what follows the prompt.
    prompt_length = prompt_ids.shape[-1]
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
                            
                   
# Sample conversation: a system persona followed by a single user request.
messages = [
    dict(role="system", content="You are a robot."),
    dict(role="user", content="Write a song for Romania"),
]


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Prepare model, tokenizer: 9.809 sec.


In [30]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from typing import List, Optional
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Schema for a single person extracted from free text.
# NOTE(review): the class docstring and the Field descriptions are presumably
# emitted into the schema/format instructions that the output parser puts in
# the prompt — treat them as prompt text and edit with care.
class Person(BaseModel):
    """Information about a person."""

    # Required field (`...` means no default).
    name: str = Field(..., description="The name of the person")
    # Required field; the description asks for the value in meters.
    height_in_meters: float = Field(
        ..., description="The height of the person expressed in meters."
    )


# Top-level schema handed to PydanticOutputParser: a wrapper object holding
# every Person found in a text.
# NOTE(review): the docstring is presumably included in the generated schema
# sent to the model — confirm before rewording it.
class People(BaseModel):
    """Identifying information about all people in a text."""

    # One entry per person; each element is validated as a Person.
    people: List[Person]
    
    
# Output parser bound to the People schema.
parser = PydanticOutputParser(pydantic_object=People)

# Natural-language format instructions produced by the parser; these are
# injected into the system prompt below.
parser_format_instr = parser.get_format_instructions()

# Chat skeleton. The `{format_instructions}` and `{query}` placeholders are
# left in place for the LangChain prompt template to fill in later.
messages = [
    dict(
        role="system",
        content="Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
    ),
    dict(role="user", content="{query}"),
]

# Render the skeleton through the tokenizer's chat template as plain text
# (tokenize=False), ending with the generation prompt.
input_txt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# LangChain prompt template with the format instructions pre-filled, so only
# `query` remains to be supplied at call time.
qa_prompt_template = PromptTemplate(
    input_variables=["query", "format_instructions"],
    template=input_txt,
).partial(format_instructions=parser_format_instr)

# Text-generation pipeline over the already-loaded model, with sampling
# disabled, wrapped as a LangChain LLM.
qa_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    temperature=0.0,
)
hf = HuggingFacePipeline(pipeline=qa_pipe)

# A function to respond on a query based on the prompt and model 
def ask_question_v2(query):
    """Answer ``query`` with the QA prompt and return only the model's reply.

    The query is inserted into ``qa_prompt_template`` and the result is run
    through the ``hf`` pipeline wrapper. The reply is returned as raw text;
    it is not passed through the Pydantic parser here.

    Parameters:
        query (str): The user text to extract information from.

    Returns:
        str: The generated text with the prompt removed.
    """
    qa_formatted_question = qa_prompt_template.format_prompt(query=query).to_string()
    chain = qa_prompt_template | hf.bind(skip_prompt=True) #| parser
    raw_res = chain.invoke({'query': query})
    # Depending on the LangChain version, `skip_prompt=True` may or may not
    # already have removed the prompt. Strip it only when it is actually
    # present, so the beginning of the answer is never cut off by slicing twice.
    if raw_res.startswith(qa_formatted_question):
        return raw_res[len(qa_formatted_question):]
    return raw_res


# Run the extraction on a one-sentence example and show the model's raw reply.
query = "Ciprian is 37 and he is 1.8 meters tall"
answer = ask_question_v2(query=query)
print(answer)



# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here is the output in JSON format:

```
{
  "people": [
    {
      "name": "Ciprian",
      "height_in_meters": 1.8
    }
  ]
}
```

This output conforms to the given JSON schema.


In [39]:
# Validate the raw reply against the People schema and show the parsed object.
print(
    parser.parse(answer)
)

from langchain_core.messages import AIMessage
import re 
import json

# Custom parser
def extract_json(message: AIMessage) -> List[dict]:
    """Extract and parse JSON embedded in fenced code blocks of a message.

    Blocks fenced as ```json ... ``` are preferred. If the message contains
    none, bare ``` ... ``` fences (no language tag) are used instead, because
    the model does not always add the `json` tag it is asked for.

    Parameters:
        message: Object whose ``content`` attribute holds the text to scan.

    Returns:
        list: One parsed JSON value per matched block, in order of
        appearance; an empty list when no fenced block is found.

    Raises:
        ValueError: If any matched block is not valid JSON.
    """
    text = message.content

    # First choice: explicitly tagged JSON blocks (DOTALL lets `.` span lines).
    matches = re.findall(r"```json(.*?)```", text, re.DOTALL)
    if not matches:
        # Fallback: untagged fences. The negative lookahead skips fences that
        # open with a language tag such as ```python.
        matches = re.findall(r"```(?![A-Za-z])(.*?)```", text, re.DOTALL)

    # Parse each block, ignoring surrounding whitespace/newlines.
    try:
        return [json.loads(match.strip()) for match in matches]
    except Exception as exc:
        # Chain the original error so the JSON decoding problem stays visible.
        raise ValueError(f"Failed to parse: {message}") from exc

people=[Person(name='Ciprian', height_in_meters=1.8)]


In [40]:
# Wrap the raw reply in an AIMessage so the custom extractor can read `.content`,
# then display the extracted JSON as the cell result.
answer_ai = AIMessage(content=answer)
extract_json(message=answer_ai)