 # Llama 2 on AWS
 # Repository : https://github.com/tsaol/llama2-on-aws
 # Author : Cao Liu
 # Date : July 2023

***
- `meta-textgeneration-llama-2-7b`
- `meta-textgeneration-llama-2-13b`
- `meta-textgeneration-llama-2-70b`
- `meta-textgeneration-llama-2-7b-f`
- `meta-textgeneration-llama-2-13b-f`
- `meta-textgeneration-llama-2-70b-f`
***

In [None]:
# JumpStart model to use, and which version of it.
# NOTE(review): "*" is presumably a wildcard for the latest version — confirm against the SDK.
model_id = "meta-textgeneration-llama-2-7b-f"
model_version = "*"

## Deploy model


In [None]:
# Deploy the model to a SageMaker endpoint

from sagemaker.jumpstart.model import JumpStartModel

# Instance type that will host the endpoint.
instance_type = 'ml.g5.2xlarge'
# Pass the version selected alongside `model_id` so that pin is actually
# honoured instead of silently falling back to the SDK default.
model = JumpStartModel(model_id=model_id, model_version=model_version)
# Creates the endpoint and returns a predictor bound to it.
predictor = model.deploy(instance_type=instance_type)

In [None]:
import sagemaker

# Name of an existing endpoint to attach to — replace with your own.
endpoint_name = "meta-textgeneration-llama-2-7b-f-2023-08-02-07-44-13-153"

# Build a predictor for that endpoint which exchanges JSON in both directions.
sess = sagemaker.Session()
json_serializer = sagemaker.serializers.JSONSerializer()  # request body
json_deserializer = sagemaker.deserializers.JSONDeserializer()  # response body
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

print(predictor)

In [None]:
# Test case: send a sample chat request to the endpoint

In [None]:
def print_dialog(payload, response):
    """Print one request dialog followed by the model's reply.

    Args:
        payload: Request dict; only the first dialog in ``payload["inputs"]``
            is printed. Each message is a dict with "role" and "content".
        response: Endpoint response; the reply is read from
            ``response[0]["generation"]`` ("role" and "content" keys).
    """
    for turn in payload["inputs"][0]:
        speaker = turn['role'].capitalize()
        print(f"{speaker}: {turn['content']}\n")

    reply = response[0]['generation']
    print(f"> {reply['role'].capitalize()}: {reply['content']}")
    print("\n==================================\n")

In [None]:
%%time

# One dialog holding a single user turn, plus the generation settings.
payload = {
    "inputs": [[{"role": "user", "content": "what is the recipe of mayonnaise?"}]],
    "parameters": {"max_new_tokens": 512, "top_p": 0.9, "temperature": 0.6},
}

# Any failure (request or printing) is reported as text rather than a traceback.
try:
    response = predictor.predict(
        payload,
        custom_attributes="accept_eula=true",
    )
    print_dialog(payload, response)
except Exception as err:
    print(err)

In [None]:
!pip install gradio  --upgrade

In [None]:
pip install typing-extensions --upgrade

In [None]:
# Generation settings for the LLM (sampling temperature, nucleus cutoff, reply length cap)
parameters = {"temperature": 0.7, "top_p": 0.9, "max_new_tokens": 256}

In [None]:
## Gradio chat UI that talks to the deployed endpoint
import gradio as gr

def history_to_dialog_format(chat_history: list[tuple[str, str]]) -> list[dict[str, str]]:
    """Flatten chatbot history into a list of role/content messages.

    Args:
        chat_history: Past exchanges, oldest first. Each exchange is a
            sequence whose even positions are user messages and odd
            positions are assistant messages (normally a
            ``(user_message, assistant_message)`` pair). May be empty.

    Returns:
        One ``{"role": ..., "content": ...}`` dict per message, in
        conversation order. An empty history yields an empty list.
    """
    dialog = []
    # Walk every exchange, not only the first one, so the whole
    # conversation is carried into the next request.
    for exchange in chat_history:
        for idx, message in enumerate(exchange):
            role = "user" if idx % 2 == 0 else "assistant"
            dialog.append({"role": role, "content": message})
    return dialog

# Chat UI: a chatbot pane above a textbox and a submit button.
with gr.Blocks() as demo:
    gr.Markdown("## Llama2 assistant")
    with gr.Column():
        chatbot = gr.Chatbot()
        with gr.Row():
            with gr.Column():
                message = gr.Textbox(label="Chat Message Box", placeholder="Chat Message Box", show_label=False)
            with gr.Column():
                with gr.Row():
                    submit = gr.Button("Submit")

    def respond(message, chat_history):
        """Send the conversation plus the new message to the endpoint.

        Args:
            message: Text the user just submitted.
            chat_history: Current chatbot history; the new
                ``(message, reply)`` pair is appended to it in place.

        Returns:
            A tuple of ``""`` (new textbox value) and the updated history,
            matching the ``[message, chatbot]`` outputs wired up below.
        """
        dialog = history_to_dialog_format(chat_history)
        dialog.append({"role": "user", "content": message})
        # send request to endpoint
        llm_response = predictor.predict(
            {"inputs": [dialog], "parameters": parameters},
            custom_attributes="accept_eula=true",
        )
        # Only one dialog is sent, so the single result is read at index 0
        # for both the debug print and the parsed reply.
        result = llm_response[0]
        print(result)
        parsed_response = result['generation']['content']
        chat_history.append((message, parsed_response))
        return "", chat_history

    submit.click(respond, [message, chatbot], [message, chatbot], queue=False)

# NOTE(review): per the Gradio docs, share=True requests a public share link,
# so anyone with the URL can send requests to the endpoint — confirm this
# exposure is intended.
demo.launch(share=True)

## Clean up the endpoint

In [None]:
# Tear down the resources behind `predictor`: first the model, then the endpoint.
# NOTE(review): keep this order — presumably the SDK looks the model up through
# the endpoint's configuration, which is gone once the endpoint is deleted; confirm.
predictor.delete_model()
predictor.delete_endpoint()