In [1]:
import os
import io
import sys
import json
import requests

import google.generativeai
from IPython.display import Markdown, display, update_display
import gradio as gr
import subprocess

In [2]:
pip install -U bitsandbytes



In [3]:
!pip install gradio




In [4]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [5]:
# Fetch the Hugging Face token from Colab's user data (so it is never hard-coded
# in the notebook) and log in, asking login to also add it as a git credential.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [6]:
# Hugging Face repo id used for both the model weights and the tokenizer.
model = "meta-llama/Llama-3.1-8B-Instruct"

In [7]:
# Quantization settings passed to from_pretrained: load in 4-bit with the "nf4"
# quant type, double quantization enabled, and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [8]:
# System prompt placed at the start of every conversation sent to the model.
system_message = (
    "You are a helpful assistant for an Airline called FlightAI. "
    "Give short, courteous answers, no more than 1 sentence. "
    "Always be accurate. If you don't know the answer, say so."
)

In [9]:
# Load the causal LM named by `model`, quantized per `quant_config`;
# device_map="auto" leaves device placement to the library.
llm1 = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=quant_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Demo fare table: lower-cased destination city -> ticket price string.
ticket_prices = {"london": "$799", "paris": "$899", "tokyo": "$1400", "berlin": "$499"}


def get_ticket_price(destination_city):
    """Look up the ticket price for a destination city.

    Args:
        destination_city: City name. Matching ignores letter case and
            surrounding whitespace. Anything that is not a string (for
            example ``None`` when a tool call omits the argument) is treated
            as an unknown city instead of raising.

    Returns:
        The price string from ``ticket_prices``, or ``"Unknown"`` when the
        city is not in the table.
    """
    print(f"Tool get_ticket_price called for {destination_city}")
    # A missing tool argument arrives here as None; answer "Unknown" rather
    # than crashing on .lower().
    if not isinstance(destination_city, str):
        return "Unknown"
    city = destination_city.strip().lower()
    return ticket_prices.get(city, "Unknown")

In [11]:
# Quick manual check of the lookup; the cell displays the returned price.
get_ticket_price(destination_city="london")

Tool get_ticket_price called for london


'$799'

In [12]:
# Function schema describing get_ticket_price: one required string argument,
# `destination_city`, and no additional properties allowed.
price_function = dict(
    name="get_ticket_price",
    description="Get the price of a return ticket to the destination city. Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'",
    parameters=dict(
        type="object",
        properties=dict(
            destination_city=dict(
                type="string",
                description="The city that the customer wants to travel to",
            ),
        ),
        required=["destination_city"],
        additionalProperties=False,
    ),
)

In [13]:
# Tool list with a single function-type entry wrapping price_function.
# NOTE(review): the chat function in this notebook does not pass `tools` to
# generation, so this list is currently unused there.
tools = [
    {
        "type": "function",
        "function": price_function,
    },
]

In [14]:
def handle_tool_call(message):
    """Execute the first tool call on ``message`` and build the tool reply.

    Only ``message.tool_calls[0]`` is processed. Its ``function.arguments``
    JSON string is parsed, the ``destination_city`` value is read from it and
    handed to ``get_ticket_price``.

    Args:
        message: Object exposing ``tool_calls``; each entry needs an ``id``
            and a ``function.arguments`` JSON string.

    Returns:
        A ``(response, city)`` tuple: ``response`` is a ``"tool"``-role
        message dict whose content is the JSON-encoded city and price, and
        ``city`` is the parsed destination (``None`` if the key was absent).
    """
    first_call = message.tool_calls[0]
    city = json.loads(first_call.function.arguments).get('destination_city')
    payload = {"destination_city": city, "price": get_ticket_price(city)}
    response = {
        "role": "tool",
        "content": json.dumps(payload),
        "tool_call_id": first_call.id,
    }
    return response, city

In [15]:
# Tokenizers already loaded by _get_tokenizer, keyed by model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it only on first use."""
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse the EOS token as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


def _history_to_messages(history):
    """Convert Gradio chat history into a flat list of role/content dicts."""
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style history: already one dict per message. Keep only
            # the two keys the chat template needs.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # Pair-style history: (user_text, assistant_text).
            human, assistant = turn
            messages.append({"role": "user", "content": human})
            messages.append({"role": "assistant", "content": assistant})
    return messages


def chat(message, history):
    """Generate the assistant's reply for a Gradio chat turn.

    Args:
        message: The latest user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The newly generated reply text, stripped of surrounding whitespace.
    """
    # System prompt, then prior turns, then the new user message.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Loaded once and reused, instead of re-reading the tokenizer every turn.
    tokenizer = _get_tokenizer(model)

    # Send the inputs to wherever the model lives rather than assuming "cuda".
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(llm1.device)

    outputs = llm1.generate(**inputs, max_new_tokens=2000)

    # generate() returns prompt + completion. Decode only the tokens after the
    # prompt; splitting the decoded text on `message` fails for an empty
    # message and truncates the reply whenever the model repeats the user's text.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# Wrap `chat` in Gradio's chat UI and start serving it.
gr.ChatInterface(
    fn=chat,
).launch()




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8ff1ad10320cbe272b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


