### Running the Llama 3 8B 4-bit model with Unsloth

In [1]:
%%capture
# Install xformers (below 0.0.26) from the PyTorch CUDA 12.1 wheel index,
# then Unsloth's "kaggle-new" extra straight from its GitHub repository.
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
# Set WANDB_DISABLED so that Weights & Biases logging is switched off
# (presumably read by the training/logging integrations - TODO confirm).
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from unsloth import FastLanguageModel
import torch

# Unsloth: unsloth/llama-3-8b-bnb-4bit can only handle sequence lengths of at most 8192.
# With kaiokendev's RoPE scaling of 2.0, this can be extended to 16384.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!

dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the model and its tokenizer using the settings chosen above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
#@title Show current memory stats
# Query device 0 and report its total memory and the peak memory reserved so
# far, both converted from bytes by dividing by 1024**3 and rounded to 3 places.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.438 GB of memory reserved.


In [68]:
# Prompt template with three positional `{}` slots, filled via str.format in
# this order: instruction, input, response. The response slot is passed as an
# empty string at inference time so the model continues from "### Response:".
formatted_input = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

## Using Gradio to deploy the Llama model

In [None]:
# Install Gradio, which the cells below import as `gr` to build the chat UI.
!pip install gradio

In [69]:
# Switch the loaded model into Unsloth's inference mode before generating text.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

def chat_with_llm(input_text):
    """Ask the model a Paris travel question and return only its reply.

    The question is placed in the "input" slot of the notebook-level
    ``formatted_input`` template together with a fixed travel-advisor
    instruction; the "response" slot is left empty for the model to fill.

    Args:
        input_text: The user's question as a string.

    Returns:
        str: The newly generated text, with surrounding whitespace stripped.
        The prompt itself is never part of the return value.
    """
    # Format the input with only the current instruction and input
    prompt = formatted_input.format(
        "Your role is to act as a travel advisor solely for the city of Paris and answer the following questions related to it",  # instruction
        input_text,  # input
        "",  # response: left blank so the model writes it
    )

    # Tokenize the formatted prompt and move the tensors to the GPU
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    # Use the EOS token both to stop generation and as the padding token
    eos_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        use_cache=True,
        eos_token_id=eos_token_id,
        pad_token_id=eos_token_id,
    )

    # generate() returns the prompt tokens followed by the new tokens. Slicing
    # by prompt length isolates the reply reliably; searching the decoded text
    # for "### Response:" would match inside the prompt whenever the user's own
    # message contains that marker, leaking the template into the answer.
    prompt_length = inputs["input_ids"].shape[1]
    reply_tokens = outputs[0][prompt_length:]

    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

# Example input
#input_text = "Suggest few hidden treasures of Paris to visit"

# Get the response from the model
#response = chat_with_llm(input_text)
#print(response)  # Prints the model's reply to the question above


In [70]:
import gradio as gr

# Get absolute path of image file
image_path = 'paris-screenshot.png'  # Replace with your image file path
absolute_path = os.path.abspath(image_path)

# Custom stylesheet for the Gradio page: a full-height, centered layout with
# the screenshot as background, and translucent rounded cards for the textbox
# and chatbot. Comments inside this string must use CSS syntax (/* ... */):
# a "#" comment is not valid CSS and makes the browser discard the declaration
# that follows it.
css = """
.gradio-container {
    background: url('file=paris-screenshot.png');
    background-size: cover;
    height: 100vh;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
}
.gradio-container .gr-textbox, .gradio-container .gr-chatbot {
    background: rgba(255, 255, 255, 0.6);  /* Adjusted transparency */
    border-radius: 10px;
    padding: 20px;
    width: 80%;
    max-width: 800px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
    margin-bottom: 10px;
}
"""

In [71]:
def chat_with_llm(input_text):
    """Answer a Paris travel question; this is the function the chat UI calls.

    This definition replaces the earlier ``chat_with_llm`` in the notebook
    namespace, so it applies the same ``formatted_input`` prompt template and
    travel-advisor instruction instead of sending the raw message to the model.

    Args:
        input_text: The user's message as a string.

    Returns:
        str: Only the newly generated text (the prompt is not echoed back),
        with surrounding whitespace stripped.
    """
    prompt = formatted_input.format(
        "Your role is to act as a travel advisor solely for the city of Paris and answer the following questions related to it",  # instruction
        input_text,  # input
        "",  # response: left blank so the model writes it
    )

    # The model lives on the GPU, so the input tensors must be moved there too;
    # passing CPU tensors to generate() fails with a device-mismatch error.
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # max_new_tokens bounds the reply alone. max_length would also count the
    # prompt tokens, leaving little or no room for an answer to a long prompt.
    outputs = model.generate(
        **inputs,  # input_ids together with the attention mask
        max_new_tokens=150,
        num_return_sequences=1,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt tokens followed by new tokens; keep the latter.
    prompt_length = inputs["input_ids"].shape[1]
    reply_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

def respond(message, chat_history):
    """Record one chat turn and return the updated history.

    Calls ``chat_with_llm`` with the user's message, appends the
    ``(message, reply)`` pair to ``chat_history`` in place, and returns that
    same list object.
    """
    chat_history.append((message, chat_with_llm(message)))
    return chat_history

# Page-level stylesheet: full-height, centered flex column with the
# screenshot as a cover background.
css = """
.gradio-container {
    background: url('file=paris-screenshot.png');
    background-size: cover;
    height: 100vh;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
}
"""

# Background image location; the absolute form is what gets passed to
# `launch(allowed_paths=...)` later in this cell.
image_path = "paris-screenshot.png"  # Replace with your image file path
absolute_path = os.path.abspath(image_path)

# Build the chat page: a heading, the conversation view, and a textbox whose
# submit event sends (message, history) to `respond` and writes the returned
# history back into the chatbot.
with gr.Blocks(css=css) as demo:
    gr.Markdown("<h1><center>Explore Hidden Gems</center></h1>")
    chatbot = gr.Chatbot(label="How can I assist you with your Paris travel plans?")
    msg = gr.Textbox(label="Type your message here...")

    # `respond` is the module-level handler defined earlier in this cell;
    # redefining an identical copy here only shadowed it, so it is reused.
    msg.submit(respond, [msg, chatbot], chatbot)

# Run the Gradio app with allowed_paths to access the image file
demo.launch(allowed_paths=[absolute_path])

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0c4b16554cdc2e7230.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


