## Run Gemma-2b on Google Colab GPU

### Install dependencies and imports

In [1]:
%%capture
# Install unsloth (in the variant matching this GPU) and gradio.
# The %%capture cell magic on the first line hides the pip output of this cell.
# NOTE(review): neither install is pinned to a version or commit, so re-running
# this cell later may pull different releases -- consider pinning.
import torch
# CUDA compute capability of the current GPU as (major, minor).
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
# No-op statement; it has no effect on the installs above.
pass

# gradio provides the chat web UI used at the end of the notebook.
!pip install -q gradio

In [2]:
## Some Imports

import accelerate
import gradio as gr
import torch, os
from transformers import StoppingCriteria, TextIteratorStreamer
from transformers import AutoTokenizer
from threading import Thread
from unsloth import FastLanguageModel



## Set the model you want to use

In [3]:
# Hugging Face Hub repository to load the tokenizer and model from.
MODEL_NAME = "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa"
# Read the Hub access token from the environment instead of hardcoding it, so the
# secret never ends up in the saved or shared notebook. Set HF_TOKEN before running
# (e.g. `%env HF_TOKEN=...` in a cell you do not save); when it is unset this is
# None and the token is simply not supplied to the loader.
# NOTE: any token that was ever pasted directly into this notebook should be revoked.
hf_token = os.environ.get("HF_TOKEN")

In [4]:
# Load the model and tokenizer onto the available device and prepare them for inference.

# Limit the number of CPU threads PyTorch uses for intra-op parallelism.
torch.set_num_threads(2)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence length handed to the loader as `max_seq_length`.
max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# Handed to the loader as `load_in_4bit`; disabled here.
load_in_4bit = False

# Fetch MODEL_NAME from the Hub (authenticating with hf_token) and get back the
# model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    device_map=device,
    token=hf_token
)
# NOTE(review): presumably switches the unsloth-patched model into its inference
# mode -- confirm against the unsloth documentation.
FastLanguageModel.for_inference(model)

# Token-counting helper
def count_tokens(text):
    """Return how many pieces the notebook-level `tokenizer` splits `text` into."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

adapter_config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.22.post7. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/314M [00:00<?, ?B/s]

Unsloth 2024.3 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


## Run Gradio Interface

In [5]:
# Function to generate model predictions
def predict(message, history):
    """Stream the model's reply to a single chat message.

    The message is read as ``<instruction> ### <input>``. The ``### <input>``
    part is optional; when it is missing the input is left empty. Text after a
    second ``###`` is ignored.

    Args:
        message: Text typed by the user in the chat box.
        history: Earlier chat turns passed in by the chat interface. It is not
            used; every message is answered independently.

    Yields:
        The reply generated so far, growing as new text arrives from the streamer.
    """
    alpaca_format = """
    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    # Split the message into instruction and input; if no input is present,
    # fall back to an empty one.
    parts = [item.strip() for item in message.split('###')]
    instruction = parts[0]
    model_input = parts[1] if len(parts) > 1 else ""

    prompt = alpaca_format.format(instruction, model_input, "")
    model_inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Budget the generation from the length of the *actual* prompt (not of the
    # empty template) so prompt + reply stays within the model's context window.
    prompt_length = model_inputs["input_ids"].shape[1]
    remaining_tokens = max_seq_length - prompt_length
    if remaining_tokens <= 0:
        yield "The message is too long for the model's context window. Please shorten it and try again."
        return

    # Initialize TextIteratorStreamer
    streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)

    # Generate model kwargs
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=remaining_tokens,
        use_cache=True
    )

    # Start generation in a separate thread so this generator can consume the
    # streamer while the model is still producing text.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Yield partial message
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message

    # The streamer is exhausted only once generation has finished, so this
    # returns promptly and avoids leaving a dangling worker thread behind.
    t.join()


# Setting up the Gradio chat interface
# Wrap `predict` in a Gradio chat UI and serve it; share=True also requests a
# temporary public URL in addition to the local one.
chat_ui = gr.ChatInterface(
    fn=predict,
    title="Navarasa 2b chat demo",
    description=None,
)
chat_ui.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://4fcd6c6685685f751b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


