## Run Gemma-2b on Google Colab GPU

### Imports

In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

!pip install -q gradio

In [4]:
## Some Imports

import accelerate
import gradio as gr
import torch, os
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import StoppingCriteria, TextIteratorStreamer
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
from threading import Thread

## Set the model you want to use

In [10]:
# Hugging Face Hub repository of the checkpoint to load.
MODEL_NAME = "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa"

# Hub access token. It is read from the HF_TOKEN environment variable so the
# secret does not have to be saved inside the notebook; the placeholder is only
# the fallback and must be replaced by hand when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN", "<your hf token>")

In [None]:
# Set the number of threads for Torch
torch.set_num_threads(2)

# Set device to CUDA if available, otherwise to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the PEFT checkpoint (4-bit loading disabled) and move it to the device.
model = AutoPeftModelForCausalLM.from_pretrained(MODEL_NAME, load_in_4bit=False, token=hf_token).to(device)

# The tokenizer comes from the same repository, so it gets the same token;
# otherwise a gated or private repo would load the model but not the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)


# Function to count tokens
def count_tokens(text):
    """Return the number of pieces the global `tokenizer` splits `text` into."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

## Run Gradio Interface

In [14]:
# Alpaca-style prompt with slots for the instruction, the input and the response.
# Kept as a single-line literal at module level so that no source-code
# indentation ends up inside the prompt text sent to the model.
ALPACA_FORMAT = "\n### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"

# Token budget shared by the prompt and the generated continuation.
MAX_TOTAL_TOKENS = 2048


def _build_prompt(message):
    """Turn a chat message of the form ``instruction ### input`` into an Alpaca prompt.

    Only the first ``###`` acts as the separator, so any further ``###`` stays
    part of the input. A message without ``###`` is treated as an instruction
    with an empty input. Both parts are stripped of surrounding whitespace and
    the response slot is left empty for the model to fill.
    """
    instruction, _, model_input = message.partition('###')
    return ALPACA_FORMAT.format(instruction.strip(), model_input.strip(), "")


# Function to generate model predictions
def predict(message, history):
    """Stream the model's reply to ``message`` for the Gradio chat interface.

    Args:
        message: Text typed by the user, optionally ``instruction ### input``.
        history: Previous chat turns passed in by the chat interface. Unused:
            every message is answered on its own.

    Yields:
        The reply generated so far, extended by one streamed chunk per step.
    """
    model_inputs = tokenizer(_build_prompt(message), return_tensors="pt").to(device)

    # Size the generation budget from the actual tokenized prompt rather than
    # from the empty template, and never ask for fewer than one new token.
    prompt_length = model_inputs["input_ids"].shape[-1]
    max_new_tokens = max(1, MAX_TOTAL_TOKENS - prompt_length)

    # Initialize TextIteratorStreamer
    streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)

    # Generate model kwargs; sampling options (top_p, top_k, temperature,
    # length_penalty, num_beams) are not passed.
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        repetition_penalty=2.0,
    )

    # Start generation in a separate thread so this generator can consume the
    # streamer while the model is still producing tokens.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Yield partial message
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message


# Build the Gradio chat UI around `predict`, then launch it with a share link.
chat_ui = gr.ChatInterface(
    fn=predict,
    title="Navarasa 2b chat demo",
    description=None,
)
chat_ui.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ec81cea79e17ca3a13.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


