<a href="https://colab.research.google.com/github/vakyansh/indic-llm-eval/blob/demos/demos/multi_turn/tensoic_2b_samvaad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Run Gemma-2b on Google Colab GPU

### Imports

In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab-ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

!pip install -q gradio

In [2]:
## Some Imports

import accelerate
import gradio as gr
import torch, os
from transformers import StoppingCriteria, TextIteratorStreamer
from transformers import AutoTokenizer
from threading import Thread
from unsloth import FastLanguageModel



## Set the model you want to use

In [3]:
from google.colab import userdata

# Hugging Face model hub identifier of the model this notebook loads.
MODEL_NAME = "Tensoic/Gemma-2B-Samvaad"

# Hugging Face access token, read from the Colab secret named 'hf_token'.
hf_token = userdata.get("hf_token")


In [4]:
# Loader settings
max_seq_length = 2048
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False  # 4-bit loading is turned off

# Cap the number of CPU threads Torch uses
torch.set_num_threads(2)

# Run on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the model and its tokenizer through Unsloth, authenticating with hf_token
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map=device,
    token=hf_token,
)

# Prepare the loaded model for inference via Unsloth
FastLanguageModel.for_inference(model)

# Function to count tokens
def count_tokens(text):
    """Return how many tokens the notebook's ``tokenizer`` splits ``text`` into."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.22.post7. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

## Run Gradio Interface

In [9]:
import time

# Function to generate model predictions
def predict(message, history):
    """Stream the model's reply to ``message`` for the Gradio chat interface.

    Args:
        message: The latest user message.
        history: Earlier turns as an iterable of ``(user, assistant)`` pairs,
            or a falsy value when there are no earlier turns.

    Yields:
        The reply text accumulated so far; it grows with every chunk the
        streamer produces.
    """
    # Rebuild the conversation as role-tagged messages, oldest turn first.
    conversation = []
    for human, assistant in history or []:
        conversation.append({"role": "user", "content": human})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    if getattr(tokenizer, "chat_template", None):
        # Render the turns with the tokenizer's own chat template instead of
        # feeding the model the Python repr of each message dict.
        prompt = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        # A rendered template is expected to carry its own special tokens, so
        # they must not be added a second time when tokenizing.
        add_special_tokens = False
    else:
        # NOTE(review): no chat template is configured on this tokenizer, so the
        # legacy prompt (concatenated dict reprs) is kept to avoid guessing the
        # model's prompt format -- confirm the intended format and replace this.
        prompt = "".join(str(turn) for turn in conversation)
        add_special_tokens = True

    model_inputs = tokenizer(
        [prompt], return_tensors="pt", add_special_tokens=add_special_tokens
    ).to(device)

    # The streamer hands decoded text to this generator while generate() runs
    # in the worker thread; the prompt and special tokens are skipped.
    streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)

    # Keyword arguments for model.generate: the tokenized inputs plus settings.
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=2048,
        use_cache=True
    )

    # generate() blocks until it finishes, so run it in a separate thread and
    # consume the streamer here.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    # Yield the growing reply so the UI can update incrementally.
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message

# Setting up the Gradio chat interface
chat_demo = gr.ChatInterface(
    predict,
    # Title is derived from MODEL_NAME so the UI names the model actually loaded.
    title=f"{MODEL_NAME.split('/')[-1]} chat demo",
    description=None,
)

# share=True requests a public link; debug=True keeps this cell running so
# errors and logs stay visible in the notebook.
chat_demo.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://f6719c35c5924af7ca.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


[{'role': 'user', 'content': 'Write an essay about India in 3 points'}]


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 501, in call_prediction
    output = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 253, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1695, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1247, in call_function
    prediction = await utils.async_iteration(iterator)
  File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 516, in async_iteration
    return await iterator.__anext__()
  File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 642, in asyncgen_wrapper
    response = await iterator.__anext__()
  File "/usr/local/lib/python3.10/dist-packages/gradio/chat_interface.py", line 493, in _stream_fn
    first_respo

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://f6719c35c5924af7ca.gradio.live


