In [2]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install transformers
%pip install bitsandbytes
%pip install accelerate>=0.26.0 

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp39-cp39-win_amd64.whl (2449.3 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp39-cp39-win_amd64.whl (6.1 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp39-cp39-win_amd64.whl (4.1 MB)
Collecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting networkx (from torch)
  Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)
Collecting jinja2 (from torch)
  Using cached https://download.pytorch.org/whl/Jinja2-3.1.3-py3-none-any.whl (133 kB)
Collecting fsspec (from torch)
  Downloading https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached https://d

In [3]:
# Notebook widget support (presumably for the progress bars shown while model files download — confirm).
%pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13
Note: you may need to restart the kernel to use updated packages.


Let's do a basic "Hello World" test. My GPU has only 11 GB of memory, so I want to use 8-bit quantization for the Gemma 2 2B model:

In [6]:
# Identifier of the checkpoint passed to every from_pretrained() call below
# (the "-it" suffix presumably denotes the instruction-tuned variant).
model_id = "google/gemma-2-2b-it"

In [134]:
import torch
import json
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline)

Load the quantized model

In [None]:
# Non-quantized parts of the model are kept in float16
# (bfloat16 is not supported on T4/P100).
compute_dtype = torch.float16

# BitsAndBytes settings for 8-bit quantization to reduce memory usage.
# The bnb_4bit_* options only apply to 4-bit loading, so none are set here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit (LLM.int8) quantization
)

# Load the pre-trained model with the configuration above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,  # Apply the 8-bit quantization configuration
    torch_dtype=compute_dtype,  # Data type for the weights that are not quantized
    use_cache=False,  # No key/value cache: saves memory at the cost of slower generation
    device_map='auto',  # Automatically map the model to available devices (e.g., GPUs)
)

In [148]:
# Display the module tree; the projection layers should appear as Linear8bitLt,
# which confirms that 8-bit quantization was applied.
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear8bitLt(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear8bitLt(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attenti

Create tokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# No pad-token override is needed: this tokenizer already defines its own <pad> token (id 0).

Build the inference pipeline

In [18]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

Device set to use cuda:0


Generate output

In [64]:
# Chat-style input: a list of {"role", "content"} messages.
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Greedy decoding gives a deterministic answer. This replaces the earlier
# workaround of sampling with a near-zero temperature, which only
# approximated greedy decoding.
response = pipe(
    messages,
    max_new_tokens=1024,
    do_sample=False,
)

# For chat input, 'generated_text' holds the whole conversation;
# the model's reply is its last message.
print(response[0]['generated_text'][-1]['content'])

I am Gemma, an AI assistant created by the Gemma team. I'm an open-weights large language model, which means I'm publicly available. I can generate text, translate languages, write different kinds of creative content, and answer your questions in an informative way. 

What can I help you with today? 😊 



Let's analyse in more detail how the model works. First, let's look at the special tokens defined by the tokenizer:

In [125]:
# Build a {token text: token id} lookup for every special token of the tokenizer.
raw_special_token_map = {}
for token_name, token_value in tokenizer.special_tokens_map.items():
  # 'additional_special_tokens' maps to a list of tokens; every other entry is a single token.
  token_texts = token_value if token_name == 'additional_special_tokens' else [token_value]
  for token_text in token_texts:
    raw_special_token_map[token_text] = tokenizer.convert_tokens_to_ids(token_text)

raw_special_token_map

{'<bos>': 2,
 '<eos>': 1,
 '<unk>': 3,
 '<pad>': 0,
 '<start_of_turn>': 106,
 '<end_of_turn>': 107}

In [149]:
# Turn the prompt text into a list of token ids ...
v = tokenizer.encode("2+2=?")
# ... and add a leading batch dimension, giving shape (1, len(v)).
input_tensor = torch.tensor(v).unsqueeze(0)

In [153]:
# Model generates the output sequence of tokens.
# Seed the RNG so the sampled continuation is repeatable when the notebook is re-run.
torch.manual_seed(42)

# Put the prompt on the device the model lives on rather than assuming CUDA.
input_ids = input_tensor.to(model.device)

outputs = model.generate(
    input_ids=input_ids,
    # Single un-padded prompt: every position is attended to. ones_like keeps
    # the mask on the same device and with the same integer dtype as the ids.
    attention_mask=torch.ones_like(input_ids),
    do_sample=True,
    temperature=0.8,
    max_new_tokens=1024,
)

In [None]:
# Raw token ids returned by generate(). The special-token ids listed earlier can be
# recognized here: 2 (<bos>) at the start and 107 (<end_of_turn>) at the end.
# The next cell shows the decoded text.
outputs

tensor([[     2, 235284, 235340, 235284,  61395,    109, 235284, 235340, 235284,
            589, 235248, 235310, 235248,    108,    107]], device='cuda:0')

In [155]:
# Tokenizer decodes the output token ids back into text. Special tokens such as
# <bos> and <end_of_turn> are kept in the decoded string, so they show up in the printout.
print(tokenizer.decode(token_ids=outputs[0]))

<bos>2+2=?

2+2 = 4 
<end_of_turn>
