In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install transformers
%pip install bitsandbytes
%pip install accelerate>=0.26.0

In [None]:
# ipywidgets: presumably needed for notebook progress-bar widgets (e.g. while checkpoint shards load) — confirm.
%pip install ipywidgets

Let's do a basic "Hello World" test. My GPU has only 11 GB of memory, so I will load the Gemma 2 2B model with 8-bit quantization:

In [1]:
# Checkpoint identifier passed to from_pretrained() for both the model and the tokenizer.
model_id = "google/gemma-2-2b-it"

In [2]:
import torch
import random
import torch.nn.functional as F
import json
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline)

Load the quantized model

In [None]:
# Set the data type for computations to float16; bfloat16 is not supported on T4/P100
compute_dtype = torch.float16

# Configure the BitsAndBytes settings for 8-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
)

# Load the pre-trained model with the specified configuration
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,  # Apply the 8-bit quantization configuration
    torch_dtype=compute_dtype,  # Set the data type for the model
    use_cache=False,  # Disable caching to save memory
    device_map='auto',  # Automatically map the model to available devices (e.g., GPUs)
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [148]:
# Inspect the module tree of the loaded model.
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear8bitLt(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear8bitLt(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear8bitLt(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attenti

Create tokenizer

In [8]:
# Load the tokenizer that belongs to the same checkpoint as the model.
# No pad-token override is applied: the special-token map of this tokenizer
# already contains a dedicated <pad> token.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Build the inference pipeline

In [6]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): `model` is an instantiated object that was loaded with its own
# dtype and device_map, so torch_dtype/device_map here are presumably redundant —
# confirm against the pipeline documentation and drop them if so.
pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
            device_map="auto")

Device set to use cuda:0


Generate output

In [7]:
# Chat-style input: a list of role/content messages.
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Greedy decoding (do_sample=False) gives a deterministic answer directly,
# instead of emulating it by sampling with a near-zero temperature.
response = pipe(
    messages,
    max_new_tokens=1024,
    do_sample=False,
)

# generated_text holds the conversation; the model's reply is its last message.
print(response[0]['generated_text'][-1]['content'])

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


I am Gemma, an AI assistant created by the Gemma team. I'm an open-weights large language model, which means I'm publicly available. I can generate text, translate languages, write different kinds of creative content, and answer your questions in an informative way. 

What can I help you with today? 😊 



Let's analyse in more detail how the model works. First, let's look at the special tokens defined by the tokenizer:

In [9]:
# Collect every special token string known to the tokenizer, in map order.
# The 'additional_special_tokens' entry holds a list of tokens; every other
# entry holds a single token string.
special_token_strings = []
for name, value in tokenizer.special_tokens_map.items():
  if name == 'additional_special_tokens':
    special_token_strings.extend(value)
  else:
    special_token_strings.append(value)

# Map each special token string to its id in the vocabulary.
raw_special_token_map = {
  token_string: tokenizer.convert_tokens_to_ids(token_string)
  for token_string in special_token_strings
}

raw_special_token_map

{'<bos>': 2,
 '<eos>': 1,
 '<unk>': 3,
 '<pad>': 0,
 '<start_of_turn>': 106,
 '<end_of_turn>': 107}

In [10]:
# The tokenizer turns the prompt text into a list of token ids;
# wrapping the list once more yields a batch of shape (1, number_of_tokens).
v = tokenizer.encode("2+4=?")
input_tensor = torch.tensor([v])

In [11]:
# Token ids of the prompt as a single-row batch.
input_tensor

tensor([[     2, 235284, 235340, 235310,  61395]])

In [12]:
# Model generates the output sequence of tokens.
# Greedy decoding (do_sample=False) is deterministic, so there is no need to
# emulate it by sampling with a near-zero temperature.
prompt_ids = input_tensor.to(model.device)  # follow the model's device instead of hard-coding CUDA
outputs = model.generate(
    input_ids=prompt_ids,
    attention_mask=torch.ones_like(prompt_ids),  # single unpadded prompt: attend to every token
    max_new_tokens=1024,
    do_sample=False,
)

In [13]:
# The generated ids begin with the prompt tokens. Having seen the special-token ids,
# you can now recognise them here (e.g. 2 = <bos>, 107 = <end_of_turn>) and see
# their decoded versions in the next cell.
outputs

tensor([[     2, 235284, 235340, 235310,  61395,    109,   4858, 235303, 235256,
           1368,    577,  11560,    665, 235292,    109, 235287,   5231,   5089,
            675,    573,   5081,  66058, 235248, 235284,    963, 235248, 235310,
            108, 235287,   5231,   2341,    573,   5968,   3584,  66058, 235248,
         235284,    963, 235248, 235310,    589, 235248, 235318,    109,   2339,
         235269, 235248, 235284,    963, 235248, 235310,    589, 235248, 235318,
         235248,    108,    107]], device='cuda:0')

In [14]:
# The tokenizer decodes the output sequence of token ids back into text
print(tokenizer.decode(token_ids=outputs[0]))

<bos>2+4=?

Here's how to solve it:

* **Start with the addition:** 2 + 4
* **Add the numbers together:** 2 + 4 = 6

So, 2 + 4 = 6 
<end_of_turn>


In [20]:
# Let's see the raw mathematical output of the model for the prompt.
# This is inference only: torch.no_grad() avoids building an autograd graph,
# which would otherwise keep intermediate activations alive on the GPU.
with torch.no_grad():
  output = model(input_ids=input_tensor.cuda(), attention_mask=torch.ones(input_tensor.shape).cuda())

In [21]:
# The result contains logits: one vector of unnormalised scores per position
# (softmax turns such a vector into a probability distribution over tokens).
# The number of vectors equals the number of input tokens, as expected for a transformer.

output

CausalLMOutputWithPast(loss=None, logits=tensor([[[-16.2188,  -5.8867,   0.3484,  ..., -11.0859, -11.1406, -16.2188],
         [-14.4766,  -9.4766, -20.9531,  ..., -10.8828,  -8.6562, -13.4531],
         [-13.8516,  -1.9521,  -6.4336,  ...,  -8.8984,  -7.2500, -12.0938],
         [-15.4844,  -2.9512,  -8.2188,  ...,  -9.9922,  -7.9375, -14.1250],
         [-10.3594,   6.2734,  -4.3945,  ...,  -9.7500,  -7.8594,  -8.3750]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MulBackward0>), past_key_values=None, hidden_states=None, attentions=None)

In [None]:
# Let's see which token is the most probable at the last position: it will be the first token of the response.
# This is the same token that we obtained from generation earlier.
# Softmax is monotonic, so the argmax is taken on the logits directly; this also
# avoids float16 rounding in the probabilities creating artificial ties.

token = int(torch.argmax(output.logits[0, -1]))
print(token, repr(tokenizer.decode([token])))

109 '\n\n'


In [None]:
# Let's define softmax with temperature

def softmax(input, t=1.0, dim=0):
  """Return temperature-scaled softmax probabilities as a float32 tensor.

  Args:
    input: tensor of logits (any floating dtype, e.g. float16).
    t: temperature, must be strictly positive. Values below 1 sharpen the
      distribution, values above 1 flatten it.
    dim: dimension along which the probabilities sum to 1. The default 0
      matches a 1-D logits vector.

  Returns:
    A float32 tensor with the same shape as `input`.

  Raises:
    ValueError: if `t` is not strictly positive.
  """
  if t <= 0:
    raise ValueError(f"temperature must be positive, got {t}")
  # Upcast before dividing: in float16, logits / small t overflows to inf and
  # the softmax then evaluates to NaN.
  return F.softmax(input.to(torch.float32) / t, dim=dim)

In [29]:
# Let's generate the response token by token: each step samples the next token from the
# probability distribution given by the last logit vector and appends it to the current prompt.

torch.manual_seed(0)   # seed the sampler so that re-runs draw the same tokens
max_new_tokens = 1024  # safety cap: sampling is not guaranteed to ever emit a stop token

prompt = tokenizer.encode("2+4=?")
end_of_turn = raw_special_token_map['<end_of_turn>']
stop_tokens = {end_of_turn, tokenizer.eos_token_id}

current = list(prompt)
for _ in range(max_new_tokens):
  input_tensor = torch.tensor([current])
  with torch.no_grad():
    output = model(
      input_ids=input_tensor.to(model.device),
      attention_mask=torch.ones_like(input_tensor).to(model.device),
    )

  # Only the last position predicts the token that follows the current sequence.
  logits = output.logits[0, -1]
  probs = softmax(logits, t=1.0)
  token = int(torch.multinomial(probs, num_samples=1))
  print(token)
  if token in stop_tokens:
    break  # the stop token itself is not appended, so it is not decoded below
  current.append(token)

print(tokenizer.decode(current))

  

109
5856
235248
235274
235265
4463
573
1758
235248
235284
577
573
1758
235248
235310
235265
235248
109
5856
235248
235284
235265
714
2707
603
235248
235318
235265
235248
109
651
3448
603
235248
235318
235265
235248
108
107
<bos>2+4=?

Step 1. Add the number 2 to the number 4. 

Step 2. The sum is 6. 

The answer is 6. 

