## Gemma 2 - 9b 

We use the Gemma 2 9B model to compute embeddings and then train a classifier on them. This is the first part, in which we only compute the embeddings. You can also use other models. Let's get started!

Upvote if you found this helpful!

# Import libs 

In [None]:
!pip install -q -U bitsandbytes 
!pip install -q git+https://github.com/huggingface/transformers
!pip install sentencepiece

In [None]:
import os
import gc
import re
from time import time

import torch
import transformers
import sklearn
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import Gemma2ForCausalLM, GemmaTokenizer, BitsAndBytesConfig

import time
from torch.cuda.amp import autocast
from threading import Thread

# Switch off the memory-efficient and flash scaled-dot-product-attention
# kernels for the whole session.
# NOTE(review): presumably needed because the target GPUs / this model do not
# work with those kernels — confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# if (not torch.cuda.is_available()): print("Sorry - GPU required!")

# Configs 

In [None]:
class CFG:
    """Notebook-wide settings shared by the loading, tokenizing and embedding cells."""

    # Location of the Gemma 2 9B checkpoint; used for both the tokenizer and the models.
    MODEL_PATH = '/kaggle/input/gemma-2-9b-hf'
    # Every text is padded or truncated to exactly this many tokens by the tokenizer call.
    MAX_LENGTH = 1024
    # Default number of sequences per forward pass in get_embeddings.
    BATCH_SIZE = 2
    
# One device handle per GPU; a separate copy of the model is loaded onto each
# ('cuda:0' and 'cuda:1') so the two halves of the data can be embedded in parallel.
device0 = torch.device('cuda:0')
device1 = torch.device('cuda:1')

# Load model

In [None]:
# Tokenizer and model weights both come from the same local checkpoint directory.
tokenizer = GemmaTokenizer.from_pretrained(CFG.MODEL_PATH)

# 4-bit weight quantization (bitsandbytes) with float16 compute and no double
# quantization.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False)

# Two independent copies of the same quantized model, one pinned to each GPU.
# NOTE(review): MODEL_PATH is a local directory — confirm that `revision` has
# any effect here.
model_0 = Gemma2ForCausalLM.from_pretrained(CFG.MODEL_PATH,
                                        revision="float16",
                                        device_map='cuda:0',
                                        quantization_config=bnb_config_4bit)        

model_1 = Gemma2ForCausalLM.from_pretrained(CFG.MODEL_PATH,
                                        revision="float16",
                                        device_map='cuda:1',
                                        quantization_config=bnb_config_4bit)     

# Prepare train 

In [None]:
def process(input_str):
    """Return the last item of a stringified list such as ``'["a","b"]'``.

    The value is parsed textually, not as JSON: leading/trailing square
    brackets are stripped, the remainder is split on the ``","`` separator,
    and surrounding double quotes are removed from the final piece. Escape
    sequences inside the text are left exactly as they appear.

    Args:
        input_str: Cell value holding the serialized list. Non-string values
            (for example ``None`` or a NaN from a missing CSV cell) are
            accepted and treated as empty.

    Returns:
        The last element as a string, or ``''`` when the input is not a
        string or contains no text.
    """
    # Missing cells arrive as float NaN / None; calling .strip on them would raise.
    if not isinstance(input_str, str):
        return ''
    stripped_str = input_str.strip('[]')
    # str.split always yields at least one piece, so [-1] is safe; only the
    # piece that is returned needs its quotes removed.
    return stripped_str.split('","')[-1].strip('"')
  
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena-additional-data-90k-columns/Merged_data.csv')

# Reduce each serialized conversation column to its final turn.
for column in ('prompt', 'response_a', 'response_b'):
    train.loc[:, column] = train[column].apply(process)

# Assemble the single text that is fed to the model: the prompt followed by
# both candidate answers, wrapped in the turn markers.
train['text'] = (
    '<start_of_turn>User prompt: ' + train['prompt']
    + '\n\nModel A :\n' + train['response_a']
    + '\n\n----\n\nModel B:\n' + train['response_b']
    + '<end_of_turn><eos>'
)

In [None]:
# Keep only the first 40,000 rows; tokenization, embedding and the saved CSV
# below all operate on this subset.
train = train[:40000]
train.head(1)

In [None]:
# Eyeball one assembled example (the row with index label 10) to check the prompt template.
print(train['text'][10])

# Tokenize 

In [None]:
# Encode every text to exactly CFG.MAX_LENGTH token ids: shorter texts are
# padded, longer ones truncated, and the result comes back as PyTorch tensors.
tokens = tokenizer(
    train['text'].tolist(),
    max_length=CFG.MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

INPUT_IDS = tokens['input_ids']
ATTENTION_MASKS = tokens['attention_mask']

# One row per sample; each cell holds that sample's ids / mask as a plain list.
data = pd.DataFrame({
    'INPUT_IDS': INPUT_IDS.tolist(),
    'ATTENTION_MASKS': ATTENTION_MASKS.tolist(),
})
data[:2]

# Compute embedding

In [None]:
def get_embeddings(df, model, device, batch_size=CFG.BATCH_SIZE):
    """Return one mean-pooled last-layer embedding per row of ``df``.

    The rows are pushed through ``model`` in batches. For every sequence the
    final hidden states are averaged over the token axis, counting only the
    positions whose attention mask is 1, so padding added by the tokenizer
    does not dilute the embedding.

    Args:
        df: DataFrame with ``INPUT_IDS`` and ``ATTENTION_MASKS`` columns, each
            cell a list of ints; all rows must have the same length. Expected
            to be non-empty (``torch.cat`` needs at least one batch).
        model: Model called as ``model(input_ids=..., attention_mask=...,
            output_hidden_states=True)`` whose output exposes ``hidden_states``.
        device: Device the input batches are moved to before the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        A CPU tensor with one pooled vector per row of ``df``, in row order,
        in the same dtype as the model's hidden states.
    """
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)

    embed_list = []

    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)
        # Release whatever the previous batch left behind before allocating again.
        gc.collect()
        torch.cuda.empty_cache()
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, output_hidden_states=True)
            # Last entry of hidden_states; expected shape (batch, seq_len, hidden).
            embed = outputs.hidden_states[-1]

            # Masked mean pool: weight each position by its attention-mask
            # value so padded positions contribute nothing. The arithmetic is
            # done in float32 so summing many half-precision values cannot
            # overflow, then cast back to the model's dtype.
            mask = batch_attention_mask.unsqueeze(-1).to(device=embed.device, dtype=torch.float32)
            token_sum = (embed.float() * mask).sum(dim=1)
            # clamp guards against a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            embed_mean = (token_sum / token_count).to(embed.dtype).cpu()
            embed_list.append(embed_mean)

            torch.cuda.empty_cache()

    embeddings = torch.cat(embed_list, dim=0)
    return embeddings

def compute_embed(df, model, device, results, index):
    """Thread target: embed ``df`` on ``device`` and store the tensor in ``results[index]``.

    Nothing is returned; the caller reads the embeddings back from the shared
    ``results`` mapping under the key ``index``.
    """
    embeddings = get_embeddings(df=df, model=model, device=device)
    results[index] = embeddings

In [None]:
# `time` is the module here: the later `import time` overrides the earlier
# `from time import time`.
st = time.time()

# Split the tokenized data into two contiguous halves, one per GPU.
N_SAMPLES = len(data)
half = round(N_SAMPLES / 2)
sub1 = data.iloc[0:half].copy()
sub2 = data.iloc[half:N_SAMPLES].copy()

# Filled in by the worker threads: key 0 -> first half, key 1 -> second half.
results = {}

t0 = Thread(target=compute_embed, args=(sub1, model_0, device0, results, 0))
t1 = Thread(target=compute_embed, args=(sub2, model_1, device1, results, 1))

t0.start()
t1.start()

t0.join()
t1.join()

# join() does not re-raise an exception that occurred inside a worker thread,
# so a failed worker would otherwise go unnoticed until results[...] is read.
missing = [idx for idx in (0, 1) if idx not in results]
if missing:
    raise RuntimeError(
        f"Embedding worker(s) {missing} produced no result; "
        "see the thread traceback printed above."
    )

print(f"Processing complete. Total time: {time.time() - st:.2f} seconds")

In [None]:
# Stitch the two halves back together. results[0] holds the first half of
# `data` and results[1] the second, so the row order matches `data`.
embeddings = torch.cat([results[0], results[1]], dim=0)
embeddings.shape

In [None]:
# Drop the references first, then collect, then release cached CUDA blocks.
# Collecting before the `del` statements cannot reclaim the models because
# they are still referenced at that point.
del model_1
del model_0
gc.collect()
torch.cuda.empty_cache()

# Save embed

In [None]:
# Output file for the pooled embeddings (written to the working directory).
save_path = 'gemma2_train_embed.npy'

# Save the embeddings as a .npy file
np.save(save_path, embeddings.numpy())
# Also save the 40k-row `train` frame the embeddings were computed from, so
# the texts can be matched to the embedding rows later.
train.to_csv('train_embed.csv', index=False)

print(f"Concatenated embeddings saved to {save_path}")