In [1]:
import transformers
import torch
import pandas as pd
from tqdm.notebook import tqdm

from transformers import LlamaForCausalLM, LlamaTokenizer
from custom_datasets import PromptDataset


# ---------------------------------------------------------------------------
# Model locations. *_MODEL_PATH values are Hugging Face Hub repo ids;
# *_4BIT_CHECKPOINT_PATH values are local files in the working directory.
# ---------------------------------------------------------------------------

# LLaMA 7B. To fetch the 4-bit checkpoint, run:
# !wget https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt
LLAMA_7B_MODEL_PATH = 'decapoda-research/llama-7b-hf'
LLAMA_7B_4BIT_CHECKPOINT_PATH = './llama-7b-4bit.pt'

# LLaMA 13B. To fetch the 4-bit checkpoint, run:
# !wget https://huggingface.co/decapoda-research/llama-13b-hf-int4/resolve/main/llama-13b-4bit.pt
LLAMA_13B_MODEL_PATH = 'decapoda-research/llama-13b-hf'
LLAMA_13B_4BIT_CHECKPOINT_PATH = './llama-13b-4bit.pt'

# GPT-J 6B.
GPTJ_6B_MODEL_PATH = 'EleutherAI/gpt-j-6B'


# Load model

In [2]:
# Tokenizer and weights both come from the LLaMA 7B hub repo.
tokenizer = LlamaTokenizer.from_pretrained(LLAMA_7B_MODEL_PATH)

# Load the weights 8-bit quantized with every module mapped to device 0, and
# switch straight to evaluation mode (`.eval()` returns the module itself).
model = LlamaForCausalLM.from_pretrained(
    LLAMA_7B_MODEL_PATH,
    load_in_8bit=True,
    device_map={'': 0},
    torch_dtype=torch.float16,
).eval()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/vetka/miniconda3/envs/transformers/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [9]:
# Sanity check: free-form generation from a short conversational prompt.
text = 'Hi, GPT! How are you? -'
sample = tokenizer(text, return_tensors='pt')
# Move every tokenizer output tensor to the GPU that hosts the model.
sample = {k: v.to('cuda') for k, v in sample.items()}

# Sampling is stochastic: seed the RNG so a re-run of this cell starts from
# the same random state instead of printing a different continuation each time.
torch.manual_seed(0)

gen_tokens = model.generate(**sample,
                            temperature=0.2,
                            do_sample=True,
                            max_length=20)  # max_length counts the prompt tokens too
print(tokenizer.decode(gen_tokens[0]))

 Hi, GPT! How are you? - I'm fine, thanks.
I


# Create few-shot prompt

In [17]:
# We will try Hate Speech and Offensive Language Detection with this prompt

def create_prompt(text):
    """Build the few-shot classification prompt for a single message.

    The prompt consists of an instruction line, three labelled examples
    (neutral / hate / off) and finally the message to classify, ending with
    an open ``Category:`` field for the model to complete.

    Args:
        text: The message to classify; it is interpolated verbatim.

    Returns:
        The complete prompt string.
    """
    # NOTE(review): the separator is the literal characters "/n/n###/n/n", not
    # newlines. It may have been meant as "\n\n###\n\n", but it is kept verbatim
    # because changing it changes the model input.
    separator = '/n/n###/n/n'
    instruction = ('Classify the following messages into one of the following '
                   'categories: neutral, hate, offensive\n')
    # The third example is labelled 'off' (not 'offensive'), matching the
    # label words used elsewhere in the notebook.
    examples = [
        ('I could go for a brownie right now', 'neutral'),
        ('What these bitches want from a nigga?, like on some DMX shit', 'hate'),
        ('So says the one who fills an empty hole inside herself by worshiping a royal. '
         'Who 4 all u kno may be a raging snotty bitch', 'off'),
    ]
    shots = [f'Message: {message}\nCategory: {category}\n{separator}'
             for message, category in examples]
    query = f'Message: {text}\nCategory:'
    return '\n'.join([instruction, *shots, query])


### Get tokens and token ids for labels 

In [15]:
# Words that stand for the three categories; 'off' is the label the few-shot
# prompt uses for the "offensive" class.
label_words = ['neutral', 'hate', 'off']

labels = tokenizer.tokenize(' '.join(label_words))
# The scoring later in the notebook indexes the logits with one id per
# category, so fail loudly if the tokenizer splits any label into several pieces.
assert len(labels) == len(label_words), (
    f'expected one token per label, got {labels}'
)

label_ids = tokenizer.convert_tokens_to_ids(labels)
# Map token id -> token string for inspection.
id2label = dict(zip(label_ids, labels))
id2label

{21104: '▁neutral', 26277: '▁hate', 1283: '▁off'}

# Try generation

In [16]:
# Build the model input from the few-shot prompt inside this cell. Previously
# the cell reused whatever `sample` was left in the kernel, so on a fresh
# Restart & Run All it would have generated from an unrelated earlier prompt.
demo_message = 'Toda will be rainy and cloudy. Im going to take an umbrella.'
sample = tokenizer(create_prompt(demo_message), return_tensors='pt')
sample = {k: v.to('cuda') for k, v in sample.items()}

# Allow exactly one token beyond the prompt: the predicted category.
gen_tokens = model.generate(**sample,
               max_length=(sample['input_ids'].shape[-1]) + 1)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: neutral, hate, offensive

Message: I could go for a brownie right now
Category: neutral
/n/n###/n/n
Message: What these bitches want from a nigga?, like on some DMX shit
Category: hate
/n/n###/n/n
Message: So says the one who fills an empty hole inside herself by worshiping a royal. Who 4 all u kno may be a raging snotty bitch
Category: off
/n/n###/n/n
Message: Toda will be rainy and cloudy. Im going to take an umbrella.
Category: neutral


# Validation on val dataset

In [23]:
val_df = pd.read_csv('./data/val.csv')

# One few-shot prompt per validation message.
val_df['prompt'] = val_df['text'].apply(create_prompt)
# NOTE(review): a leading space is prepended to the gold labels, presumably so
# they match the decoded label tokens compared in the accuracy cell - confirm.
val_df['label'] = val_df['label'].apply(lambda x: ' ' + x)


predicted_tokens = []
# Pure inference: disable autograd so no graph or activations are kept on the GPU.
with torch.no_grad():
    for i, row in tqdm(val_df.iterrows(), total=len(val_df)):
        sample = tokenizer(row['prompt'], return_tensors='pt')
        sample = {k: v.to('cuda') for k, v in sample.items()}
        # We will take logits from raw output of model
        out = model(**sample)
        # Logits at the last position, restricted to the label token ids;
        # the argmax is therefore an index into `label_ids`.
        max_ind_label = torch.argmax(out.logits[:, -1].flatten()[label_ids]).item()
        predicted_token = tokenizer.decode(label_ids[max_ind_label])
        predicted_tokens.append(predicted_token)
        # Drop references before the next forward pass allocates new tensors.
        del out
        del sample



  0%|          | 0/200 [00:00<?, ?it/s]

In [26]:
from sklearn.metrics import accuracy_score

# Share of validation rows whose predicted label string equals the gold label.
fewshot_accuracy = accuracy_score(val_df['label'], predicted_tokens)
print(f"Accuracy of Fewshot predictions: {fewshot_accuracy}")

Accuracy of Fewshot predictions: 0.58


# Check other classification tasks

In [28]:
# Topic classification without any in-prompt examples: instruction, one
# message, and an open "Category:" field for the model to fill in.
prompt = (
    'Classify the following messages into one of the following categories: '
    'Politics, Sports, Business, Space, Tech, Social\n'
    '\n'
    'Message: Prime minister said no agreement had yet been made between the UK and the European Union.\n'
    '\n'
    'Category:'
)

encoded = tokenizer(prompt, return_tensors='pt')
sample = {name: tensor.to('cuda') for name, tensor in encoded.items()}

# Only one token beyond the prompt is generated, so a category name that
# spans several tokens is printed truncated (e.g. "Polit").
prompt_len = sample['input_ids'].shape[-1]
gen_tokens = model.generate(**sample, max_length=prompt_len + 1)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Prime minister said no agreement had yet been made between the UK and the European Union.

Category: Polit


In [29]:
# Same topic-classification prompt, this time with a business news headline.
categories = 'Politics, Sports, Business, Space, Tech, Social'
headline = 'Netflix cuts prices for subscribers in more than 30 countries'
prompt = '\n'.join([
    'Classify the following messages into one of the following categories: ' + categories,
    '',
    'Message: ' + headline,
    '',
    'Category:',
])

sample = {
    key: value.to('cuda')
    for key, value in tokenizer(prompt, return_tensors='pt').items()
}

# Cap the total length at prompt length + 1, i.e. a single generated token.
one_past_prompt = sample['input_ids'].shape[-1] + 1
gen_tokens = model.generate(**sample, max_length=one_past_prompt)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Netflix cuts prices for subscribers in more than 30 countries

Category: Business


In [30]:
# Same topic-classification prompt with a sports news sentence.

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v.to('cuda') for k, v in sample.items()}

# Decoding is deterministic here (do_sample=False). The former `temperature=0.1`
# argument was dropped: temperature only rescales logits when sampling, so it
# had no effect and suggested the output was temperature-controlled.
gen_tokens = model.generate(**sample,
               do_sample=False,
               max_length=(sample['input_ids'].shape[-1]) + 1)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category: Sports


### Conclusion
 Fine-tuning is much better than both few-shot prediction of hate speech and raw (zero-shot) prediction on the other tasks.