In [1]:

import torch
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import GPTJForCausalLM, LlamaForCausalLM, AutoTokenizer, LlamaTokenizer

from autograd_4bit import load_gptj_model_4bit_low_ram, load_llama_model_4bit_low_ram
from peft import PeftModel

LLAMA_7B_MODEL_PATH = 'decapoda-research/llama-7b-hf'
# !wget https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt
LLAMA_7B_4BIT_CHECKPOINT_PATH = './llama-7b-4bit.pt'

LLAMA_13B_MODEL_PATH = 'decapoda-research/llama-13b-hf'
# !wget https://huggingface.co/decapoda-research/llama-13b-hf-int4/resolve/main/llama-13b-4bit.pt
LLAMA_13B_4BIT_CHECKPOINT_PATH = './llama-13b-4bit.pt'

GPTJ_6B_MODEL_PATH = 'EleutherAI/gpt-j-6B'



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/vetka/miniconda3/envs/transformers/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


# Load model and load fine-tuned LoRA

In [2]:
# model = load_llama_model_4bit_low_ram(LLAMA_13B_MODEL_PATH, LLAMA_13B_4BIT_CHECKPOINT_PATH, half=True)
# model = PeftModel.from_pretrained(model, './loras/llama_13B_4bit_hatespeech_classification/', device_map={'': 0})
# tokenizer = transformers.LlamaTokenizer.from_pretrained(
#     "decapoda-research/llama-13b-hf", add_eos_token=True
# )
# model = model.eval()


model = LlamaForCausalLM.from_pretrained(LLAMA_7B_MODEL_PATH, load_in_8bit=True, device_map={'': 0}, torch_dtype=torch.float16)
lora_model = PeftModel.from_pretrained(model, './loras/llama_7B_8bit_hatespeech_classification', device_map={'': 0})
tokenizer = LlamaTokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)
lora_model = model.eval()
# model = torch.compile(model)


# model = GPTJForCausalLM.from_pretrained(GPTJ_6B_MODEL_PATH, load_in_8bit=True, device_map={'': 0}, torch_dtype=torch.float16)
# model = PeftModel.from_pretrained(model, './loras/gptj_6B_8bit_hatespeech_classification/', device_map={'': 0})
# tokenizer = AutoTokenizer.from_pretrained(
#     GPTJ_6B_MODEL_PATH, add_eos_token=True
# )
# model = model.eval()
# model = torch.compile(model)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


# Check model generations

In [3]:
# Test sample
prompt = '''Classify the following messages into one of the following categories: hate, neutral, offensive

Message: This is the great weather

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v[:, :-1].to('cuda') for k, v in sample.items()}
gen_tokens = lora_model.generate(**sample, 
               do_sample=True, 
                temperature=0.2,
                top_p=0.75,
                top_k=40,
                num_beams=4,
               max_length=(sample['input_ids'].shape[-1]) + 1)
print(tokenizer.decode(gen_tokens[0]))



 Classify the following messages into one of the following categories: hate, neutral, offensive

Message: This is the great weather

Category: neutral


### Side effects
The instruction based finetuning has intresting effects.  
The model has trained its attention and now it can be used for classification of unseen labels.

In [4]:
# Let's try to detect bbc news topic

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Prime minister said no agreement had yet been made between the UK and the European Union.

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v[:,:-1].to('cuda') for k, v in sample.items()}

gen_tokens = model.generate(**sample, 
               temperature=0.2, 
               do_sample=True, 
               max_length=(sample['input_ids'].shape[-1]) + 2)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Prime minister said no agreement had yet been made between the UK and the European Union.

Category: Politics


In [5]:
# Let's try to detect bbc news topic

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Netflix cuts prices for subscribers in more than 30 countries

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v[:,:-1].to('cuda') for k, v in sample.items()}

gen_tokens = model.generate(**sample, 
               temperature=0.2,
               do_sample=False, 
               max_length=(sample['input_ids'].shape[-1]) + 2)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Netflix cuts prices for subscribers in more than 30 countries

Category: Business



In [6]:
# Let's try to detect bbc news topic

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v[:,:-1].to('cuda') for k, v in sample.items()}

gen_tokens = model.generate(**sample, 
               temperature=0.1,
               do_sample=False, 
               max_length=(sample['input_ids'].shape[-1]) + 2)
print(tokenizer.decode(gen_tokens[0]))

 Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category: Sports



# Validate on unseen data

In [7]:
news_dataset = load_dataset('heegyu/news-category-balanced-top10')



to_replace = {'BUSINESS': 'Business', 'ENTERTAINMENT': 'Entertainment', 'FOOD & DRINK': 'Food', 'PARENTING': 'Parenting', 'POLITICS': 'Politics', 'STYLE & BEAUTY': 'Style', 'TRAVEL': 'Travel'}

news_data = pd.DataFrame(news_dataset['train'])[['category', 'short_description']]
news_data = news_data[news_data['category'].isin(to_replace)].sample(100, random_state=22)
news_data['category'] = news_data['category'].replace(to_replace)

news_categories = news_data['category'].unique()

def create_instruction_prompt(text, all_labels):
    prompt =  f''' Classify the following messages into one of the following categories: {', '.join(all_labels)}

Message: {text}

Category:'''
    return prompt


news_data['prompt'] = news_data['short_description'].apply(lambda x: create_instruction_prompt(x[:150], news_categories))

news_data.head()

Found cached dataset json (/home/vetka/.cache/huggingface/datasets/heegyu___json/heegyu--news-category-balanced-top10-5f881f7cd497c7a8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,category,short_description,prompt
47713,Politics,It's undetermined whether the FBI or the Justi...,Classify the following messages into one of t...
43809,Politics,The West Virginia senator's unwillingness to b...,Classify the following messages into one of t...
19830,Food,From fancy Spam crisps to fatty Spam sandwiche...,Classify the following messages into one of t...
38873,Politics,"Students walked out in protest, and say they'l...",Classify the following messages into one of t...
56862,Style,The University of Alabama is praised for its p...,Classify the following messages into one of t...


In [10]:
generations = []
for prompt in tqdm(news_data['prompt']):
    with torch.no_grad():
        sample = tokenizer(prompt, return_tensors='pt')
        sample = {k: v[:, :-1].to('cuda') for k, v in sample.items()}
        sample = {k: v.to('cuda') for k, v in sample.items()}
        gen_tokens = model.generate(**sample, 
                    do_sample=True,
                    temperature=0.4,
                    top_p=0.75,
                    top_k=40,
                    num_beams=4,
                    max_new_tokens=5)
        generations.append(tokenizer.decode(gen_tokens[0][sample['input_ids'].shape[1]:]))

100%|██████████| 100/100 [03:35<00:00,  2.15s/it]


In [11]:
def gen_accuracy(true_labels, gens):
    total = len(true_labels)
    correct = 0
    for i in range(total):
        len_true = len(true_labels[i])
        if true_labels[i].lower() == gens[i].strip()[:len_true].lower():
            correct += 1
    return round(correct / total, 3)
        
    
print(f"NEWS CATEGORIZING ACCURACY: {gen_accuracy(list(news_data['category']), generations)}")

NEWS CATEGORIZING ACCURACY: 0.54
