In [1]:
import transformers
import torch
import pandas as pd
from tqdm.notebook import tqdm

from gpt_quant_modules import GPTJForCausalLM
from custom_datasets import PromptDataset



# Load model

In [2]:
# Checkpoint identifiers: the tokenizer is loaded from the EleutherAI repo,
# the weights from the hivemind 8-bit repo.
TOKENIZER_CHECKPOINT = "EleutherAI/gpt-j-6B"
MODEL_CHECKPOINT = "hivemind/gpt-j-6B-8bit"

tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = GPTJForCausalLM.from_pretrained(MODEL_CHECKPOINT, low_cpu_mem_usage=True)
# Kept as the last expression of the cell: moves the model to the GPU and
# displays the returned module as the cell output.
model.to('cuda')

lm_head Linear(in_features=4096, out_features=50400, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, b

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
 

# Create few-shot prompt

In [3]:
# We will try Hate Speech and Offensive Language Detection with this prompt

def create_prompt(text):
    """Build the few-shot classification prompt for a single message.

    The prompt consists of an instruction line, three labelled example
    messages (one each for neutral, hate and offensive), and finally the
    message to classify followed by an open ``Category:`` field.

    Args:
        text: The message to classify; it is interpolated into the prompt.

    Returns:
        The complete prompt string, ending with ``Category:``.
    """
    instruction = 'Classify the following messages into one of the following categories: neutral, hate, offensive'
    # NOTE(review): the separator is the literal text "/n/n###/n/n" (forward
    # slashes), not newline characters. It is reproduced unchanged here;
    # confirm whether "\n\n###\n\n" was intended before altering it.
    separator = '/n/n###/n/n'
    # (message, category) pairs used as the worked examples.
    examples = (
        ('I could go for a brownie right now', 'neutral'),
        ('What these bitches want from a nigga?, like on some DMX shit', 'hate'),
        ('So says the one who fills an empty hole inside herself by worshiping a royal. Who 4 all u kno may be a raging snotty bitch', 'offensive'),
    )
    shots = ''.join(
        f'Message: {message}\nCategory: {category}\n{separator}\n'
        for message, category in examples
    )
    return f'{instruction}\n\n{shots}Message: {text}\nCategory:'

# Build one demo prompt, tokenize it, and move every returned tensor to the GPU.
demo_message = 'Toda will be rainy and cloudy. Im going to take an umbrella.'
prompt = create_prompt(demo_message)
sample = {
    name: tensor.to('cuda')
    for name, tensor in tokenizer(prompt, return_tensors='pt').items()
}



### Get tokens and token ids for labels 

In [4]:
# Tokenize the three label words; each word in the input string is preceded by a space.
labels = tokenizer.tokenize(' neutral hate offensive')
label_ids = tokenizer.convert_tokens_to_ids(labels)
# Token id -> token string lookup, shown as the cell output.
id2label = {token_id: token for token_id, token in zip(label_ids, labels)}
id2label

{8500: 'Ġneutral', 5465: 'Ġhate', 5859: 'Ġoffensive'}

# Try generation

In [5]:
# max_length is the prompt length plus one, i.e. a single generated token.
prompt_length = sample['input_ids'].shape[-1]
gen_tokens = model.generate(**sample, max_length=prompt_length + 1)
generated_text = tokenizer.decode(gen_tokens[0])
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Classify the following messages into one of the following categories: neutral, hate, offensive

Message: I could go for a brownie right now
Category: neutral
/n/n###/n/n
Message: What these bitches want from a nigga?, like on some DMX shit
Category: hate
/n/n###/n/n
Message: So says the one who fills an empty hole inside herself by worshiping a royal. Who 4 all u kno may be a raging snotty bitch
Category: offensive
/n/n###/n/n
Message: Toda will be rainy and cloudy. Im going to take an umbrella.
Category: neutral


# Validation on val dataset

In [6]:
# Load the validation split and build one few-shot prompt per message.
val_df = pd.read_csv('./data/val.csv')

val_df['prompt'] = val_df['text'].apply(create_prompt)
# Prefix each gold label with a space so it can be compared with the strings
# decoded from label_ids (the labels were tokenized with a leading space).
val_df['label'] = val_df['label'].apply(lambda x: ' ' + x)


predicted_tokens = []
# Inference only: torch.no_grad() stops autograd from recording the forward
# passes, so intermediate activations are not kept alive for a backward pass
# that never happens. This replaces the manual `del` of per-step variables.
with torch.no_grad():
    for prompt_text in tqdm(val_df['prompt'], total=len(val_df)):
        encoded = tokenizer(prompt_text, return_tensors='pt')
        encoded = {k: v.to('cuda') for k, v in encoded.items()}
        # Raw logits at the last position of the single prompt in the batch.
        next_token_logits = model(**encoded).logits[0, -1]
        # Restrict the choice to the label tokens and keep the highest-scoring one.
        best_label_pos = int(torch.argmax(next_token_logits[label_ids]))
        predicted_tokens.append(tokenizer.decode(label_ids[best_label_pos]))



  0%|          | 0/200 [00:00<?, ?it/s]

In [7]:
from sklearn.metrics import accuracy_score

# Share of validation rows whose space-prefixed gold label equals the predicted token string.
fewshot_accuracy = accuracy_score(val_df['label'], predicted_tokens)
print(f"Accuracy of Fewshot predictions: {fewshot_accuracy}")

Accuracy of Fewshot predictions: 0.415


# Check other classification tasks

In [8]:
# Prompt without worked examples: only the category list and one message.
prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Prime minister said no agreement had yet been made between the UK and the European Union.

Category:'''

# Tokenize the prompt and move each returned tensor to the GPU.
encoded_prompt = tokenizer(prompt, return_tensors='pt')
sample = {name: tensor.to('cuda') for name, tensor in encoded_prompt.items()}

# max_length is the prompt length plus one, i.e. a single generated token.
prompt_length = sample['input_ids'].shape[-1]
gen_tokens = model.generate(**sample, max_length=prompt_length + 1)
print(tokenizer.decode(gen_tokens[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Classify the following messages into one of the following categories: Politics, Sports, BusinessSpace, Tech, Social

Message: Prime minister said no agreement had yet been made between the UK and the European Union.

Category: Politics


In [9]:
# Same category list as before, now applied to a news-style headline.

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Netflix cuts prices for subscribers in more than 30 countries

Category:'''

# Tokenize the prompt and move each returned tensor to the GPU.
encoded_prompt = tokenizer(prompt, return_tensors='pt')
sample = {name: tensor.to('cuda') for name, tensor in encoded_prompt.items()}

# max_length is the prompt length plus one, i.e. a single generated token.
prompt_length = sample['input_ids'].shape[-1]
gen_tokens = model.generate(**sample, max_length=prompt_length + 1)
print(tokenizer.decode(gen_tokens[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Classify the following messages into one of the following categories: Politics, Sports, BusinessSpace, Tech, Social

Message: Netflix cuts prices for subscribers in more than 30 countries

Category: Politics


In [10]:
# Let's try to detect a BBC news topic (sports example)

prompt = '''Classify the following messages into one of the following categories: Politics, Sports, Business, Space, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category:'''

sample = tokenizer(prompt, return_tensors='pt')
sample = {k: v.to('cuda') for k, v in sample.items()}

# Greedy decoding (do_sample=False) always takes the highest-scoring token, so a
# sampling temperature has no effect on the result; the previously passed
# temperature=0.1 was dead configuration and has been dropped.
gen_tokens = model.generate(**sample,
               do_sample=False,
               max_length=(sample['input_ids'].shape[-1]) + 1)
print(tokenizer.decode(gen_tokens[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Classify the following messages into one of the following categories: Politics, Sports, BusinessSpace, Tech, Social

Message: Real Madrid's title hopes suffered a further setback after being held by 10-man rivals Atletico Madrid at the Bernabeu.

Category: Politics


### Fine-tuning is much better than few-shot predictions for hate speech and raw (zero-shot) predictions for the other tasks