<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Fernando_Fortes_Granado/Pruning_GPT_J_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sun Jul  3 18:23:00 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    40W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
from transformers import AutoTokenizer, GPTNeoXForCausalLM
from transformers import GPTJForCausalLM, GPTNeoXTokenizerFast
from transformers import GPT2Tokenizer, GPT2Model

model = GPTJForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

device = torch.device("cuda")
model.to(device)

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
  

In [7]:
from typing import *
from datasets import load_dataset
import numpy as np

class Dataset:
  def __init__(self, dataset, max_tokens):
    self.max_tokens = max_tokens
    self.dataset = dataset
    self.token_dataset = [
        (
            tokenizer(dataset[idx]["text"], return_tensors="pt"),
            "Positive" if dataset[idx]["label"] else "Negative",
        ) for idx in range(len(dataset))
    ]
    num_tokens = np.array([self.token_dataset[idx][0].input_ids.shape[1] for idx in range(len(dataset))])
    self.dataset_idx = np.argwhere(num_tokens <= max_tokens).reshape(-1)
  
  def __getitem__(self, idx):
    return self.token_dataset[int(self.dataset_idx[idx])]

  def get_raw_item(self, idx):
    item = self.dataset[int(self.dataset_idx[idx])]
    return item["text"], "Positive" if item["label"] else "Negative"

  def __len__(self):
    return len(self.dataset_idx)

  
class InferencePromptGen:
  def __init__(self, learning_examples: List[Tuple[str, str]], dataset: Dataset):
    self.dataset = dataset
    self.learning_examples = learning_examples
    self.fixed_text = self._get_fixed_text()

  def __getitem__(self, idx):
    return f"{self.fixed_text}\n\n{self._get_inference_text(idx)}", self.dataset[idx][1]

  def _get_fixed_text(self):
    # TODO: pre-salvar os tokens das amostras e dos prompts
    instruction = "Instruction: Given a movie review, answer if the sentiment of the review is positive or negative."
    examples = "\n\n".join([f"""Example {i}:\nReview: {text}\nSentiment: {label}""" for i, (text, label) in enumerate(self.learning_examples)])
    return f"{instruction}\n\n{examples}"

  def _get_inference_text(self, idx):
    return f"Example {len(self.learning_examples)}:\nReview: {self.dataset.get_raw_item(idx)[0]}\nSentiment"


def predict(prompt, model, tokenizer):
  tokenized = tokenizer(prompt, return_tensors="pt")
  input_ids = tokenized.input_ids.to(device)
  attention_mask = tokenized.attention_mask.to(device)

  with torch.no_grad():
    gen_tokens = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        temperature=1,
        max_length=2048,
    )
  gen_text = tokenizer.batch_decode(gen_tokens)[0]
  return gen_text.split("\n")[len(prompt.split("\n")) - 1].split()[1]


imdb = load_dataset("imdb")
MAX_TOKENS = 300
NUM_INFERENCE_SAMPLES = 200
NUM_LEARNING_SAMPLES = 7

train_dataset = Dataset(
    dataset=imdb["train"].shuffle(seed=42),
    max_tokens=MAX_TOKENS,
)

test_dataset = Dataset(
    dataset=imdb["test"].shuffle(seed=42),
    max_tokens=MAX_TOKENS,
)

test_prompt_gen = InferencePromptGen(
    learning_examples=[train_dataset.get_raw_item(i) for i in range(NUM_LEARNING_SAMPLES)],
    dataset=test_dataset,
)

samples = list(range(200))
prompts = [test_prompt_gen[i][0] for i in samples]
labels = [test_prompt_gen[i][1] for i in samples]

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f700cd6f222bd13f.arrow
Token indices sequence length is longer than the specified maximum sequence length for this model (2174 > 2048). Running this sequence through the model will result in indexing errors
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b23cfeb68a931a8d.arrow


In [8]:
#last_hidden_states = outputs.last_hidden_state
# torch.cuda.empty_cache()

# tokenized = tokenizer(prompts[5], return_tensors="pt")
# input_ids = tokenized.input_ids.to(device)
# attention_mask = tokenized.attention_mask.to(device)

# with torch.no_grad():
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
# loss = outputs.loss
# logits = outputs.logits

In [9]:
# Perplexity
# torch.exp(loss)

In [10]:
import torch
torch.cuda.empty_cache()

def evaluate(model, tokenizer, prompts: List[str], labels: List[str]):
  num_correct_preds = 0
  for idx in range(len(prompts)):
    if ((idx % 10) == 0) and (idx != 0):
      print(f"Number of samples evaluated: {idx}, Number of correct preds: {num_correct_preds}, Accuracy: {num_correct_preds / idx}")
    output = predict(prompts[idx], model, tokenizer)
    print(labels[idx], output, labels[idx] == output)

    if labels[idx] == output:
      num_correct_preds += 1
    else:
      print("\n_____________________________\n")
      print("PROMPT")
      print(prompts[idx])
      print("\n_____________________________\n")
      print("LABEL")
      print(labels[idx])
      print("OUTPUT")
      print(output)
      print("\n_____________________________\n")
  return num_correct_preds / len(prompts)


def evaluate_perplexity(model, tokenizer, dataset: Dataset):
  acc_loss = 0
  for idx in range(len(dataset)):
    if ((idx % 10) == 0) and (idx != 0):
      print(f"Number of samples evaluated: {idx}, Average perplexity: {torch.exp(acc_loss / idx)}")

    input_ids = dataset[idx][0].input_ids.to(device)
    attention_mask = dataset[idx][0].attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    loss = outputs.loss
    logits = outputs.logits
    acc_loss += loss

  return torch.exp(acc_loss / len(dataset))

In [11]:
#evaluate(model, tokenizer, prompts, labels)

Positive Positive True
Negative Negative True
Positive Positive True


KeyboardInterrupt: ignored

In [None]:
torch.cuda.empty_cache()
train_dataset = Dataset(
    dataset=imdb["train"].shuffle(seed=42),
    max_tokens=2048,
)
evaluate_perplexity(model, tokenizer, train_dataset)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f700cd6f222bd13f.arrow


Number of samples evaluated: 10, Average perplexity: 20.6875
Number of samples evaluated: 20, Average perplexity: 20.84375
Number of samples evaluated: 30, Average perplexity: 20.4375
Number of samples evaluated: 40, Average perplexity: 19.3125
Number of samples evaluated: 50, Average perplexity: 20.046875
Number of samples evaluated: 60, Average perplexity: 19.921875
Number of samples evaluated: 70, Average perplexity: 19.625
Number of samples evaluated: 80, Average perplexity: 19.703125
Number of samples evaluated: 90, Average perplexity: 19.421875
Number of samples evaluated: 100, Average perplexity: 19.234375
Number of samples evaluated: 110, Average perplexity: 19.28125
Number of samples evaluated: 120, Average perplexity: 19.703125
Number of samples evaluated: 130, Average perplexity: 19.8125
Number of samples evaluated: 140, Average perplexity: 19.8125
Number of samples evaluated: 150, Average perplexity: 19.734375
Number of samples evaluated: 160, Average perplexity: 19.546875


In [None]:
#last_hidden_states = outputs.last_hidden_state
torch.cuda.empty_cache()

tokenized = tokenizer(prompts[0], return_tensors="pt")
input_ids = tokenized.input_ids.to(device)
attention_mask = tokenized.attention_mask.to(device)

with torch.no_grad():
  output = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      # pad_token_id=tokenizer.eos_token_id,
      # do_sample=False,
      # temperature=1,
      # max_length=2048,
  )
output

In [None]:
tokenizer(prompts[0], return_tensors="pt").attention_mask.shape

In [None]:
# Verificar balanceamento
# Obter baseline (perplexidade) com predict de proxima palavra no IMDB em paralelo. Fixar N amostras para obter essas avaliações.
# Implementar um pruning de uma cabeça especificada por um indice

In [None]:
# Acuracia no treino: 93%, NUM_INFERENCE_SAMPLES: 200 5 examples, 93,5% com 7 exemplos

In [None]:
# Acuracia no teste: 93,5%, NUM_INFERENCE_SAMPLES: 200 5 examples