# Experimentos trocando pesos de apenas uma camada

- Esse notebook é destinado a experimentos rápidos de avaliação da performance do modelo baseline pré-treinado e fine-tuned, trocando apenas os pesos de uma camada (encoder layer) do Transformer;
- O modelo baseline é o BERT base (bert-base-uncased) fine-tuned no dataset do IMDB de classificação de sentimentos;
- Para a troca de pesos, utilizam-se os pesos do Wav2Vec2 e inicializações aleatórias (Uniforme, Normal e Xavier). Dessa forma, podemos medir a degradação de performance do modelo e verificar se os pesos do Wav2Vec2 trazem algum benefício — que, no caso, seria uma degradação menor de performance em comparação com os outros tipos de inicialização;

---
## Bibliotecas e Instalações necessárias

In [None]:
! pip -q install transformers

In [None]:
import os
import copy
import random
import torch
import torch.nn.functional as F
import numpy as np

from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2ForCTC
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
def reset_seed(seed=123):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch's global generator with the same value.

    Args:
        seed: Integer seed. Defaults to 123, the value this notebook has
            always used, so ``reset_seed()`` behaves exactly as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

reset_seed()

---
## Preparando o dataset do IMDB

Download do dataset

In [None]:
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

In [None]:
# Number of shuffled training examples held out as the validation split.
max_valid = 5000

def load_texts(folder):
    """Read every file in ``folder`` and return the contents as a list of strings.

    Files are read in sorted filename order. ``os.listdir`` makes no ordering
    guarantee, so without sorting the seeded shuffle applied to these lists
    would produce a different train/validation split on different machines.

    Args:
        folder: Path of a directory that contains only readable text files.

    Returns:
        A list with one string per file, ordered by filename.
    """
    texts = []
    for name in sorted(os.listdir(folder)):
        # Decode explicitly as UTF-8 instead of relying on the platform default.
        with open(os.path.join(folder, name), encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Load the raw reviews; each folder holds one class (pos / neg).
x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

# Labels are booleans: True for positive reviews, False for negative ones.
x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg
y_train = [True] * len(x_train_pos) + [False] * len(x_train_neg)
y_test = [True] * len(x_test_pos) + [False] * len(x_test_neg)

# Shuffle the training pairs before carving out the train/validation split.
c = list(zip(x_train, y_train))
random.shuffle(c)
x_train, y_train = zip(*c)

# The last `max_valid` shuffled examples become the validation set.
x_valid = x_train[-max_valid:]
y_valid = y_train[-max_valid:]
x_train = x_train[:-max_valid]
y_train = y_train[:-max_valid]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

print('3 primeiras amostras treino:')
for x, y in zip(x_train[:3], y_train[:3]):
    print(y, x[:100])

print('3 últimas amostras treino:')
for x, y in zip(x_train[-3:], y_train[-3:]):
    print(y, x[:100])

print('3 primeiras amostras validação:')
# Pair validation texts with validation labels (this preview previously used y_test).
for x, y in zip(x_valid[:3], y_valid[:3]):
    print(y, x[:100])

print('3 últimas amostras validação:')
for x, y in zip(x_valid[-3:], y_valid[-3:]):
    print(y, x[:100])

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class IMDBDataset(Dataset):
  """Map-style dataset that pairs each review text with an integer label.

  Args:
    x: Indexable sequence of review texts.
    y: Indexable sequence of labels, one per text; each is converted with
      ``int()`` when an item is fetched.

  Raises:
    ValueError: If ``x`` and ``y`` do not have the same length.
  """
  def __init__(self, x, y):
    if len(x) != len(y):
      raise ValueError(f"x and y must have the same length, got {len(x)} and {len(y)}")
    self.x = x
    self.y = y

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    # Returns (text, label); int() turns the boolean labels into 0/1.
    return self.x[idx], int(self.y[idx])

In [None]:
def create_dataloader(x, y, tokenizer, batch_size, shuffle=False, max_length=250):
  """Build a DataLoader over (text, label) pairs that tokenizes each batch on the fly.

  Args:
    x: Sequence of texts.
    y: Sequence of labels, aligned with ``x``.
    tokenizer: Callable used to encode a batch of texts; it is called with
      ``padding='longest'``, ``truncation=True``, ``max_length`` and
      ``return_tensors='pt'``.
    batch_size: Number of examples per batch.
    shuffle: Whether the DataLoader shuffles the examples.
    max_length: Truncation length forwarded to the tokenizer.

  Returns:
    A DataLoader whose batches are ``(input_ids, attention_mask, labels)``.
  """
  def collate(samples):
    # Split the list of (text, label) pairs into two parallel tuples.
    texts, labels = zip(*samples)
    encoded = tokenizer(
      texts,
      padding='longest',
      truncation=True,
      max_length=max_length,
      return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask'], torch.LongTensor(labels)

  return DataLoader(
    IMDBDataset(x, y),
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate,
  )

In [None]:
def test_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader``, print its accuracy and return it.

    The model is moved to the module-level ``device`` and put in eval mode.
    Each batch is expected to be ``(tokens, mask, label)``, and the model
    output must expose its scores under the ``'logits'`` key.

    Args:
        model: Model called as ``model(tokens, mask)``.
        test_loader: DataLoader yielding ``(tokens, mask, label)`` batches;
            its ``dataset`` length is used as the accuracy denominator.

    Returns:
        The accuracy as a Python float in ``[0, 1]``.

    Raises:
        ValueError: If the loader's dataset is empty.
    """
    n_total = len(test_loader.dataset)
    if n_total == 0:
        raise ValueError("Cannot compute accuracy on an empty dataset.")

    model.to(device)
    model.eval()
    n_correct = 0
    with torch.no_grad():
        for tokens, mask, label in tqdm(test_loader):
            tokens = tokens.to(device)
            mask = mask.to(device)
            label = label.to(device)

            logits = model(tokens, mask)['logits']
            prediction = logits.argmax(dim=1)

            # Accumulate as a Python int so the final ratio is an exact count ratio.
            n_correct += (prediction == label).sum().item()

    test_acc = n_correct / n_total

    print("ACC: ", test_acc)
    return test_acc

---
---
---

In [None]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device('cpu')
print(f"Using {device}")

---

## Load Wav2Vec

In [None]:
 # Load the pretrained "facebook/wav2vec2-base" checkpoint; later cells read one of its encoder layers.
 # NOTE(review): this line starts with a stray leading space. It appears to run in the notebook,
 # but it would be an IndentationError in a plain Python script — consider removing it.
 wav2vec2 = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base")

## Load BERT

Utilizando os pesos do BERT pré-treinado ('bert-base-uncased') fine-tuned no dataset do IMDB:

In [None]:
import gdown

In [None]:
url = "https://drive.google.com/u/0/uc?id=1jTI9u2nvnHXx2AYNLFjgEG1j4a8YXn9Z"
checkpoint_path = 'IMDBBert_finetuned.pt'

# Skip the download when the checkpoint is already on disk, mirroring the
# `wget -nc` used for the dataset, so re-running the notebook stays cheap.
if not os.path.exists(checkpoint_path):
    gdown.download(url, checkpoint_path, quiet=False)

Utilizando os pesos do BERT já fine-tuned da Aula 6:

In [None]:
# Instantiate the architecture before loading weights into it; `bert_imdb` was
# previously used without ever being created in this notebook.
# NOTE(review): assumes the checkpoint was saved from BertForSequenceClassification
# with the default classification head size — load_state_dict will raise on a mismatch.
bert_imdb = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# map_location='cpu' lets the checkpoint load on machines without a GPU.
states = torch.load('./IMDBBert_finetuned.pt', map_location='cpu')
bert_imdb.load_state_dict(states)

# IMDB: Baseline: Evaluate original fine-tuned BERT

In [None]:
# NOTE(review): `hparams` was never defined in this notebook, so this cell failed on a
# fresh kernel. The values below are assumptions — max_length mirrors the default of
# create_dataloader; confirm both against the settings used during fine-tuning.
hparams = {'bs': 32, 'max_length': 250}

test_loader = create_dataloader(x_test, y_test, tokenizer, hparams['bs'], shuffle=False, max_length=hparams['max_length'])

In [None]:
test_model(bert_imdb, test_loader)

# IMDB: Teste1: Evaluate original fine-tuned BERT with a layer from Wav2Vec2
- Teste sugerido pelo Rodrigo durante a primeira apresentação do projeto

In [None]:
# Work on a deep copy so that loading swapped weights never touches bert_imdb itself.
bert_changed = copy.deepcopy(bert_imdb)

Qual o layer do Wav2Vec que vamos utilizar?

In [None]:
# Name mapping used only to rename keys when loading the weights:
# Wav2Vec2 encoder-layer parameter name -> BERT encoder-layer parameter name.
# Every mapped sub-module contributes a `weight` and a `bias` entry.
MAP_WAV2VEC_TO_BERT_NAMES = {
    f'{w2v_module}.{param}': f'{bert_module}.{param}'
    for w2v_module, bert_module in (
        ('attention.k_proj', 'attention.self.key'),
        ('attention.v_proj', 'attention.self.value'),
        ('attention.q_proj', 'attention.self.query'),
        ('attention.out_proj', 'attention.output.dense'),
        ('layer_norm', 'attention.output.LayerNorm'),
        ('feed_forward.intermediate_dense', 'intermediate.dense'),
        ('feed_forward.output_dense', 'output.dense'),
        ('final_layer_norm', 'output.LayerNorm'),
    )
    for param in ('weight', 'bias')
}

In [None]:
# Take the encoder layer at index 11 of the Wav2Vec2 model and collect its state dict.
# NOTE(review): despite the variable name, the keys consumed through MAP_WAV2VEC_TO_BERT_NAMES
# cover attention, feed-forward and layer-norm parameters, i.e. the whole encoder layer.
wav2vec_11_attention_layer = wav2vec2.wav2vec2.encoder.layers[11]
w2v_layer_states = wav2vec_11_attention_layer.state_dict()

In [None]:
# Start from a fresh state dict of the fine-tuned model instead of mutating the shared
# `states` in place: every experiment then begins from the same baseline weights no
# matter which cells ran before. Entries are replaced, never modified in place, so
# bert_imdb's own parameters are not affected.
states = bert_imdb.state_dict()
for w2v_name, w2v_tensor in w2v_layer_states.items():
    bert_name = 'bert.encoder.layer.11.' + MAP_WAV2VEC_TO_BERT_NAMES[w2v_name]
    states[bert_name] = w2v_tensor

In [None]:
# bert_changed.bert.encoder.layer[11].output.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-5, elementwise_affine=True)
# bert_changed.bert.encoder.layer[11].attention.output.LayerNorm = torch.nn.LayerNorm((768,), eps=1e-5, elementwise_affine=True)

In [None]:
# Load the state dict whose 'bert.encoder.layer.11.*' entries were replaced in the loop above.
bert_changed.load_state_dict(states)

In [None]:
test_model(bert_changed, test_loader)

# IMDB: Teste 2: Evaluate original fine-tuned BERT with same layer initialized with RANDN (Normal Distribution)

In [None]:
bert_changed = copy.deepcopy(bert_imdb)

In [None]:
# Reseed so the random replacement weights are the same every time this cell runs.
reset_seed()

# Start from a fresh state dict of the fine-tuned model instead of mutating the shared
# `states` in place, so this experiment does not depend on which cells ran before.
states = bert_imdb.state_dict()
for w2v_name, w2v_tensor in w2v_layer_states.items():
    bert_name = 'bert.encoder.layer.11.' + MAP_WAV2VEC_TO_BERT_NAMES[w2v_name]
    # The Wav2Vec2 tensor only supplies shape/dtype; values are drawn from N(0, 1).
    states[bert_name] = torch.randn_like(w2v_tensor)

In [None]:
bert_changed.load_state_dict(states)

In [None]:
test_model(bert_changed, test_loader)

# IMDB: Teste 3: Evaluate original fine-tuned BERT with same layer initialized with RAND (Uniform Distribution)

In [None]:
bert_changed = copy.deepcopy(bert_imdb)

In [None]:
# Reseed so the random replacement weights are the same every time this cell runs.
reset_seed()

# Start from a fresh state dict of the fine-tuned model instead of mutating the shared
# `states` in place, so this experiment does not depend on which cells ran before.
states = bert_imdb.state_dict()
for w2v_name, w2v_tensor in w2v_layer_states.items():
    bert_name = 'bert.encoder.layer.11.' + MAP_WAV2VEC_TO_BERT_NAMES[w2v_name]
    # The Wav2Vec2 tensor only supplies shape/dtype; values are drawn from U[0, 1).
    states[bert_name] = torch.rand_like(w2v_tensor)

In [None]:
bert_changed.load_state_dict(states)

In [None]:
test_model(bert_changed, test_loader)

# IMDB: Teste 4: Evaluate original fine-tuned BERT with same layer initialized using Xavier Init

In [None]:
bert_changed = copy.deepcopy(bert_imdb)

In [None]:
# Re-initialise every parameter of encoder layer 11 in `bert_changed`:
# Xavier-uniform weights and a constant 0.01 bias for the linear sub-modules,
# default parameters for the two LayerNorms.
for i, child in enumerate(bert_changed.bert.encoder.layer.children()):
    if i != 11:
        continue

    print(f"Initializing head {i}")

    # Listed in the order they are initialised, which fixes the order random numbers are drawn.
    dense_modules = (
        child.attention.self.query,    # self-attention
        child.attention.self.key,
        child.attention.self.value,
        child.attention.output.dense,  # attention output projection
        child.intermediate.dense,      # feed-forward
        child.output.dense,
    )
    for dense in dense_modules:
        torch.nn.init.xavier_uniform_(dense.weight)
        dense.bias.data.fill_(0.01)

    for norm in (child.attention.output.LayerNorm, child.output.LayerNorm):
        norm.reset_parameters()

In [None]:
test_model(bert_changed, test_loader)