In [None]:
%tensorflow_version 1.x
# --- Experiment configuration (everything a reader might tune) ---
SIZE = "2k4"  # dataset size tag; part of the holdout CSV names and of the results file name
C = "b"  # dataset variant tag; part of the holdout CSV names
FOLDER = "/" #path to model folder
FOLDS_PATH = "/" #path where the folds are saved
DATA_FOLDER = "/" #path where the entire dataset is saved
BATCH_SIZE = 32
# Alternative checkpoint (swap with the two lines below to use it):
# LANG = "BERT-MULTILINGUAL"
# model_name = "bert-base-multilingual-uncased"
LANG = "BERT-PORTUGUESE"  # label used only in the results file name
model_name = "neuralmind/bert-base-portuguese-cased"
# Architecture family of `model_name`. Later cells read `model_type` (tokenizer
# wrapper and padding side) but it was never defined, which raised NameError on
# a fresh kernel. Both checkpoints listed above are BERT models.
model_type = "bert"
# Human-readable class labels; presumably keyed by the number of classes.
# Not referenced by any other cell visible in this notebook.
CLASS_NAME = {2:['Sem Risco', 'Risco Potencial'], 3:['Sem Risco', 'Risco Potencial', 'Risco Alto']}

In [None]:
!pip install transformers

# Carregando o Dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (this import is only available on Colab).
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
from time import ctime

# HuggingFace Transformers

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig

## Integração com FastAi

In [None]:
from fastai import *
from fastai.text import *
from fastai.callbacks import *

**Tokenizador:** classe responsável por tokenizar o texto, criada a partir da classe `BaseTokenizer` do fastai. Além de tokenizar o texto utilizando um tokenizador já treinado, ela trunca a sequência ao tamanho máximo permitido e adiciona os tokens especiais `[CLS]` e `[SEP]` necessários para o BERT.


In [None]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Adapter that lets fast.ai drive a Hugging Face ``PreTrainedTokenizer``.

    Args:
        pretrained_tokenizer: tokenizer exposing ``tokenize``, ``cls_token`` and ``sep_token``.
        seq_len: maximum length of the returned token list, special tokens included.
        model_type: architecture name; ``'roberta'`` enables ``add_prefix_space``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, pretrained_tokenizer, seq_len=512, model_type = 'bert', **kwargs):
        self.model_type = model_type
        self.max_seq_len = seq_len
        self._pretrained_tokenizer = pretrained_tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance hands back the instance itself, whatever the arguments.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, truncate it, and wrap it in the CLS/SEP special tokens."""
        backend = self._pretrained_tokenizer
        first_token = backend.cls_token
        last_token = backend.sep_token
        # Two slots are reserved for the special tokens added below.
        budget = self.max_seq_len - 2
        extra_args = {'add_prefix_space': True} if self.model_type in ['roberta'] else {}
        pieces = backend.tokenize(t, **extra_args)[:budget]
        return [first_token, *pieces, last_token]
        
# Hugging Face tokenizer that matches the checkpoint named by `model_name`.
transformer_tokenizer = BertTokenizer.from_pretrained(model_name)

# fast.ai-compatible wrapper around it (adds CLS/SEP and truncates).
transformer_base_tokenizer = TransformersBaseTokenizer(
    pretrained_tokenizer=transformer_tokenizer,
    model_type=model_type,
)

# Both rule lists are passed empty, so no pre/post rules are supplied to fast.ai's Tokenizer.
fastai_tokenizer = Tokenizer(
    tok_func=transformer_base_tokenizer,
    pre_rules=[],
    post_rules=[],
)

**Numericalizer:** classe responsável por converter tokens em índices (inteiros) que serão utilizados como entrada para o BERT. A conversão em índices é feita com base em um vocabulário pré-definido.

In [None]:
class TransformersVocab(Vocab):
    """fast.ai ``Vocab`` that delegates token/id conversion to a transformers tokenizer."""

    def __init__(self, tokenizer):
        # The parent is initialised with an empty `itos`; lookups go through `tokenizer`.
        super().__init__(itos = [])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Map the tokens in `t` to their ids."
        token_ids = self.tokenizer.convert_tokens_to_ids(t)
        return token_ids

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Map the ids in `nums` back to tokens, joined by `sep` unless `sep` is None."
        id_list = np.array(nums).tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(id_list)
        if sep is None:
            return tokens
        return sep.join(tokens)

# FastAi

In [None]:
# Short alias for the configured batch size; passed as `bs=` when the databunches are built.
bs = BATCH_SIZE

In [None]:
# Tokenization step. BOS/EOS insertion is switched off to prevent the
# processor from adding unnecessary tokens.
tokenize_processor = TokenizeProcessor(
    tokenizer=fastai_tokenizer,
    include_bos=False,
    include_eos=False,
)

# Numericalization step, backed by the transformers tokenizer's vocabulary.
transformer_vocab = TransformersVocab(tokenizer=transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

# Order matters: text is tokenized first, then the tokens are converted to ids.
transformer_processor = [tokenize_processor, numericalize_processor]

In [None]:
# Pad at the start of the sequence only for XLNet; every other model type pads at the end.
pad_first = model_type in ['xlnet']
# Id of the padding token, taken from the pretrained tokenizer.
pad_idx = transformer_tokenizer.pad_token_id

# Encapsulando o BERT

In [None]:
class BERT(nn.Module):
    """Wrapper module whose forward pass returns only the first output of the wrapped model.

    NOTE(review): `forward` passes `input_ids` alone, without an attention mask,
    so padded positions are presumably treated like real tokens - confirm this is intended.
    """

    def __init__(self, transformer_model):
        super().__init__()
        self.transformer = transformer_model

    def forward(self, input_ids):
        # The wrapped model returns an indexable output; element 0 is used as the logits.
        outputs = self.transformer(input_ids)
        return outputs[0]

# Carregando os folds e realizando o treino em cada um

In [None]:
from transformers import AdamW

# Each fold is stored as a pair of files (train<i>.csv / test<i>.csv), so the
# number of folds is half the number of regular files found in FOLDS_PATH.
fold_files = [name for name in os.listdir(FOLDS_PATH) if os.path.isfile(os.path.join(FOLDS_PATH, name))]
folds = len(fold_files) // 2

# Per-fold metrics; they are averaged when the results file is written.
val_acc = []
val_prec = []
val_rec = []
val_f1 = []

for fold_idx in range(folds):

  print("Fold",fold_idx,'\n')
  # os.path.join keeps the paths valid whether or not FOLDS_PATH ends with a separator.
  train_df = pd.read_csv(os.path.join(FOLDS_PATH, "train" + str(fold_idx) + ".csv"), sep="\t", index_col=False)
  test_df = pd.read_csv(os.path.join(FOLDS_PATH, "test" + str(fold_idx) + ".csv"), sep="\t", index_col=False)
  # Number of labels for the classification head, taken from the fold's test labels.
  CLASSES = len(test_df['y'].unique())

  # A fresh pretrained model per fold, so no weights carry over between folds.
  transformer_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=CLASSES)
  transformer_model = BERT(transformer_model=transformer_model)
  transformer_model = transformer_model.cuda()

  # 10% of the training rows (fixed seed) become the validation set.
  # test_df is attached through add_test only.
  databunch = (TextList.from_df(train_df, cols='text', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=9999)
             .label_from_df(cols= 'y')
             .add_test(test_df)
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

  learner = Learner(databunch,
                  transformer_model,
                  opt_func = lambda params: AdamW(params, correct_bias=False),
                  metrics=[accuracy, Precision(average="macro"), Recall(average="macro"), FBeta(average="macro", beta=1)])
  # learner.lr_find()
  # learner.recorder.plot()
  learner.fit_one_cycle(5, max_lr=1e-5)

  # NOTE(review): validate() is called without arguments, so these metrics presumably
  # come from the 10% validation split rather than from test_df - confirm this is intended.
  loss_value, acc_value, prec_value, rec_value, f1_value = learner.validate()
  val_acc.append(acc_value.item())
  val_prec.append(prec_value.item())
  val_rec.append(rec_value.item())
  val_f1.append(f1_value.item())
  # Free the learner's internals before the next fold is loaded.
  learner.destroy()

# Classificador com Holdout

In [None]:
# Release the learner left over from the cross-validation cell, if there is one.
# The guard keeps this cell runnable when that cell was skipped or found no folds
# (in both cases `learner` does not exist and the bare call raised NameError).
if "learner" in globals():
    learner.destroy()

# os.path.join keeps the paths valid whether or not DATA_FOLDER ends with a separator.
train_df = pd.read_csv(os.path.join(DATA_FOLDER, "ideacao-{}-{}-train.csv".format(SIZE, C)), sep="\t", index_col=False)
test_df = pd.read_csv(os.path.join(DATA_FOLDER, "ideacao-{}-{}-test.csv".format(SIZE, C)), sep="\t", index_col=False)

# Number of labels for the classification head; also used in the results file name.
CLASSES = len(test_df['y'].unique())

transformer_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=CLASSES)
transformer_model = BERT(transformer_model=transformer_model)
transformer_model = transformer_model.cuda()

# 10% of the training rows (fixed seed) become the validation set.
# test_df is attached through add_test only.
databunch = (TextList.from_df(train_df, cols='text', processor=transformer_processor)
            .split_by_rand_pct(0.1,seed=9999)
            .label_from_df(cols= 'y')
            .add_test(test_df)
            .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

learner = Learner(databunch,
                transformer_model,
                opt_func = lambda params: AdamW(params, correct_bias=False),
                metrics=[accuracy, Precision(average="macro"), Recall(average="macro"), FBeta(average="macro", beta=1)])

learner.fit_one_cycle(5, max_lr=1e-5)
# learner.lr_find()
# learner.recorder.plot()

# NOTE(review): validate() is called without arguments, so these metrics presumably
# come from the 10% validation split rather than from test_df - confirm this is intended.
loss, acc, prec, rec, f1 = learner.validate()

# Salvando resultados no arquivo

In [None]:
# Timestamp for the file name, with the spaces of ctime() replaced by dashes.
created_at = ctime().replace(' ', '-')
name = "{}-{}-{}-{}-class-results.csv".format(LANG, SIZE, created_at, CLASSES)

# Create <FOLDER>/results on first use instead of failing when it is missing.
results_dir = os.path.join(FOLDER, 'results')
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, name), 'w') as f:
  f.write("folds, acc, prec, rec, f1\n")
  # Row 1: mean of the per-fold metrics collected during cross-validation.
  f.write("{},{},{},{},{}\n".format(folds, np.mean(val_acc), np.mean(val_prec), np.mean(val_rec), np.mean(val_f1)))
  # Row 2: holdout metrics. float() converts them to plain numbers, matching the
  # .item() conversion applied to the per-fold values; without it the row
  # contained "tensor(...)" text.
  f.write("{},{},{},{},{}\n".format('-', float(acc), float(prec), float(rec), float(f1)))
