In [1]:
%tensorflow_version 1.x
SIZE = "7k"
C = "b"
CLASSES = 2
FOLDER = ""
FOLDS_PATH = "".format(SIZE,C)
DATA_FOLDER = ""
BATCH_SIZE = 48
# LANG = "BERT-MULTILINGUAL"
# model_name = "bert-base-multilingual-cased"
# LANG = "BERT-PORTUGUESE"
# model_name = "neuralmind/bert-base-portuguese-cased"

# BATCH_SIZE = 4
# LANG = "BERT-LARGE-MULTILINGUAL"
# model_name = "bert-large-cased"
# LANG = "BERT-LARGE-PORTUGUESE"
# model_name = "neuralmind/bert-large-portuguese-cased"
CLASS_NAME = {2:['Sem Risco', 'Risco Potencial'], 3:['Sem Risco', 'Risco Potencial', 'Risco Alto']}

TensorFlow 1.x selected.


In [2]:
!pip install transformers



# Carregando o Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from time import ctime

# HuggingFace Transformers

In [5]:
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig

## Integração com FastAi

In [6]:
from fastai import *
from fastai.text import *
from fastai.callbacks import *

**Tokenizador:** classe responsável por tokenizar o texto, sendo criada a partir da classe BaseTokenizer do FastAi. Além de tokenizar o texto utilizando um tokenizador já treinado, o mesmo adiciona os tokens especiais CLS e SEP necessários para o BERT.


In [7]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer, seq_len=512, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = seq_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        return [CLS] + tokens + [SEP]
        
transformer_tokenizer = BertTokenizer.from_pretrained(model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

**Numericalizer:** classe responsável por converter tokens em índices(inteiros) que serão utilizados como entrada para o BERT. A conversão em índices é feita baseando-se em um vocabulário pré-definido.

In [8]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)

# FastAi

In [9]:
bs = BATCH_SIZE

In [10]:
transformer_vocab =  TransformersVocab(tokenizer=transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)
# False para prevenir a adição de tokens desnecessários pelo processador
tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, 
                                       include_bos=False, 
                                       include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

In [11]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

# Encapsulando o BERT

In [12]:
class BERT(nn.Module):
  
    def __init__(self, transformer_model):
        super(BERT,self).__init__()
        self.transformer = transformer_model
        
    def forward(self, input_ids):
        # Return only the logits from the transfomer
        logits = self.transformer(input_ids)[0]   
        return logits

#Carregando os folds e realizando o treino em cada um

In [None]:
from transformers import AdamW

folds = int(len([name for name in os.listdir(FOLDS_PATH) if os.path.isfile(os.path.join(FOLDS_PATH,name))]) / 2)

val_acc = []
val_prec = []
val_rec = []
val_f1 = []
conf_matrix = np.zeros(CLASSES*CLASSES).reshape(CLASSES, -1)

for f in range(folds):
  
  print("Fold",f,'\n')
  train_df = pd.read_csv(FOLDS_PATH + "train"+str(f) + ".csv", sep="\t", index_col=False)
  test_df = pd.read_csv(FOLDS_PATH + "test"+str(f) + ".csv", sep="\t", index_col=False)
  y_true = test_df['y'].tolist()
  texts = test_df['text'].tolist()
  CLASSES = len(test_df['y'].unique())

  transformer_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=CLASSES)
  transformer_model = BERT(transformer_model=transformer_model)
  transformer_model = transformer_model.cuda()

  
  databunch = (TextList.from_df(train_df, cols='text', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=9999)
             .label_from_df(cols= 'y')
             .add_test(test_df)
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))
  
  learner = Learner(databunch, 
                  transformer_model, 
                  opt_func = lambda input: AdamW(input, correct_bias=False), 
                  metrics=[accuracy, Precision(average="macro"), Recall(average="macro"), FBeta(average="macro", beta=1)])
  # learner.lr_find()
  # learner.recorder.plot()
  learner.fit_one_cycle(1, max_lr=1e-5)
  # learner.fit_one_cycle(2, max_lr=1e-6)
  # learner.fit_one_cycle(3, max_lr=1e-6)

  loss_value, acc_value, prec_value, rec_value, f1_value = learner.validate()
  val_acc.append(acc_value.item())
  val_prec.append(prec_value.item())
  val_rec.append(rec_value.item())
  val_f1.append(f1_value.item())

  preds = [learner.predict(text) for text in texts]
  p = np.array(preds)
  y_pred = [int(x) for x in p[:,0]]
  
  conf_matrix += confusion_matrix(y_true, y_pred)
  




  learner.destroy()
print(conf_matrix)

Fold 0 



Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.691455,0.665068,0.618297,,0.5,0.382066,11:39


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  idx_min = (t != self.pad_idx).nonzero().min()


this Learner object self-destroyed - it still exists, but no longer usable
Fold 1 



Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at 

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time


In [None]:
saida = {'acc': val_acc, 'prec': val_prec, 'rec': val_rec, 'f1':val_f1}
df = pd.DataFrame(saida)
res = {'acc_avg': np.mean(val_acc), 'prec_avg': np.mean(val_prec), 'rec_avg':np.mean(val_rec), 'f1_avg':np.mean(val_f1), 'acc_std': np.std(val_acc), 'prec_std': np.std(val_prec), 'rec_std':np.std(val_rec), 'f1_std':np.std(val_f1)}

In [None]:
res

In [None]:
created_at = ctime()
created_at = created_at.replace(' ', '-')
mean_std = pd.DataFrame(res, index=[0])
mean_std.head()
name = "{}-{}-{}-{}-class-results.csv".format(LANG, SIZE, created_at, CLASSES)
path = os.path.join(FOLDER, 'results', name)
mean_std.to_csv(path)

In [None]:
 p = np.array(preds)
y_pred = [int(x) for x in p[:,0]]
y_true = test_df['y'].tolist()
matrix = confusion_matrix(y_true, y_pred)
print(matrix)
print(sum(sum(matrix)))

#Salvando resultados no arquivo

In [None]:
# LANG = "BERT-MULTILINGUAL"
created_at = ctime()
created_at = created_at.replace(' ', '-')
name = "{}-{}-{}-{}-class-results.csv".format(LANG, SIZE, created_at, CLASSES)
with open(os.path.join(FOLDER, 'results', name), 'w') as f:
  f.write("folds, acc, prec, rec, f1\n")
  f.write("{},{},{},{},{}\n".format(folds, np.mean(val_acc), np.mean(val_prec), np.mean(val_rec), np.mean(val_f1)))  
  f.write("{},{},{},{},{}\n".format('-', acc, prec, rec, f1))  


## Salvando modelo

In [None]:
# name = "{}-{}-{}-{}-class-model".format(LANG, SIZE, created_at, CLASSES)
# learner.save(FOLDER+name, return_path=True)

In [None]:
# learner.recorder.metrics

# Gerando predições

1.   0: Sem Risco;
2.   1: Risco em Potencial;
3.   2: Risco alto;



In [None]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    """
    the get_preds method does not yield the elements in order by default
    we borrow the code from the RNNLearner to resort the elements into their correct order
    """
    preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in databunch.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    return preds[reverse_sampler, :]

In [None]:
test_preds = get_preds_as_nparray(DatasetType.Test)

In [None]:
test_df["prediction"] = np.argmax(test_preds,axis=1)

In [None]:
print(test_df['y'].iloc[0])

In [None]:
for i in range(100):
  print(test_df['text'].iloc[i])
  print("label: {}, prediction: {}".format(test_df['y'].iloc[i], test_df['prediction'].iloc[i], '\n\n'))

In [None]:
for i in range(CLASSES):
  suicide_true = test_df.loc[test_df['y'] == i]['y'].count()
  suicide_pred = test_df.loc[test_df['y'] == i].loc[test_df['prediction']==i]['y'].count()
  print("Accuracy on class", CLASS_NAME[CLASSES][i], ':', suicide_pred/suicide_true)

In [None]:
test_df['y'].size