<a href="https://colab.research.google.com/github/vtsimoes/class_victor_dataset/blob/main/G_ULMFiT_FT_VICTOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook para ajuste fino de modelo ULMFiT pré-treinado em português, para o dataset VICTOR

## Instalação das bibliotecas

In [None]:
!pip install wandb

In [None]:
!pip install fastai==1.0.61

In [None]:
!pip install spacy==2.3.9

In [None]:
!pip show spacy

In [None]:
!pip install colormap

## Carregamento das bibliotecas

In [None]:
import pickle
from google.colab import drive
import os
import IPython
from fastai import *
from fastai.text import *
from fastai.callbacks import *
from multiprocessing import cpu_count
from fastai.utils.ipython import *
from fastai.basics import *
import re
import warnings
import wandb
from fastai.metrics import *
from fastai.metrics import CMScores
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from datetime import datetime, timedelta
import matplotlib.cm as cm
import time
%matplotlib inline

## Nome do treinamento e locais de salvamento

In [None]:
# Mount Google Drive so the dataset and model folders configured below can be read.
drive.mount('/content/drive/')

# Storage locations; they are empty strings here and must be filled in before running.
path_dados = ''   # folder with the pre-processed dataset files
path_model = ''   # folder with the ULMFiT model
path_corpus = ''

# Fine-tuning results are written to a sub-folder of the model folder.
path_results = path_model + 'ulmfit_victor_finetunned/'

Mounted at /content/drive/


In [None]:
def save_pickle(data_to_save,path='',filename='file.pkl'):
  """Serialize ``data_to_save`` with pickle into the file ``path + filename``.

  Args:
    data_to_save: any picklable object.
    path: prefix concatenated directly with ``filename`` (no separator is
      inserted, so it must end with one when it names a folder).
    filename: name of the file to write; an existing file is overwritten.
  """
  # The context manager closes the handle even when pickle.dump raises.
  with open(path + filename, "wb") as open_file:
    pickle.dump(data_to_save, open_file)

def load_pickle(path='',filename='file.pkl'):
  """Load and return the object pickled in the file ``path + filename``.

  Args:
    path: prefix concatenated directly with ``filename`` (no separator is
      inserted, so it must end with one when it names a folder).
    filename: name of the pickle file to read.

  Returns:
    The unpickled object.

  NOTE(review): unpickling can execute arbitrary code; only use this on files
  from a trusted source.
  """
  # The context manager closes the handle even when pickle.load raises.
  with open(path + filename, "rb") as open_file:
    return pickle.load(open_file)

In [None]:
# Settings that identify this fine-tuning run.
CLASSIFICADOR, FT, DIRECTION = 'ULMFiT-FT', 'VICTOR', 'FW'
BATCH_SIZE, BPTT, EPOCHS = 32, 70, 12
WD = 0.01

# Run name built from classifier, fine-tuning corpus, direction, batch size, BPTT and epochs.
NOME_RUNNING = f'{CLASSIFICADOR}-{FT}-{DIRECTION}-{BATCH_SIZE}-{BPTT}-{EPOCHS}'

# Record of the run settings; the history file is named after the run.
experiment = {
  'CLASSIFICADOR': CLASSIFICADOR,
  'FINE_TUNNING': FT,
  'BATCH_SIZE': BATCH_SIZE,
  'BPTT': BPTT,
  'WD': WD,
  'EPOCHS': EPOCHS,
  'HISTORY_FILE': f'{NOME_RUNNING}',
}

# Every artefact of this run goes under the results folder, keyed by the run name.
path_experiment = path_results + NOME_RUNNING
NOME_RUNNING, path_experiment

In [None]:
# Create the run's output folder (and any missing parents). exist_ok=True removes
# the check-then-create race and keeps the cell safe to re-run.
os.makedirs(path_experiment, exist_ok=True)

In [None]:
# Short aliases used by the cells below: batch size and the model folder as a Path.
bs, data_path = BATCH_SIZE, Path(path_model)
print(data_path)

drive/MyDrive/MestradoTucurui/Experiments/ULMFiT


In [None]:
lang = 'pt'
path = data_path

# Folder, under the model folder, with the pre-trained Portuguese ULMFiT files.
modelsdir = path / Path('ulmfit_portuguese/')

# File names of the forward model; presumably [weights, vocabulary] — confirm.
lm_fns2_fwd = [
  f'{modelsdir}/{lang}_wt_60k_ulmfit_fwd',
  f'{modelsdir}/{lang}_wt_vocab_60k_ulmfit_fwd',
]
print(lm_fns2_fwd)

In [None]:
# Make GPU 0 the current CUDA device, then print the GPU status via nvidia-smi.
torch.cuda.set_device(0)
!nvidia-smi

In [None]:
# Suppress every Python warning from this point on.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import joblib

# Pick the vocabulary file that matches the model direction, then load it once.
if DIRECTION == 'FW':
  vocab_file = 'ulmfit_portuguese/vocab-fwd.joblib'
else:
  vocab_file = 'ulmfit_portuguese/vocab-bwd.joblib'

with open(path / vocab_file, "rb") as f:
  vocab = joblib.load(f)


In [None]:
# Run a full garbage-collection pass; the cell displays the value gc.collect() returns.
import gc
gc.collect()

65

## Carregamento dos Dados

In [None]:
# Read the pre-processed training and validation documents from path_dados.
# NOTE(review): these files are unpickled, so path_dados must point at trusted data.
doc_train, doc_valid = (
  load_pickle(path=path_dados, filename=pkl_name)
  for pkl_name in ('train_processed_concat_pages_sw.pkl',
                   'valid_processed_concat_pages_sw.pkl')
)

In [None]:
# Rename the raw columns to the names used by the rest of the notebook.
column_map = {'body':'text','document_type':'label'}
doc_train = doc_train.rename(columns=column_map)
doc_valid = doc_valid.rename(columns=column_map)

# Keep only 10% of the validation data. The fixed seed makes the subsample, and
# therefore the validation metrics and the "best model" checkpoint, reproducible.
doc_valid = doc_valid.sample(frac = 0.1, replace = False, random_state = 42)

In [None]:
# Row counts of the training frame and of the subsampled validation frame.
len(doc_train),len(doc_valid)

(38815, 2545)

## Montagem dos datasets para treinamento

In [None]:
# Imported locally so this cell does not depend on a star-import exposing `pd`.
import pandas as pd

# Flag each row with its split so a single frame can carry train and validation data.
doc_train['is_valid'] = False
doc_valid['is_valid'] = True

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, with ignore_index=True, renumbers the rows 0..n-1 exactly as append did.
df_train = pd.concat([doc_train, doc_valid], ignore_index=True)
df_train.columns

Index(['file_name', 'text', 'pages', 'label', 'is_valid'], dtype='object')

In [None]:
%%time
backwards = True if DIRECTION == 'BW' else False
data_fine = (TextList.from_df(df_train, path_experiment, vocab=vocab, cols='text').split_from_df(col='is_valid').label_for_lm().databunch(bs=bs, num_workers=14, backwards=backwards,bptt=BPTT))

CPU times: user 10.4 s, sys: 3.58 s, total: 14 s
Wall time: 1min 55s


In [None]:
# Display a sample batch from the databunch as a sanity check.
data_fine.show_batch()

In [None]:
# Export the databunch to a file whose name carries the model direction.
data_fine.export(f'export_textlist_60k_ulmfit_{DIRECTION}.pkl')

In [None]:
%%time
# Save the databunch to a file whose name carries the model direction.
data_fine.save(f'textlist_60k_ulmfit_{DIRECTION}.pkl')

## Cálculo de métricas

In [None]:
# Perplexity metric instance; passed to the learner together with error_rate and accuracy.
perplexity = Perplexity()

## Carregamento do modelo pré-treinado

In [None]:
# Value passed as drop_mult when the learner is created.
drop_mult = 0.5
# Adam with betas=(0.8, 0.99) pre-bound; passed to the learner as opt_func.
opt_func = partial(optim.Adam, betas = (0.8, 0.99))

In [None]:
# Load a pre-trained model: the first name matches this run's checkpoint name,
# the second is the 60k forward vocabulary file.
pretrained = ['ULMFiT-FT-VICTOR-FW-32-70-12', f'{lang}_wt_vocab_60k_ulmfit_fwd']
learn = language_model_learner(
  data_fine,
  AWD_LSTM,
  drop_mult=drop_mult,
  pretrained=True,
  pretrained_fnames=pretrained,
  opt_func=opt_func,
  metrics=[error_rate, accuracy, perplexity],
).to_fp16()

In [None]:
# Build the learner from the pre-trained Portuguese language model. This notebook
# only defines the forward file list (lm_fns2_fwd); the previous reference to
# lm_fns2_bwd raised NameError on a fresh kernel.
# NOTE(review): running this cell replaces any learner created by an earlier cell,
# and a backward (DIRECTION == 'BW') run needs its own file list defined first.
learn = language_model_learner(
  data_fine,
  AWD_LSTM,
  drop_mult=drop_mult,
  pretrained=True,
  metrics=[error_rate, accuracy, perplexity],
  opt_func=opt_func,
  pretrained_fnames=lm_fns2_fwd,
).to_fp16()

In [None]:
from torch.autograd import backward

### Definindo Callbacks

In [None]:
# Callbacks reused by every training phase.
callbacks = [
  # Appends the per-epoch history to the run's history file.
  CSVLogger(learn, filename=experiment['HISTORY_FILE'], append=True),
  ShowGraph(learn),
  # Monitors perplexity in 'min' mode and saves the model under the run name.
  SaveModelCallback(learn, monitor='perplexity', name=NOME_RUNNING, mode='min'),
]

### Identificando o melhor parâmetro de taxa de aprendizado

In [None]:
# Run the learning-rate finder inside fastai's GPU-memory-restore context,
# then plot the recorded curve with a suggested learning rate marked.
with gpu_mem_restore_ctx():
    learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# Base learning rate and weight decay for training; the learning rate is also
# recorded in the experiment summary.
lr, wd = 1e-3, WD
experiment['lr'] = lr

print(lr, bs)
print(wd)

0.001 32
0.01


In [None]:
# Four learning rates, each lrm times larger than the previous one, ending at lr;
# presumably one per layer group of the learner — confirm.
lrm = 2.6
lrs = np.array([lr / (lrm ** power) for power in (3, 2)] + [lr / lrm, lr])

## Executando uma primeira época com as camadas internas congeladas

In [None]:
# Train a single epoch after freeze_to(-1).
# NOTE(review): wd=0.1 here differs from WD (0.01) recorded in `experiment` and used
# in the unfrozen phase, and moms=(0.8,0.99) differs from the (0.8,0.7) used later —
# confirm both are intentional.
learn.freeze_to(-1)
lr = 1e-3
learn.fit_one_cycle(
  1,
  lr,
  wd=0.1,
  moms=(0.8, 0.99),
  callbacks=callbacks,
)

## Executando as demais épocas com todas as camadas descongeladas

In [None]:
# Unfreeze the learner before the remaining epochs; the trailing ';' hides the cell output.
learn.unfreeze();

In [None]:
# Record the start time, then train 5 epochs with the per-group learning rates.
# NOTE(review): the epoch count is hard-coded to 5 while EPOCHS (12) is part of the
# run name — confirm which one is intended.
tempo1 = time.time()
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(
        5,
        lrs,
        wd=wd,
        moms=(0.8, 0.7),
        final_div=25000,
        callbacks=callbacks,
    )

In [None]:
# Seconds elapsed since tempo1 was taken, just before the unfrozen training started.
total_time = time.time() - tempo1

In [None]:
# Derive the artefact names from DIRECTION so a backward run does not overwrite the
# forward artefacts; for DIRECTION == 'FW' the names are unchanged
# ('ft_victor_fw', 'encoder_ft_victor_fw', 'export_ft_victor_fw').
run_tag = f'ft_victor_{DIRECTION.lower()}'
learn.save(run_tag)                        # full model
learn.save_encoder(f'encoder_{run_tag}')   # encoder only
learn.export(f'export_{run_tag}')          # exported learner