bertweetbr_train.py
# Configuration values
model_checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer_checkpoint = 'neuralmind/bert-base-portuguese-cased'
chunk_size = 128
batch_size = 32
train_size = 1000
test_size = int(0.1 * train_size)
learning_rate = 2e-5
weight_decay = 0.01
output_dir = "BERTweetBR"  # Do not use special characters, including . or /
logging_dir = "BERTweetBR_logs"  # Do not use special characters, including . or /
evaluation_strategy = "steps"
overwrite_output_dir = True
fp16 = False
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenization function
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
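# Note: "word_ids" maps each sub-word token back to the word it came from. It is only available
# for fast (Rust-backed) tokenizers and is typically kept in case whole-word masking is applied
# later; the default collator defined below does not require it.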
# Function that uses spaCy to detect new tokens
def spacy_tokenizer(document):
    # tokenize the document with the spaCy pipeline `nlp`
    # (loaded later in the script, before this function is first called)
    doc = nlp(document)
    # remove stop words, punctuation symbols and whitespace-only tokens
    tokens = [
        token.text for token in doc if (
            not token.is_stop and
            not token.is_punct and
            token.text.strip() != '' and
            token.text.find("\n") == -1)]
    return tokens
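# Illustrative note: for a tweet such as "Bom dia, Brasil!" this returns the word tokens that
# remain after spaCy drops stop words, punctuation and whitespace; those words become the
# candidate vocabulary for the TF-IDF step further below.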
# Function to group texts into fixed-size chunks
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of the concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it is smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result
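# Illustrative example (hypothetical numbers): with chunk_size = 128, tokenized tweets of
# 50, 90 and 120 tokens are concatenated into a 260-token sequence and re-split into two
# 128-token chunks; the trailing 4 tokens are dropped. "labels" starts as a copy of
# "input_ids" because the data collator defined below masks tokens on the fly during training.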
# Load the model
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# Load the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
# Load the data collator (randomly masks 15% of tokens for masked language modeling)
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
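# Quick sanity check (illustrative sketch; the sentence is a made-up example): the collator
# replaces roughly 15% of the tokens with [MASK] (or random tokens) and fills "labels" accordingly.
_sample = tokenizer("bom dia, tudo bem?", return_special_tokens_mask=True)
print(tokenizer.decode(data_collator([_sample])["input_ids"][0]))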
# Prepare datasets (load_dataset returns a DatasetDict with a single "train" split)
from datasets import load_dataset
raw_dataset = load_dataset(
    'text',
    data_files={'train': ['./tweets/text/text_1.txt', './tweets/text/text_2.txt',
                          './tweets/text/text_3.txt', './tweets/text/text_4.txt',
                          './tweets/text/text_5.txt', './tweets/text/text_6.txt',
                          './tweets/text/text_7.txt']},
)
print(raw_dataset)
# Downsample the dataset
downsampled_dataset = raw_dataset["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)
print(downsampled_dataset)
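# downsampled_dataset is a DatasetDict with a "train" split of train_size tweets and a
# "test" split of test_size tweets (1000 and 100 with the values above), sampled with a
# fixed seed so the run is reproducible.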
# Tokenize the datasets
tokenized_datasets = downsampled_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
print(tokenized_datasets)
# Apply group_texts to the tokenized dataset
final_dataset = tokenized_datasets.map(group_texts, batched=True)
print(final_dataset)
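# Sanity check (illustrative): after group_texts every training example is exactly chunk_size tokens long.
print(len(final_dataset["train"][0]["input_ids"]), "tokens per chunk (expected:", chunk_size, ")")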
# Add new words to the tokenizer
import spacy
nlp = spacy.load("pt_core_news_sm")
# https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
tfidf_vectorizer = TfidfVectorizer(lowercase=False, tokenizer=spacy_tokenizer,
                                   norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
# Fit TF-IDF on the raw (untokenized) training texts; the result is a sparse TF-IDF matrix
length = len(downsampled_dataset["train"])  # number of raw training documents
result = tfidf_vectorizer.fit_transform(downsampled_dataset["train"]["text"])
# Sort the vocabulary by inverse document frequency
idf = tfidf_vectorizer.idf_
idf_sorted_indexes = sorted(range(len(idf)), key=lambda k: idf[k])
idf_sorted = idf[idf_sorted_indexes]
# Note: on newer scikit-learn versions get_feature_names() was renamed to get_feature_names_out()
new_tokens = list(np.array(tfidf_vectorizer.get_feature_names())[idf_sorted_indexes])
added_tokens = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
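# Sanity check (illustrative): the embedding matrix is resized to match the enlarged vocabulary.
print(f"Added {added_tokens} tokens from TF-IDF; tokenizer vocabulary now has {len(tokenizer)} entries")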
# Load the perplexity metric (not used directly below; perplexity is computed from the eval loss)
from datasets import load_metric
metric = load_metric("perplexity")
# Change transformers logging verbosity
import transformers
transformers.logging.set_verbosity_info()
# Emit a log entry every logging_steps training steps
logging_steps = len(final_dataset["train"]) // batch_size
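# Note: with evaluation_strategy="steps" and no explicit eval_steps, the Trainer also evaluates
# at this same interval, i.e. roughly once per pass over the downsampled training split.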
# Prepare the TrainingArguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    overwrite_output_dir=overwrite_output_dir,
    evaluation_strategy=evaluation_strategy,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=fp16,
    logging_steps=logging_steps,
)
# Prepare the Trainer
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    data_collator=data_collator,
)
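# The Trainer handles batching, the on-the-fly masking performed by data_collator, optimization
# and periodic evaluation, so no custom training loop is needed for this masked-LM fine-tuning.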
# Collect perplexity before training, by evaluating only
import math
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
initial_perplexity = math.exp(eval_results['eval_loss'])
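# Note: perplexity is computed as the exponential of the evaluation cross-entropy loss,
# i.e. PPL = exp(L); lower values mean the model assigns higher probability to the
# held-out masked tokens, so PPL should drop after training.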
# Train
train_result = trainer.train()
# Collect perplexity after training, by evaluating again
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
# Collect metrics from the train() result
metrics = train_result.metrics
metrics["train_samples"] = len(final_dataset["train"])
# Save the training results
trainer.log_metrics("all", metrics)
trainer.save_metrics("all", metrics)
# Write the Trainer object's log history to a file
import os
os.makedirs(logging_dir, exist_ok=True)
with open(str(logging_dir) + '/trainer_logs.txt', 'w') as f:
    for obj in trainer.state.log_history:
        f.write(str(obj))
        f.write('\n')
    f.write('\n\n\n')
    f.write(str(metrics))
    f.write('\n\n\n')
    f.write('Initial Perplexity = ' + str(initial_perplexity))
    f.write('\n')
    f.write('Final Perplexity = ' + str(math.exp(eval_results['eval_loss'])))
# Save the trained model
trainer.save_model(output_dir)