-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
135 lines (116 loc) · 3.76 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# %% IMPORTS
import math
from pathlib import Path
import transformers
from transformers import (
    Trainer,
    TrainingArguments,
    TrainerCallback,
    DataCollatorForLanguageModeling,
    trainer_utils,
    PreTrainedModel,
)
from guitartab import load_model, MODEL, load_tokenizer, prep_dataset, TOKEN
from generate import generate_song
import logging

# When True, only the sanity-check evaluation runs; trainer.train() is skipped.
DRY_RUN = False

# BUG FIX: the original condition was inverted (`if TOKEN:`), warning exactly
# when a hub token WAS available. The message applies to the missing-token
# case — push_to_hub is disabled below when TOKEN is None.
if not TOKEN:
    logging.warning("Cannot push to hub without HUB_TOKEN")
# %% PREP DATASET
# Tokenized train/test split supplied by the project-local guitartab module.
dataset = prep_dataset()
# %% SET UP TRAINER
# Checkpoints are written under .saves/<model-id>.
save_dir = Path(".saves") / MODEL
save_dir.mkdir(parents=True, exist_ok=True)
if transformers.utils.is_torch_cuda_available():
    # Optimal configuration for T4 Colab GPU with 15G memory
    training_args = TrainingArguments(
        output_dir=str(save_dir),
        overwrite_output_dir=True,
        # Push only when an auth token is available; hub_* args are inert otherwise.
        push_to_hub=TOKEN is not None,
        hub_model_id=MODEL,
        hub_token=TOKEN,
        report_to=["wandb"],
        run_name=MODEL.split("/")[-1],
        skip_memory_metrics=False,  # include memory usage in logged metrics
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=500,
        eval_steps=500,
        eval_accumulation_steps=5,  # move eval predictions to CPU every 5 steps
        logging_steps=50,
        logging_first_step=True,
        save_total_limit=2,  # keep only the two most recent checkpoints
        load_best_model_at_end=True,
        optim="adamw_torch",
        lr_scheduler_type="linear",
        warmup_steps=200,
        # Most optimal configuration per sweeps
        # https://wandb.ai/vsavelyev/guitartab-sweeps/sweeps/bx7jbzna?workspace=user
        # -vsavelyev
        # https://wandb.ai/vsavelyev/guitartab-sweeps/sweeps/meecv8s2?workspace=user
        # -vsavelyev
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # gradient_checkpointing=True, # 10x < mem, 40% > runtime, = loss
        # gradient_accumulation_steps=8, # < runtime, = mem, < loss
        fp16=True,  # mixed precision on CUDA
        ignore_data_skip=True,  # on resume, don't fast-forward the dataloader to the saved step
    )
else:
    # For debugging on a CPU.
    training_args = TrainingArguments(
        output_dir=save_dir,
        report_to=[],  # no wandb for local debugging
        evaluation_strategy="steps",
        eval_steps=1,
        logging_steps=1,
        logging_first_step=True,
        optim="adamw_torch",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
    )
# Sample songs generated by the eval callback are written here.
testing_dir = Path(".testing")
testing_dir.mkdir(exist_ok=True)
class MyCallback(TrainerCallback):
    """Report eval loss/perplexity and render a sample song after each evaluation."""

    def on_evaluate(self, args, state, control, **kwargs):
        eval_metrics = kwargs.get("metrics")
        if eval_metrics:
            model: PreTrainedModel = kwargs["model"]
            eval_loss = eval_metrics["eval_loss"]
            print(f"Eval loss: {eval_loss:.4f}")
            print(f"Perplexity: {math.exp(eval_loss):.2f}")
            # Generate a sample tab into the testing dir so progress can be
            # inspected qualitatively, not just via the loss curve.
            generate_song(
                out_dir=testing_dir,
                title=f"Step {state.global_step}, loss {eval_loss:.4f}",
                device=str(model.device),
                model=model,
                tokenizer=load_tokenizer(),
                max_length=500,
                num_return_sequences=1,
            )
            if state.best_metric:
                print(f"Best loss so far: {state.best_metric:.4f}")
# Use model_init (a factory) instead of a model instance so each run starts
# from a freshly loaded model.
trainer = Trainer(
    model_init=load_model,
    # mlm=False -> plain causal-LM collation (labels are the shifted inputs).
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=load_tokenizer(),
        mlm=False,
    ),
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # NOTE(review): the callback CLASS is passed rather than an instance;
    # Trainer instantiates bare classes itself, so this works, but
    # `MyCallback()` would be clearer — confirm before changing.
    callbacks=[MyCallback],
    args=training_args,
)
def mem():
    """Print current and peak CUDA memory allocated, in gibibytes."""
    import torch

    gib = 1024 ** 3
    current = torch.cuda.memory_allocated()
    peak = torch.cuda.max_memory_allocated()
    print(f"Mem: {current / gib:.2f}G, max: {peak / gib:.2f}G")
mem()  # baseline memory before any forward pass
trainer.evaluate()  # to early test if something crashes
mem()  # memory after one full eval pass
# %% TRAIN
if not DRY_RUN:
    # Resume from the newest checkpoint in save_dir if one exists
    # (get_last_checkpoint returns None otherwise, i.e. train from scratch).
    trainer.train(resume_from_checkpoint=trainer_utils.get_last_checkpoint(save_dir))
    if TOKEN:
        trainer.save_model()  # also calls push_to_hub
    mem()