In [1]:
# !pip install torch --extra-index-url https://download.pytorch.org/whl/cu113
# !pip install transformers
# !pip install datasets
# !pip install deepspeed
# !pip install mpi4py
# !pip install accelerate

In [1]:
import os, re
import torch
import numpy as np

from datasets import load_dataset
from transformers import BloomTokenizerFast, BloomForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project directories are resolved relative to the notebook's working directory:
# <cwd>/models holds checkpoints and <cwd>/data holds the dataset files.
root_path = os.getcwd()
model_root_path, data_root_path = (
    os.path.join(root_path, subdir) for subdir in ("models", "data")
)

# Use the first CUDA device when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [3]:
# Checkpoint name. It is reused to build the local model directory, the hub id
# passed to the tokenizer, and the name of the fine-tuned output directory.
model_name = "bloom-1b7"
# Local directory the pretrained weights are loaded from.
model_path = os.path.join(model_root_path, model_name)

In [4]:
# Load the pretrained causal-LM weights from the local model directory and move
# the model onto the selected device.
model = BloomForCausalLM.from_pretrained(model_path).to(device)

In [5]:
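# One-time setup: download the checkpoint from the Hugging Face Hub and save it
# under model_path so it can afterwards be loaded from disk. Uncomment and run once.
# model = BloomForCausalLM.from_pretrained(f"bigscience/{model_name}").to(device)
# model.save_pretrained(model_path)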

In [6]:
# Load the JSON-lines dataset from the data directory; load_dataset exposes it
# as a single "train" split.
data_file = "lm_generated_data.jsonl"
dataset_raw = load_dataset("json", data_files=data_file, data_dir=data_root_path)

# Hold out 10% of the rows for evaluation. The shuffle seed is fixed so that the
# same rows land in the test split on every run; without it each kernel restart
# evaluates on a different held-out set and validation losses are not comparable.
split_seed = 42
dataset_split = dataset_raw["train"].train_test_split(test_size=0.1, seed=split_seed)

Using custom data configuration default-8b59437041d29b59
Found cached dataset json (/home/gordon/.cache/huggingface/datasets/json/default-8b59437041d29b59/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 272.68it/s]


In [7]:
# Display the split sizes and column names.
dataset_split

DatasetDict({
    train: Dataset({
        features: ['email_prompt', 'email', 'summary'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['email_prompt', 'email', 'summary'],
        num_rows: 400
    })
})

In [8]:
# The tokenizer is resolved by its "bigscience/<model_name>" identifier rather
# than from the local model_path used for the model weights.
tokenizer = BloomTokenizerFast.from_pretrained(f"bigscience/{model_name}")

In [9]:
# Set the TOKENIZERS_PARALLELISM environment switch to "false" for this process.
# NOTE(review): presumably done to avoid the tokenizers fork/parallelism warning
# once training starts — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [10]:
def tokenize_data(examples):
    relevant_texts = zip(examples["email"], examples["summary"])
    concat_texts = [t[0] + "\n\n===\n\n" + t[1] + "\nEND" for t in relevant_texts]
    results = tokenizer(concat_texts, padding="max_length", max_length=400, truncation=True)
    results["labels"] = results["input_ids"].copy()
    return results

In [11]:
# Tokenize both splits in batches. The raw text columns are removed so that only
# the fields returned by tokenize_data remain.
dataset_tokenized = dataset_split.map(tokenize_data, batched=True, remove_columns=dataset_raw["train"].column_names)

 75%|█████████████████████████████████████████████████████████▊                   | 3/4 [00:01<00:00,  2.58ba/s]
  0%|                                                                                     | 0/1 [00:00<?, ?ba/s]


In [12]:
# Display the tokenized splits to check their columns and row counts.
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
})

In [13]:
# Sanity check: print each distinct tokenized sequence length once. With
# fixed-length padding a single value is expected.
seen = set()
for row in dataset_tokenized["train"]:
    n_tokens = len(row["input_ids"])
    if n_tokens in seen:
        continue
    print(n_tokens)
    seen.add(n_tokens)

400


In [14]:
# Batch collator for language modelling with masked-LM disabled (mlm=False),
# i.e. configured for the causal-LM objective used here.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [15]:
# DeepSpeed JSON config, given as a bare file name (resolved against the working
# directory). The name suggests ZeRO stage 2; the file contents are not shown here.
ds_config = "ds_config_zero2.json"
# Run-specific output directory under the shared models folder.
model_save_path = os.path.join(model_root_path, f"finetuned-{model_name}-003-e2")

training_args = TrainingArguments(
    # Output location and DeepSpeed integration.
    output_dir=model_save_path,
    deepspeed=ds_config,
    # Schedule: 2 epochs, 4 sequences per device per forward pass, gradients
    # accumulated over 4 passes before each optimizer step.
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimizer settings.
    learning_rate=1e-4,
    weight_decay=1e-3,
    warmup_ratio=0.1,
    # Log and evaluate every 25 steps; write a checkpoint at the end of each epoch.
    logging_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="epoch",
)

[2022-10-22 23:20:54,285] [INFO] [comm.py:618:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2022-10-22 23:20:54,784] [INFO] [comm.py:675:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.116, master_port=29500
[2022-10-22 23:20:54,787] [INFO] [comm.py:635:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [16]:
# Assemble the Trainer from the hyper-parameters, the model, the collator and
# the two tokenized splits.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    # Training rows and the held-out rows used for the periodic evaluation.
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
)

In [17]:
# Run fine-tuning, then write the final model weights and config to the run
# directory. Only the model is saved by this cell; no tokenizer files are
# written to model_save_path in the cells shown here.
trainer.train()
model.save_pretrained(model_save_path)

[2022-10-22 23:20:54,826] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.3, git-hash=unknown, git-branch=unknown
[2022-10-22 23:20:54,898] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
Using /home/gordon/.cache/torch_extensions/py37_cu113 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/gordon/.cache/torch_extensions/py37_cu113/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.898439407348633 seconds
[2022-10-22 23:21:00,053] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2022-10-22 23:21:00,067] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__}
[2022-10-22 23

[2022-10-22 23:21:13,952] [INFO] [config.py:991:print]   eigenvalue_tol ............... 0.01
[2022-10-22 23:21:13,952] [INFO] [config.py:991:print]   eigenvalue_verbose ........... False
[2022-10-22 23:21:13,953] [INFO] [config.py:991:print]   elasticity_enabled ........... False
[2022-10-22 23:21:13,953] [INFO] [config.py:991:print]   flops_profiler_config ........ {
    "enabled": false, 
    "profile_step": 1, 
    "module_depth": -1, 
    "top_modules": 1, 
    "detailed": true, 
    "output_file": null
}
[2022-10-22 23:21:13,954] [INFO] [config.py:991:print]   fp16_auto_cast ............... None
[2022-10-22 23:21:13,955] [INFO] [config.py:991:print]   fp16_enabled ................. False
[2022-10-22 23:21:13,955] [INFO] [config.py:991:print]   fp16_master_weights_and_gradients  False
[2022-10-22 23:21:13,956] [INFO] [config.py:991:print]   global_rank .................. 0
[2022-10-22 23:21:13,956] [INFO] [config.py:991:print]   gradient_accumulation_steps .. 4
[2022-10-22 23:21:13

***** Running training *****
  Num examples = 3600
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 450
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Using /home/gordon/.cache/torch_extensions/py37_cu113 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0013315677642822266 seconds
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000100, betas=(0.900000, 0.999000), weight_decay=0.001000, adam_w=1


Step,Training Loss,Validation Loss
25,2.0222,1.646748
50,1.606,1.55876
75,1.5519,1.492738
100,1.527,1.49086
125,1.4879,1.437143
150,1.4136,1.443047
175,1.4221,1.400741
200,1.3794,1.355497
225,1.3656,1.36596
250,1.0803,1.34396


[2022-10-22 23:21:44,109] [INFO] [timer.py:207:stop] 0/10, RunningAvgSamplesPerSec=1.3767221858278997, CurrSamplesPerSec=1.5810638914233877, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:22:15,377] [INFO] [timer.py:207:stop] 0/20, RunningAvgSamplesPerSec=1.3219861217977666, CurrSamplesPerSec=0.8167501881198236, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:22:43,754] [INFO] [timer.py:207:stop] 0/30, RunningAvgSamplesPerSec=1.3526160947109316, CurrSamplesPerSec=1.579682997850785, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:23:14,807] [INFO] [logging.py:68:log_dist] [Rank 0] step=10, skipped=0, lr=[6.048829123101417e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:23:14,855] [INFO] [timer.py:207:stop] 0/40, RunningAvgSamplesPerSec=1.3348834599133477, CurrSamplesPerSec=0.8305181208978628, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:23:43,044] [INFO] [timer.py:207:stop] 0/50, RunningAvgSamplesPerSec=1.35198462831991, CurrSamplesPerSec=1.58

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:27:40,687] [INFO] [timer.py:207:stop] 0/110, RunningAvgSamplesPerSec=1.3541653082521325, CurrSamplesPerSec=1.604771879884223, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:28:11,537] [INFO] [logging.py:68:log_dist] [Rank 0] step=30, skipped=0, lr=[8.9348540639004e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:28:11,584] [INFO] [timer.py:207:stop] 0/120, RunningAvgSamplesPerSec=1.3490443299561423, CurrSamplesPerSec=0.8272536046728355, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:28:39,539] [INFO] [timer.py:207:stop] 0/130, RunningAvgSamplesPerSec=1.3552380341926427, CurrSamplesPerSec=1.5974438181091257, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:29:10,262] [INFO] [timer.py:207:stop] 0/140, RunningAvgSamplesPerSec=1.3513513094846756, CurrSamplesPerSec=0.8362178081845615, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:29:38,259] [INFO] [timer.py:207:stop] 0/150, RunningAvgSamplesPerSec=1.3564537859909174, CurrSamplesPerSec

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:33:34,992] [INFO] [timer.py:207:stop] 0/210, RunningAvgSamplesPerSec=1.3578216434949022, CurrSamplesPerSec=1.6025201375071996, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:34:05,733] [INFO] [timer.py:207:stop] 0/220, RunningAvgSamplesPerSec=1.3552043787821746, CurrSamplesPerSec=0.8377328346248194, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:34:33,620] [INFO] [timer.py:207:stop] 0/230, RunningAvgSamplesPerSec=1.3585705212472279, CurrSamplesPerSec=1.6125596760504763, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:35:04,353] [INFO] [logging.py:68:log_dist] [Rank 0] step=60, skipped=0, lr=[9.654320987654321e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:35:04,401] [INFO] [timer.py:207:stop] 0/240, RunningAvgSamplesPerSec=1.3560523083065419, CurrSamplesPerSec=0.8318218404043146, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:35:32,355] [INFO] [timer.py:207:stop] 0/250, RunningAvgSamplesPerSec=1.3590028523244435, CurrSamplesPer

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:39:28,899] [INFO] [timer.py:207:stop] 0/310, RunningAvgSamplesPerSec=1.3596116629906667, CurrSamplesPerSec=1.6125009358959117, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:39:59,601] [INFO] [logging.py:68:log_dist] [Rank 0] step=80, skipped=0, lr=[9.160493827160494e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:39:59,648] [INFO] [timer.py:207:stop] 0/320, RunningAvgSamplesPerSec=1.3577439375510827, CurrSamplesPerSec=0.8378603528456846, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:40:27,562] [INFO] [timer.py:207:stop] 0/330, RunningAvgSamplesPerSec=1.3599772459906359, CurrSamplesPerSec=1.610706388912277, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:40:58,240] [INFO] [timer.py:207:stop] 0/340, RunningAvgSamplesPerSec=1.3582993227007285, CurrSamplesPerSec=0.8369612745393692, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:41:26,144] [INFO] [timer.py:207:stop] 0/350, RunningAvgSamplesPerSec=1.3604109879373, CurrSamplesPerSec=

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:45:22,809] [INFO] [timer.py:207:stop] 0/410, RunningAvgSamplesPerSec=1.360604816210531, CurrSamplesPerSec=1.609948715872036, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:45:53,560] [INFO] [timer.py:207:stop] 0/420, RunningAvgSamplesPerSec=1.3591568459120626, CurrSamplesPerSec=0.8353756129428026, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:46:21,453] [INFO] [timer.py:207:stop] 0/430, RunningAvgSamplesPerSec=1.3608589471156118, CurrSamplesPerSec=1.6024699324654108, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:46:52,124] [INFO] [logging.py:68:log_dist] [Rank 0] step=110, skipped=0, lr=[8.419753086419754e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:46:52,172] [INFO] [timer.py:207:stop] 0/440, RunningAvgSamplesPerSec=1.3594969883094064, CurrSamplesPerSec=0.8334049561249757, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:47:20,041] [INFO] [timer.py:207:stop] 0/450, RunningAvgSamplesPerSec=1.361148778192876, CurrSamplesPerSe

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:51:16,865] [INFO] [timer.py:207:stop] 0/510, RunningAvgSamplesPerSec=1.361030524091926, CurrSamplesPerSec=1.6129729929286605, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:51:47,503] [INFO] [logging.py:68:log_dist] [Rank 0] step=130, skipped=0, lr=[7.925925925925926e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:51:47,551] [INFO] [timer.py:207:stop] 0/520, RunningAvgSamplesPerSec=1.3599054440079983, CurrSamplesPerSec=0.8377393183744921, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:52:15,439] [INFO] [timer.py:207:stop] 0/530, RunningAvgSamplesPerSec=1.361278364483905, CurrSamplesPerSec=1.605646842825209, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:52:46,155] [INFO] [timer.py:207:stop] 0/540, RunningAvgSamplesPerSec=1.3601656800481758, CurrSamplesPerSec=0.8358177547186204, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:53:14,117] [INFO] [timer.py:207:stop] 0/550, RunningAvgSamplesPerSec=1.361426659682411, CurrSamplesPerSec

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-22 23:57:10,843] [INFO] [timer.py:207:stop] 0/610, RunningAvgSamplesPerSec=1.3614133287866916, CurrSamplesPerSec=1.6091216852275476, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:57:41,531] [INFO] [timer.py:207:stop] 0/620, RunningAvgSamplesPerSec=1.360467065138804, CurrSamplesPerSec=0.8358707648933618, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:58:09,364] [INFO] [timer.py:207:stop] 0/630, RunningAvgSamplesPerSec=1.3616526150495682, CurrSamplesPerSec=1.6112710111039605, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:58:40,090] [INFO] [logging.py:68:log_dist] [Rank 0] step=160, skipped=0, lr=[7.185185185185186e-05], mom=[[0.9, 0.999]]
[2022-10-22 23:58:40,138] [INFO] [timer.py:207:stop] 0/640, RunningAvgSamplesPerSec=1.3606677475389968, CurrSamplesPerSec=0.8347129859657881, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-22 23:59:08,056] [INFO] [timer.py:207:stop] 0/650, RunningAvgSamplesPerSec=1.3617564881943403, CurrSamplesPer

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:03:04,979] [INFO] [timer.py:207:stop] 0/710, RunningAvgSamplesPerSec=1.361724398605645, CurrSamplesPerSec=1.6092644556167373, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:03:35,704] [INFO] [logging.py:68:log_dist] [Rank 0] step=180, skipped=0, lr=[6.691358024691359e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:03:35,752] [INFO] [timer.py:207:stop] 0/720, RunningAvgSamplesPerSec=1.3608490902242654, CurrSamplesPerSec=0.8352063545954675, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:04:03,829] [INFO] [timer.py:207:stop] 0/730, RunningAvgSamplesPerSec=1.3617163880387853, CurrSamplesPerSec=1.6080268230411165, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:04:34,639] [INFO] [timer.py:207:stop] 0/740, RunningAvgSamplesPerSec=1.36084347398075, CurrSamplesPerSec=0.8243891432241853, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:05:02,526] [INFO] [timer.py:207:stop] 0/750, RunningAvgSamplesPerSec=1.3618040012531234, CurrSamplesPerSe

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:08:59,382] [INFO] [timer.py:207:stop] 0/810, RunningAvgSamplesPerSec=1.3616599486994336, CurrSamplesPerSec=1.6211006084967305, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:09:30,111] [INFO] [timer.py:207:stop] 0/820, RunningAvgSamplesPerSec=1.360915470498993, CurrSamplesPerSec=0.8334509532078234, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:09:57,967] [INFO] [timer.py:207:stop] 0/830, RunningAvgSamplesPerSec=1.3617964602951, CurrSamplesPerSec=1.6077354297797986, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:10:28,666] [INFO] [logging.py:68:log_dist] [Rank 0] step=210, skipped=0, lr=[5.950617283950618e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:10:28,714] [INFO] [timer.py:207:stop] 0/840, RunningAvgSamplesPerSec=1.3610604492052227, CurrSamplesPerSec=0.8313009079778044, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:10:56,577] [INFO] [timer.py:207:stop] 0/850, RunningAvgSamplesPerSec=1.3619149911715684, CurrSamplesPerSec

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8
Saving model checkpoint to /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225
Configuration saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/config.json
Model weights saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/pytorch_model.bin


[2022-10-23 00:14:37,474] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step225 is begin to save!
[2022-10-23 00:14:37,480] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/mp_rank_00_model_states.pt
[2022-10-23 00:14:37,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/mp_rank_00_model_states.pt...


  "Positional args are being deprecated, use kwargs instead. Refer to "


[2022-10-23 00:14:50,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/mp_rank_00_model_states.pt.
[2022-10-23 00:14:50,072] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2022-10-23 00:15:17,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[2022-10-23 00:15:17,233] [INFO] [engine.py:3196:_save_zero_checkpoint] zero checkpoint saved /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-225/global_step225/zero_pp_rank_0_mp_rank_00_optim_states.pt
[2022-10-23 00:15:17,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step225 is ready now!
[2022-10-23 00:

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:21:42,278] [INFO] [timer.py:207:stop] 0/1010, RunningAvgSamplesPerSec=1.3611812034132142, CurrSamplesPerSec=1.6124569223315537, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:22:13,077] [INFO] [timer.py:207:stop] 0/1020, RunningAvgSamplesPerSec=1.3605612228030375, CurrSamplesPerSec=0.8331670612782834, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:22:40,987] [INFO] [timer.py:207:stop] 0/1030, RunningAvgSamplesPerSec=1.3612531383531956, CurrSamplesPerSec=1.611625920090435, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:23:11,698] [INFO] [logging.py:68:log_dist] [Rank 0] step=260, skipped=0, lr=[4.7160493827160495e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:23:11,746] [INFO] [timer.py:207:stop] 0/1040, RunningAvgSamplesPerSec=1.3606624010660286, CurrSamplesPerSec=0.8347154777302698, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:23:39,669] [INFO] [timer.py:207:stop] 0/1050, RunningAvgSamplesPerSec=1.361332783244354, CurrSampl

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:27:36,456] [INFO] [timer.py:207:stop] 0/1110, RunningAvgSamplesPerSec=1.361343782223026, CurrSamplesPerSec=1.611463846745297, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:28:07,200] [INFO] [logging.py:68:log_dist] [Rank 0] step=280, skipped=0, lr=[4.222222222222222e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:28:07,248] [INFO] [timer.py:207:stop] 0/1120, RunningAvgSamplesPerSec=1.3607772602353059, CurrSamplesPerSec=0.8327905067174856, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:28:35,132] [INFO] [timer.py:207:stop] 0/1130, RunningAvgSamplesPerSec=1.3614161056269978, CurrSamplesPerSec=1.6091250805535273, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:29:05,859] [INFO] [timer.py:207:stop] 0/1140, RunningAvgSamplesPerSec=1.3608856768513997, CurrSamplesPerSec=0.8337286996166398, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:29:33,795] [INFO] [timer.py:207:stop] 0/1150, RunningAvgSamplesPerSec=1.3614892896292672, CurrSample

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:33:30,737] [INFO] [timer.py:207:stop] 0/1210, RunningAvgSamplesPerSec=1.3614352168923733, CurrSamplesPerSec=1.6028134712937003, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:34:01,421] [INFO] [timer.py:207:stop] 0/1220, RunningAvgSamplesPerSec=1.36095649554723, CurrSamplesPerSec=0.8382696522412395, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:34:29,548] [INFO] [timer.py:207:stop] 0/1230, RunningAvgSamplesPerSec=1.3614477548596744, CurrSamplesPerSec=1.6203171654926432, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:35:00,259] [INFO] [logging.py:68:log_dist] [Rank 0] step=310, skipped=0, lr=[3.481481481481482e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:35:00,306] [INFO] [timer.py:207:stop] 0/1240, RunningAvgSamplesPerSec=1.3609461022064242, CurrSamplesPerSec=0.8276200234603602, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:35:28,285] [INFO] [timer.py:207:stop] 0/1250, RunningAvgSamplesPerSec=1.361485406268552, CurrSamples

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:39:55,806] [INFO] [logging.py:68:log_dist] [Rank 0] step=330, skipped=0, lr=[2.9876543209876545e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:39:55,854] [INFO] [timer.py:207:stop] 0/1320, RunningAvgSamplesPerSec=1.360999841387951, CurrSamplesPerSec=0.8311135336404935, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:40:23,838] [INFO] [timer.py:207:stop] 0/1330, RunningAvgSamplesPerSec=1.3615065456144488, CurrSamplesPerSec=1.6153634034634208, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:40:54,568] [INFO] [timer.py:207:stop] 0/1340, RunningAvgSamplesPerSec=1.3610515308820499, CurrSamplesPerSec=0.8281796429183358, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:41:22,471] [INFO] [timer.py:207:stop] 0/1350, RunningAvgSamplesPerSec=1.3615758142572376, CurrSamplesPerSec=1.6050806265904582, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:41:53,138] [INFO] [logging.py:68:log_dist] [Rank 0] step=340, skipped=0, lr=[2.7407407407407408e-0

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:45:19,129] [INFO] [timer.py:207:stop] 0/1410, RunningAvgSamplesPerSec=1.3615974706246652, CurrSamplesPerSec=1.6053752058125375, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:45:49,847] [INFO] [timer.py:207:stop] 0/1420, RunningAvgSamplesPerSec=1.3611713601244342, CurrSamplesPerSec=0.8345900773855067, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:46:17,710] [INFO] [timer.py:207:stop] 0/1430, RunningAvgSamplesPerSec=1.3616781398655822, CurrSamplesPerSec=1.6075585804646155, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:46:48,327] [INFO] [logging.py:68:log_dist] [Rank 0] step=360, skipped=0, lr=[2.246913580246914e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:46:48,374] [INFO] [timer.py:207:stop] 0/1440, RunningAvgSamplesPerSec=1.3612746098318007, CurrSamplesPerSec=0.8378631144964134, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:47:16,291] [INFO] [timer.py:207:stop] 0/1450, RunningAvgSamplesPerSec=1.3617568339531478, CurrSamp

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:51:12,838] [INFO] [timer.py:207:stop] 0/1510, RunningAvgSamplesPerSec=1.3617547349925851, CurrSamplesPerSec=1.6095993322604087, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:51:43,647] [INFO] [logging.py:68:log_dist] [Rank 0] step=380, skipped=0, lr=[1.7530864197530865e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:51:43,694] [INFO] [timer.py:207:stop] 0/1520, RunningAvgSamplesPerSec=1.36131328291461, CurrSamplesPerSec=0.8325482526388418, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:52:11,679] [INFO] [timer.py:207:stop] 0/1530, RunningAvgSamplesPerSec=1.3617511194660321, CurrSamplesPerSec=1.6021488782714854, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:52:42,461] [INFO] [timer.py:207:stop] 0/1540, RunningAvgSamplesPerSec=1.3613402162505917, CurrSamplesPerSec=0.8301159846653422, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:53:10,416] [INFO] [timer.py:207:stop] 0/1550, RunningAvgSamplesPerSec=1.3617791789951497, CurrSampl

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 00:57:07,328] [INFO] [timer.py:207:stop] 0/1610, RunningAvgSamplesPerSec=1.3617305474462025, CurrSamplesPerSec=1.6031107420223167, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:57:38,048] [INFO] [timer.py:207:stop] 0/1620, RunningAvgSamplesPerSec=1.361355534234628, CurrSamplesPerSec=0.8350233666163778, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:58:06,139] [INFO] [timer.py:207:stop] 0/1630, RunningAvgSamplesPerSec=1.361734276620955, CurrSamplesPerSec=1.6136026772167755, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:58:36,857] [INFO] [logging.py:68:log_dist] [Rank 0] step=410, skipped=0, lr=[1.0123456790123458e-05], mom=[[0.9, 0.999]]
[2022-10-23 00:58:36,905] [INFO] [timer.py:207:stop] 0/1640, RunningAvgSamplesPerSec=1.361350898914486, CurrSamplesPerSec=0.8373256471584996, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 00:59:04,908] [INFO] [timer.py:207:stop] 0/1650, RunningAvgSamplesPerSec=1.3617496649667378, CurrSample

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8


[2022-10-23 01:03:02,075] [INFO] [timer.py:207:stop] 0/1710, RunningAvgSamplesPerSec=1.3616408922688963, CurrSamplesPerSec=1.5938916648338939, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 01:03:32,805] [INFO] [logging.py:68:log_dist] [Rank 0] step=430, skipped=0, lr=[5.185185185185185e-06], mom=[[0.9, 0.999]]
[2022-10-23 01:03:32,852] [INFO] [timer.py:207:stop] 0/1720, RunningAvgSamplesPerSec=1.3612745198621452, CurrSamplesPerSec=0.8317874046787228, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 01:04:00,791] [INFO] [timer.py:207:stop] 0/1730, RunningAvgSamplesPerSec=1.3616726011883415, CurrSamplesPerSec=1.603398775537861, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 01:04:31,520] [INFO] [timer.py:207:stop] 0/1740, RunningAvgSamplesPerSec=1.3613227818636409, CurrSamplesPerSec=0.8353402584806334, MemAllocated=8.33GB, MaxMemAllocated=19.94GB
[2022-10-23 01:04:59,406] [INFO] [timer.py:207:stop] 0/1750, RunningAvgSamplesPerSec=1.3617300946322934, CurrSampl

***** Running Evaluation *****
  Num examples = 400
  Batch size = 8
Saving model checkpoint to /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450
Configuration saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/config.json
Model weights saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/pytorch_model.bin


[2022-10-23 01:08:40,616] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step450 is begin to save!
[2022-10-23 01:08:40,620] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/global_step450/mp_rank_00_model_states.pt
[2022-10-23 01:08:40,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/global_step450/mp_rank_00_model_states.pt...
[2022-10-23 01:08:53,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/global_step450/mp_rank_00_model_states.pt.
[2022-10-23 01:08:53,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/checkpoint-450/global_step450/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2022-10-23 01:09:21,404] [INFO] [torch_check



Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/config.json
Model weights saved in /home/gordon/pinxi/cs4248/models/finetuned-bloom-1b7-003-e2/pytorch_model.bin


In [23]:
# Example email used as the generation prompt. It ends with the same "==="
# separator that tokenize_data places between an email and its summary, so the
# model is expected to continue with the summary.
sample_prompt = """Hey John,

Great getting to know you and your team. I'm excited to be part of this amazing department and to be working with you!

In the meantime, could you take a look at the minutes of the budget review discussed today? I'd need to know by Thursday if that's possible.

Also, could you share any progress you have had on the marketing review? I will need to update the management team later. Also will need you to setup a Zoom call tomorrow 2pm for this meeting. 

Thanks.
Peter

===

"""

def extract_summary(pred_text, separator="===\n\n", end_marker="\nEND"):
    """Return the generated summary embedded in a decoded model output.

    The summary is the text between the first occurrence of ``separator`` and
    the first occurrence of ``end_marker`` that follows it.

    Args:
        pred_text: Decoded text of prompt plus continuation.
        separator: Marker that ends the email part of the prompt.
        end_marker: Marker the model was trained to emit after the summary.

    Returns:
        The summary text. If ``separator`` is absent the text is taken from the
        start of ``pred_text``; if ``end_marker`` is absent (for example when
        generation stopped early) the text runs to the end of ``pred_text``.
    """
    sep_idx = pred_text.find(separator)
    # str.find returns -1 on a miss; adding the separator length to that would
    # point at an arbitrary offset inside the prompt.
    start_idx = sep_idx + len(separator) if sep_idx != -1 else 0
    # Search only after the separator so an "\nEND" inside the email is ignored.
    end_idx = pred_text.find(end_marker, start_idx)
    if end_idx == -1:
        # Using -1 as a slice bound would silently drop the last character.
        end_idx = len(pred_text)
    return pred_text[start_idx:end_idx]


input_ids = tokenizer.encode(sample_prompt, return_tensors="pt").to(device)
# Allow up to 1000 tokens beyond the prompt length.
pred_ids = model.generate(
    input_ids,
    max_length = len(input_ids[0]) + 1000,
)
pred_text = tokenizer.decode(pred_ids[0])
pred_output = extract_summary(pred_text)
print(pred_output)

- review the attached document by this Thursday
- schedule an online meeting on at 2 pm for tomorrow
- give an update
